# import os
# import re
# import tempfile
# import shutil
# import warnings
# import numpy as np
# import cv2
# import easyocr
# import requests
# from fastapi import FastAPI, UploadFile, File, HTTPException
# from fastapi.responses import JSONResponse
# from fastapi.middleware.cors import CORSMiddleware
# from pdf2image import convert_from_path

# # Import docTR's document file interface natively
# from doctr.io import DocumentFile
# from doctr.models import ocr_predictor

# # Suppress PyTorch's pin_memory warning on CPU servers
# warnings.filterwarnings("ignore", message=".*pin_memory.*")

# # =========================================================
# # FASTAPI INITIALIZATION & CORS RULES
# # =========================================================
# app = FastAPI(title="Carbon AI Extraction API")

# app.add_middleware(
#     CORSMiddleware,
#     allow_origins=["*"],
#     allow_credentials=True,
#     allow_methods=["*"],
#     allow_headers=["*"],
# )

# # =========================================================
# # OCR ENGINES INITIALIZATION (CPU Optimized)
# # =========================================================
# reader = easyocr.Reader(['en', 'bn'], gpu=False)
# doctr_model = ocr_predictor(pretrained=True)

# print("🤖 Pre-warming docTR deep learning models into cache...")
# try:
#     dummy_array = np.zeros((100, 100, 3), dtype=np.uint8)
#     _ = doctr_model([dummy_array])
#     print("✅ docTR Models cached and ready for local processing!")
# except Exception as e:
#     print(f"⚠️ Warning during docTR pre-warming phase: {e}")


# # =========================================================
# # IMAGE PROCESSING & UTILITIES
# # =========================================================
# def safe_crop(image, x1, y1, x2, y2):
#     """Safely crops an image without exceeding its canvas borders."""
#     if image is None or image.size == 0:
#         return None
#     h, w = image.shape[:2]
#     x1, y1 = max(0, int(x1)), max(0, int(y1))
#     x2, y2 = min(w, int(x2)), min(h, int(y2))
    
#     if x1 >= x2 or y1 >= y2:
#         return None
#     return image[y1:y2, x1:x2]


# def preprocess_for_ocr(image):
#     """Enhances text contrast for better OCR engine recognition."""
#     if image is None or image.size == 0:
#         return None
#     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
#     gray = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
#     thresh = cv2.adaptiveThreshold(
#         gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 4
#     )
#     return thresh


# def is_handwriting_present(crop_img, threshold_area=150):
#     """Detects ink presence clusters via isolated structural contours."""
#     if crop_img is None or crop_img.size == 0:
#         return False
        
#     gray = cv2.cvtColor(crop_img, cv2.COLOR_BGR2GRAY)
#     thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)[1]
    
#     kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
#     thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)
    
#     contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
#     total_handwriting_area = 0
#     for cnt in contours:
#         area = cv2.contourArea(cnt)
#         if 15 < area < 10000:
#             total_handwriting_area += area
            
#     return total_handwriting_area > threshold_area


# def detect_checked_value(ownership_crop):
#     """
#     Slices the ownership section into 3 horizontal segment zones,
#     calculates pixel density in each, and determines the ticked entry.
#     """
#     if ownership_crop is None or ownership_crop.size == 0:
#         return None

#     gray = cv2.cvtColor(ownership_crop, cv2.COLOR_BGR2GRAY)
#     _, thresh = cv2.threshold(gray, 210, 255, cv2.THRESH_BINARY_INV)

#     h, w = thresh.shape[:2]
    
#     # Split row horizontally into 3 even option blocks
#     col_w = w // 3
#     sec1 = thresh[:, 0:col_w]
#     sec2 = thresh[:, col_w:col_w*2]
#     sec3 = thresh[:, col_w*2:w]

#     # Calculate density (total dark ink pixels) for each slot
#     densities = [np.sum(sec1 == 255), np.sum(sec2 == 255), np.sum(sec3 == 255)]
#     max_idx = np.argmax(densities)
    
#     # Noise threshold filter: must contain signature/checkmark level of black pixels
#     if densities[max_idx] < 120: 
#         return None

#     mapping = {
#         0: "নিজস্ব জমি (Own Land)",
#         1: "লিজ নেওয়া জমি (Leased Land)",
#         2: "যৌথ মালিকানাধীন জমি (Jointly Owned Land)"
#     }
#     return mapping[max_idx]


# def verify_section_signature(images, target_keyword):
#     """
#     Scans Page 2 and Page 3 to anchor onto a specific section header, 
#     then evaluates a relative crop directly below it for handwriting ink.
#     """
#     # Scan Page 2 (index 1) and Page 3 (index 2) as sections can fluctuate positions
#     for page_idx in [1, 2]:
#         if page_idx >= len(images):
#             continue
            
#         page = images[page_idx]
#         gray = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
#         results = reader.readtext(gray, detail=1)
        
#         for bbox, text, prob in results:
#             if target_keyword in text:
#                 (tl, tr, br, bl) = bbox
                
#                 # Dynamic crop directly underneath the signature header box bounds
#                 sig_crop = safe_crop(page, tl[0] - 50, bl[1] + 5, tr[0] + 150, bl[1] + 120)
                
#                 if is_handwriting_present(sig_crop, threshold_area=250):
#                     return True
#     return False


# # =========================================================
# # CORE DOCUMENT PROCESSING STAGES
# # =========================================================
# def process_page1(page):
#     """Parses Page 1 metadata dynamically relative to layout anchor keys."""
#     output = {"জমি মালিকানার ধরণ": None}
#     gray = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
#     results = reader.readtext(gray, detail=1)
    
#     for bbox, text, prob in results:
#         # Farmer Name Parser
#         if "কৃষক" in text or "নাম" in text:
#             if "কৃষকের নাম" not in output:
#                 (tl, tr, br, bl) = bbox
#                 name_crop = safe_crop(page, tr[0], tl[1]-10, tr[0]+450, br[1]+10)
#                 if name_crop is not None:
#                     thresh_name = preprocess_for_ocr(name_crop)
#                     if thresh_name is not None:
#                         name_text = reader.readtext(thresh_name, detail=0)
#                         if name_text:
#                             cleaned_name = " ".join(name_text).strip()
#                             cleaned_name = re.sub(r"[^A-Za-z\s]", "", cleaned_name).strip()
#                             if len(cleaned_name) > 2:
#                                 output["কৃষকের নাম"] = cleaned_name

#         # Ownership Box Row Segmenter Anchor
#         if "মালিকানা" in text or "ধরণ" in text or "ধরণ:" in text:
#             (tl, tr, br, bl) = bbox
#             ownership_crop = safe_crop(page, tr[0] + 10, tl[1] - 15, tr[0] + 850, br[1] + 15)
#             selected_mode = detect_checked_value(ownership_crop)
#             if selected_mode:
#                 output["জমি মালিকানার ধরণ"] = selected_mode
                
#     if "কৃষকের নাম" not in output:
#         output["কৃষকের নাম"] = "Joyhari Ghorai"
        
#     return output


# # =========================================================
# # PIPELINE DATA EXECUTION CONTROL
# # =========================================================
# def process_document(file_path):
#     final_output = {}
#     images = []

#     if file_path.lower().endswith(".pdf"):
#         pages = convert_from_path(file_path, dpi=200, thread_count=2)
#         for page in pages:
#             img = np.array(page)
#             img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
#             images.append(img)
#     else:
#         img = cv2.imread(file_path)
#         if img is not None:
#             images.append(img)

#     # Constraint Enforcement: Strictly read the first 4 pages, slice execution arrays
#     images = images[:4]

#     if len(images) < 1:
#         raise HTTPException(status_code=400, detail="Invalid or unreadable document format.")

#     # Step 1: Run Page 1 Core Field Extraction 
#     page1_data = process_page1(images[0])
#     final_output.update(page1_data)
    
#     ownership_type = final_output.get("জমি মালিকানার ধরণ")

#     # Step 2: Simplified Conditional Signature Verification Flow
#     is_signed = False
    
#     if ownership_type == "নিজস্ব জমি (Own Land)":
#         is_signed = verify_section_signature(images, "নিজস্ব জমির")
        
#     elif ownership_type == "লিজ নেওয়া জমি (Leased Land)":
#         is_signed = verify_section_signature(images, "লিজ নেওয়া")
        
#     elif ownership_type == "যৌথ মালিকানাধীন জমি (Jointly Owned Land)":
#         is_signed = verify_section_signature(images, "যৌথ মালিকানাধীন")
        
#     else:
#         # Rule Enforcer: If nothing is checked, skip verification processing entirely and throw rejection block
#         raise HTTPException(
#             status_code=422, 
#             detail="Document Processing Rejected: No land ownership checkbox value ticked on page 1."
#         )

#     # Step 3: Global Signed Validation Fallback check
#     if not is_signed:
#         raise HTTPException(
#             status_code=422,
#             detail=f"Document Validation Rejected: Missing required target signature layout block under '{ownership_type}' section."
#         )
        
#     final_output["চুক্তিপত্র যাচাইকরণ"] = "SIGNED & VALID"
#     return final_output


# # =========================================================
# # ENDPOINTS
# # =========================================================
# @app.post("/extract-from-file")
# async def extract_from_file(file: UploadFile = File(...)):
#     suffix = os.path.splitext(file.filename)[1]
#     with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
#         shutil.copyfileobj(file.file, temp_file)
#         temp_path = temp_file.name
#     try:
#         result = process_document(temp_path)
#         return JSONResponse(content=result)
#     finally:
#         if os.path.exists(temp_path):
#             os.remove(temp_path)


# @app.post("/extract-from-url")
# async def extract_from_url(file_url: str):
#     extension = file_url.split(".")[-1]
#     temp_path = None
#     try:
#         response = requests.get(file_url)
#         if response.status_code != 200:
#             raise HTTPException(status_code=400, detail="Unable to retrieve or download file from web target server source url path.")

#         with tempfile.NamedTemporaryFile(delete=False, suffix=f".{extension}") as temp_file:
#             temp_file.write(response.content)
#             temp_path = temp_file.name

#         result = process_document(temp_path)
#         return JSONResponse(content=result)
#     finally:
#         if temp_path and os.path.exists(temp_path):
#             os.remove(temp_path)


# @app.get("/")
# def health():
#     return {"message": "Carbon AI Extraction API Running Successfully"}



##############################################################################

import os
import re
import shutil
import tempfile
import warnings
from urllib.parse import urlparse

import cv2
import easyocr
import numpy as np
import requests
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pdf2image import convert_from_path

# Suppress PyTorch's pin_memory warning on CPU servers
# (any warning whose message matches the ".*pin_memory.*" pattern is ignored).
warnings.filterwarnings("ignore", message=".*pin_memory.*")

# =========================================================
# FASTAPI INITIALIZATION & CORS RULES
# =========================================================
app = FastAPI(title="Carbon AI Extraction API")

# NOTE(review): every origin, method and header is allowed while
# allow_credentials is also True. This is the most permissive CORS setup
# possible; confirm it is intended outside of development, otherwise list
# the trusted origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# =========================================================
# OCR ENGINE INITIALIZATION (Strictly for Page 1)
# =========================================================
# One module-level EasyOCR reader for the 'en' and 'bn' language packs with
# GPU use disabled. It is constructed once, at import time, and reused by
# every call that needs text recognition.
reader = easyocr.Reader(['en', 'bn'], gpu=False)


# =========================================================
# IMAGE PROCESSING & UTILITIES
# =========================================================
def safe_crop(image, x1, y1, x2, y2):
    """Return the region (x1, y1)-(x2, y2) of ``image``, clipped to its canvas.

    Coordinates may be floats; each is truncated with ``int()`` and then
    clamped so the rectangle never leaves the image.

    Returns:
        A NumPy slice of ``image``, or ``None`` when the image is missing or
        empty, or when the clipped rectangle has no area.
    """
    if image is None or image.size == 0:
        return None

    height, width = image.shape[:2]
    left = max(0, int(x1))
    top = max(0, int(y1))
    right = min(width, int(x2))
    bottom = min(height, int(y2))

    has_area = left < right and top < bottom
    return image[top:bottom, left:right] if has_area else None


def preprocess_for_ocr(image):
    """Binarise a BGR crop so its text stands out for the OCR engine.

    The crop is converted to grayscale, upscaled 2x with bicubic
    interpolation, then passed through a Gaussian adaptive threshold
    (block size 15, constant 4).

    Returns:
        The thresholded single-channel image, or ``None`` when ``image`` is
        missing or empty.
    """
    if image is None or image.size == 0:
        return None

    upscaled = cv2.resize(
        cv2.cvtColor(image, cv2.COLOR_BGR2GRAY),
        None,
        fx=2,
        fy=2,
        interpolation=cv2.INTER_CUBIC,
    )
    return cv2.adaptiveThreshold(
        upscaled, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 4
    )


def is_handwriting_present(crop_img, threshold_area=150):
    """Decide whether a BGR crop contains enough ink to count as handwriting.

    Pixels with a gray value of 200 or less are treated as ink. The ink mask
    is cleaned with a 2x2 morphological opening, external contours are
    extracted, and the areas of contours strictly between 15 and 10000 are
    summed.

    Args:
        crop_img: BGR image region to inspect.
        threshold_area: Summed contour area that must be exceeded.

    Returns:
        ``True`` when the summed area is greater than ``threshold_area``;
        ``False`` otherwise, including for a missing or empty crop.
    """
    if crop_img is None or crop_img.size == 0:
        return False

    grayscale = cv2.cvtColor(crop_img, cv2.COLOR_BGR2GRAY)
    _, ink_mask = cv2.threshold(grayscale, 200, 255, cv2.THRESH_BINARY_INV)

    # The opening removes speckles smaller than the 2x2 structuring element.
    ink_mask = cv2.morphologyEx(
        ink_mask,
        cv2.MORPH_OPEN,
        cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)),
    )

    blobs, _ = cv2.findContours(ink_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    blob_areas = (cv2.contourArea(blob) for blob in blobs)
    ink_area = sum(area for area in blob_areas if 15 < area < 10000)

    return ink_area > threshold_area


def detect_checked_value(ownership_crop):
    """Pick the ticked land-ownership option from the options row.

    The crop is binarised (gray value of 210 or less counts as ink) and cut
    into three side-by-side column zones of width ``w // 3``; the last zone
    absorbs any remainder. The zone holding the most ink pixels wins, with
    ties going to the leftmost zone.

    Returns:
        The label of the winning zone, or ``None`` when the crop is missing
        or empty, or when the winning zone has fewer than 120 ink pixels.
    """
    if ownership_crop is None or ownership_crop.size == 0:
        return None

    labels = (
        "নিজস্ব জমি (Own Land)",
        "লিজ নেওয়া জমি (Leased Land)",
        "যৌথ মালিকানাধীন জমি (Jointly Owned Land)",
    )

    grayscale = cv2.cvtColor(ownership_crop, cv2.COLOR_BGR2GRAY)
    ink_mask = cv2.threshold(grayscale, 210, 255, cv2.THRESH_BINARY_INV)[1]

    width = ink_mask.shape[1]
    third = width // 3
    zone_bounds = ((0, third), (third, third * 2), (third * 2, width))

    # Ink pixel count per zone, left to right.
    ink_counts = [
        np.count_nonzero(ink_mask[:, start:stop] == 255)
        for start, stop in zone_bounds
    ]

    winner = int(np.argmax(ink_counts))
    if ink_counts[winner] < 120:
        return None
    return labels[winner]


# Signature template per ownership type:
#   (index into ``images``, fractional crop box (x1, y1, x2, y2), minimum ink area)
# Index 1 is page 2 of the form, index 2 is page 3. The crop box is expressed
# as fractions of the page width (x) and height (y).
_SIGNATURE_ZONES = {
    "নিজস্ব জমি (Own Land)": (1, (0.45, 0.70, 0.95, 0.92), 300),
    "লিজ নেওয়া জমি (Leased Land)": (1, (0.10, 0.60, 0.65, 0.85), 300),
    "যৌথ মালিকানাধীন জমি (Jointly Owned Land)": (2, (0.40, 0.15, 0.95, 0.75), 250),
}


def verify_signature_by_template_zone(images, ownership_type):
    """Check the fixed signature zone that belongs to ``ownership_type`` for ink.

    No OCR is run here. The page and the crop rectangle are looked up in
    ``_SIGNATURE_ZONES`` and the crop is handed to ``is_handwriting_present``.

    Args:
        images: Page images indexed by page number minus one. Entries that
            are not needed may be ``None``.
        ownership_type: One of the ownership labels produced by
            ``detect_checked_value``.

    Returns:
        ``True`` when the zone contains enough ink. ``False`` when the
        ownership type is unknown, the required page is absent or ``None``,
        or the zone does not contain enough ink.
    """
    zone = _SIGNATURE_ZONES.get(ownership_type)
    if zone is None:
        return False

    page_index, (fx1, fy1, fx2, fy2), min_ink_area = zone
    if page_index >= len(images):
        return False

    page = images[page_index]
    if page is None:
        # An unreadable page counts as unsigned instead of crashing on .shape.
        return False

    h, w = page.shape[:2]
    sig_zone = safe_crop(page, w * fx1, h * fy1, w * fx2, h * fy2)
    return is_handwriting_present(sig_zone, threshold_area=min_ink_area)


# =========================================================
# CORE DOCUMENT PROCESSING STAGES
# =========================================================
def _read_farmer_name(page, bbox):
    """OCR the strip to the right of a name-label box and return the cleaned name, or None."""
    tl, tr, br, _bl = bbox
    name_crop = safe_crop(page, tr[0], tl[1] - 10, tr[0] + 450, br[1] + 10)
    if name_crop is None:
        return None

    thresh_name = preprocess_for_ocr(name_crop)
    if thresh_name is None:
        return None

    name_text = reader.readtext(thresh_name, detail=0)
    if not name_text:
        return None

    # Only Latin letters and whitespace are kept; everything else is dropped.
    cleaned_name = re.sub(r"[^A-Za-z\s]", "", " ".join(name_text)).strip()
    return cleaned_name if len(cleaned_name) > 2 else None


def process_page1(page):
    """Extract the farmer name and the ticked land-ownership type from page 1.

    The whole page is OCR'd once. Each detected text box is then used as an
    anchor:

    * a box containing "কৃষক" or "নাম" triggers a second OCR pass on the strip
      to its right to read the farmer name (the first usable result wins);
    * a box containing "মালিকানা" or "ধরণ" triggers the checkbox analysis on
      the strip to its right (the last anchor that yields a result wins).

    Args:
        page: BGR image of page 1.

    Returns:
        A dict with the keys "জমি মালিকানার ধরণ" (ownership label or ``None``)
        and "কৃষকের নাম" (extracted name, or the placeholder default).
    """
    # NOTE(review): when no name can be read, this hard-coded placeholder is
    # returned as if it were the farmer's name. It is kept for response
    # compatibility; confirm whether an explicit "missing" value is wanted.
    default_name = "Joyhari Ghorai"
    output = {"জমি মালিকানার ধরণ": None, "কৃষকের নাম": default_name}

    # A dedicated flag is used instead of comparing against the placeholder,
    # so a genuinely extracted name equal to the placeholder is not re-read.
    name_found = False

    gray = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
    results = reader.readtext(gray, detail=1)

    for bbox, text, _prob in results:
        # Farmer Name Parser
        if not name_found and ("কৃষক" in text or "নাম" in text):
            farmer_name = _read_farmer_name(page, bbox)
            if farmer_name:
                output["কৃষকের নাম"] = farmer_name
                name_found = True

        # Ownership Box Row Anchor
        if "মালিকানা" in text or "ধরণ" in text:
            tl, tr, br, _bl = bbox
            ownership_crop = safe_crop(
                page, tr[0] + 10, tl[1] - 15, tr[0] + 850, br[1] + 15
            )
            selected_mode = detect_checked_value(ownership_crop)
            if selected_mode:
                output["জমি মালিকানার ধরণ"] = selected_mode

    return output


# =========================================================
# PIPELINE DATA EXECUTION CONTROL (High Speed Optimized)
# =========================================================
def _render_pdf_pages(file_path, first_page, last_page):
    """Render an inclusive PDF page range at 130 DPI and return the pages as BGR arrays."""
    rendered = convert_from_path(
        file_path,
        dpi=130,
        first_page=first_page,
        last_page=last_page,
        thread_count=2,
    )
    return [cv2.cvtColor(np.array(pil_page), cv2.COLOR_RGB2BGR) for pil_page in rendered]


def process_document(file_path):
    """Run the extraction and signature-verification pipeline on one document.

    Page 1 is parsed first. Only the pages needed for the detected ownership
    type are loaded afterwards: page 2 for own or leased land, pages 2-3 for
    joint ownership.

    Args:
        file_path: Path to a PDF (detected by a ".pdf" suffix, case-insensitive)
            or to an image file readable by ``cv2.imread``.

    Returns:
        The page 1 fields plus "চুক্তিপত্র যাচাইকরণ" set to "SIGNED & VALID".

    Raises:
        HTTPException: 400 when page 1 cannot be rendered or read; 422 when no
            ownership option is detected or the expected signature is missing.
    """
    final_output = {}
    is_pdf = file_path.lower().endswith(".pdf")

    # Step 1: load page 1 only.
    if is_pdf:
        first_pages = _render_pdf_pages(file_path, 1, 1)
        if len(first_pages) < 1:
            raise HTTPException(status_code=400, detail="Unable to render verification source files.")
        page1 = first_pages[0]
        del first_pages
    else:
        page1 = cv2.imread(file_path)
        if page1 is None:
            raise HTTPException(status_code=400, detail="Invalid input image file pathway.")

    final_output.update(process_page1(page1))

    if is_pdf:
        # Page 1 pixels are no longer needed; release them before rendering more pages.
        page1 = None

    ownership_type = final_output.get("জমি মালিকানার ধরণ")
    if not ownership_type:
        raise HTTPException(
            status_code=422,
            detail="Document Processing Rejected: No land ownership checkbox value ticked on page 1."
        )

    # Step 2: load the verification pages. Index 0 is a placeholder so that
    # list indices line up with (page number - 1).
    images_pipeline = [None]

    if is_pdf:
        needs_page2_only = ownership_type in [
            "নিজস্ব জমি (Own Land)",
            "লিজ নেওয়া জমি (Leased Land)",
        ]
        last_page = 2 if needs_page2_only else 3
        images_pipeline.extend(_render_pdf_pages(file_path, 2, last_page))
    else:
        # A single image stands in for page 2. The already-decoded image is
        # reused instead of reading the file from disk a second time.
        # NOTE(review): with only one image there is never a page 3, so joint
        # ownership uploads sent as an image always fail verification; confirm
        # whether that is intended.
        images_pipeline.append(page1)

    # Step 3: check the template signature zone for handwriting.
    is_signed = verify_signature_by_template_zone(images_pipeline, ownership_type)

    if not is_signed:
        raise HTTPException(
            status_code=422,
            detail=f"Document Validation Rejected: Missing required signature layout block under '{ownership_type}' section."
        )

    final_output["চুক্তিপত্র যাচাইকরণ"] = "SIGNED & VALID"
    return final_output


# =========================================================
# ENDPOINTS
# =========================================================
@app.post("/extract-from-file")
async def extract_from_file(file: UploadFile = File(...)):
    """Process an uploaded PDF or image and return the extracted fields as JSON.

    The upload is spooled to a temporary file that keeps the original file
    extension, because ``process_document`` picks the PDF or image path from
    the suffix. The temporary file is always removed afterwards.

    Raises:
        HTTPException: Propagated from ``process_document``.
    """
    # filename may be absent on some multipart uploads; fall back to no suffix.
    suffix = os.path.splitext(file.filename or "")[1]
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            # Record the path before copying so a failed copy is still cleaned up.
            temp_path = temp_file.name
            shutil.copyfileobj(file.file, temp_file)

        # NOTE(review): process_document is synchronous and runs OCR; inside an
        # async endpoint it occupies the event loop for the whole request.
        result = process_document(temp_path)
        return JSONResponse(content=result)
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)


# Upper bound, in seconds, passed to requests for the remote download.
_DOWNLOAD_TIMEOUT_SECONDS = 30


@app.post("/extract-from-url")
async def extract_from_url(file_url: str):
    """Download a PDF or image from ``file_url`` and return the extracted fields as JSON.

    The file extension is taken from the path component of the URL only, so
    query strings and extension-less URLs no longer produce an invalid
    temporary-file suffix. The temporary file is always removed afterwards.

    Raises:
        HTTPException: 400 when the URL is malformed, the download fails or
            times out, or the server does not answer with status 200; other
            codes are propagated from ``process_document``.
    """
    download_error = "Unable to pull file from remote URL target."

    try:
        suffix = os.path.splitext(urlparse(file_url).path)[1]
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=download_error) from exc

    temp_path = None
    try:
        # NOTE(review): file_url is caller-controlled and fetched server-side,
        # which exposes internal network addresses to the caller (SSRF).
        # Consider restricting the allowed hosts.
        # NOTE(review): this download and process_document are both blocking
        # calls inside an async endpoint.
        try:
            response = requests.get(file_url, timeout=_DOWNLOAD_TIMEOUT_SECONDS)
        except requests.RequestException as exc:
            raise HTTPException(status_code=400, detail=download_error) from exc

        if response.status_code != 200:
            raise HTTPException(status_code=400, detail=download_error)

        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            # Record the path before writing so a failed write is still cleaned up.
            temp_path = temp_file.name
            temp_file.write(response.content)

        result = process_document(temp_path)
        return JSONResponse(content=result)
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)


@app.get("/")
def health():
    """Health-check endpoint: return a fixed status message."""
    status_payload = {"message": "Carbon AI Extraction API Running Successfully"}
    return status_payload