######################
# This is the latest code, obtained from Claude 4.5, which is running on the 188 server.
#####################

# from fastapi import FastAPI, HTTPException, Request
# from pydantic import BaseModel
# import pytesseract
# import re
# import requests
# import tempfile
# import pdfplumber
# import os
# from pdf2image import convert_from_path
# from fastapi.responses import JSONResponse
# import logging

# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# OCR_LANG = "ben+eng"
# DPI = 400

# app = FastAPI(title="Land PDF Search API")


# class SearchRequest(BaseModel):
#     pdf_urls: list[str]
#     JL_No: list[str]
#     Daag_No: list[str]


# BN_TO_EN = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")


# def normalize_text(text: str) -> str:
#     text = text.translate(BN_TO_EN)
#     text = re.sub(r"\s+", " ", text).strip()
#     return text


# def clean_cell(cell: str | None) -> str:
#     if not cell:
#         return ""
#     return normalize_text(cell)


# # ─────────────────────────────────────────────────────────────────────────────
# # JL: anchor on the Bengali/English label to avoid false positives
# # ─────────────────────────────────────────────────────────────────────────────
# def extract_jl_numbers(text: str) -> set[str]:
#     labeled = set(re.findall(
#         r"(?:জে\.?\s*এল\.?\s*নং|J\.?\s*L\.?\s*No\.?)\s*[:\-]?\s*(\d{2,4})",
#         text,
#         re.IGNORECASE
#     ))
#     if labeled:
#         return labeled
#     # Fallback: any 2-4 digit standalone number
#     return set(re.findall(r"\b\d{2,4}\b", text))


# # ─────────────────────────────────────────────────────────────────────────────
# # Daag: just a 2-4 digit number followed by a Bengali character
# # Matches "182 দলা ..." and "796 খাস জমি ..." -> captures 182, 796
# # ─────────────────────────────────────────────────────────────────────────────
# DAAG_LINE_PATTERN = re.compile(
#     r"\b(\d{2,4})\s+[\u0980-\u09FF]",
#     re.UNICODE
# )


# def extract_daag_from_pdf_text(pdf_path: str) -> set[str]:
#     results = set()
#     with pdfplumber.open(pdf_path) as pdf:
#         for page in pdf.pages:
#             # Primary: table-based extraction
#             table_settings = {
#                 "vertical_strategy": "lines",
#                 "horizontal_strategy": "lines",
#                 "snap_tolerance": 3,
#                 "join_tolerance": 3,
#             }
#             for table in page.extract_tables(table_settings):
#                 for row in table:
#                     if not row:
#                         continue
#                     daag_cell = clean_cell(row[0])
#                     if re.fullmatch(r"\d{2,4}", daag_cell):
#                         results.add(daag_cell)

#             # Fallback: raw text scan
#             text = normalize_text(page.extract_text() or "")
#             results.update(DAAG_LINE_PATTERN.findall(text))

#     return results


# def extract_daag_numbers_ocr(text: str) -> set[str]:
#     text = normalize_text(text)
#     text = re.sub(r"দা\s*গ", "দাগ", text)
#     return set(DAAG_LINE_PATTERN.findall(text))


# def download_pdf(url: str, folder: str) -> str:
#     path = os.path.join(folder, os.path.basename(url))
#     r = requests.get(url, timeout=30)
#     r.raise_for_status()
#     with open(path, "wb") as f:
#         f.write(r.content)
#     return path


# @app.post("/extract")
# def extract_land_data(payload: SearchRequest):
#     req_jl = set(payload.JL_No)
#     req_daag = set(payload.Daag_No)
#     found_jl = set()
#     found_daag = set()
#     errors = []

#     with tempfile.TemporaryDirectory() as tmpdir:
#         for url in payload.pdf_urls:

#             # Download
#             try:
#                 pdf_path = download_pdf(url, tmpdir)
#             except Exception as e:
#                 msg = f"Failed to download {url}: {e}"
#                 logger.error(msg)
#                 errors.append(msg)
#                 continue

#             # OCR for JL numbers
#             ocr_text = ""
#             try:
#                 images = convert_from_path(pdf_path, dpi=DPI)
#                 for img in images:
#                     page_ocr = pytesseract.image_to_string(
#                         img, lang=OCR_LANG, config="--psm 6"
#                     )
#                     ocr_text += " " + normalize_text(page_ocr)

#                 found_jl.update(req_jl & extract_jl_numbers(ocr_text))
#             except Exception as e:
#                 msg = f"OCR failed for {url}: {e}"
#                 logger.error(msg)
#                 errors.append(msg)

#             # Primary: text-layer Daag extraction
#             try:
#                 daag_from_pdf = extract_daag_from_pdf_text(pdf_path)
#                 found_daag.update(req_daag & daag_from_pdf)
#             except Exception as e:
#                 msg = f"pdfplumber extraction failed for {url}: {e}"
#                 logger.error(msg)
#                 errors.append(msg)

#             # Fallback: OCR-based Daag extraction for anything still missing
#             if (req_daag - found_daag) and ocr_text:
#                 found_daag.update(req_daag & extract_daag_numbers_ocr(ocr_text))

#     response = {
#         "found": {
#             "JL_No": sorted(found_jl),
#             "Daag_No": sorted(found_daag),
#         },
#         "not_found": {
#             "JL_No": sorted(req_jl - found_jl),
#             "Daag_No": sorted(req_daag - found_daag),
#         },
#         "errors": errors,
#     }

#     if not response["not_found"]["JL_No"] and not response["not_found"]["Daag_No"]:
#         return response

#     raise HTTPException(status_code=422, detail=response)


import logging
import os
import re
import tempfile
import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Set

import requests
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from pdf2image import convert_from_path
import pytesseract
import pdfplumber
from PIL import Image  # for preprocessing

# Configure root logging at INFO level; `logger` is the module logger used by
# the download and extraction helpers in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Language spec passed as `lang` to pytesseract (Bengali + English).
OCR_LANG = "ben+eng"
DPI = 200                    # Rasterization DPI passed to pdf2image; reduced from 400 to speed up OCR
MAX_WORKERS = 4              # Used for pdf2image's thread_count and the OCR thread pool; adjust to CPU cores / memory

# FastAPI application object; the /extract route is registered on it.
app = FastAPI(title="Land PDF Search API")

class SearchRequest(BaseModel):
    # Request body of POST /extract.
    # The requested numbers are compared verbatim (as strings) with the
    # numbers extracted from the PDFs; they are not normalized first.
    pdf_urls: list[str]  # URLs of the PDFs to download and search
    JL_No: list[str]     # JL numbers to look for
    Daag_No: list[str]   # Daag numbers to look for


# Translation table that maps the Bengali digits ০-৯ onto ASCII 0-9.
BN_TO_EN = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")


def normalize_text(text: str) -> str:
    """Return *text* with Bengali digits as ASCII and whitespace collapsed.

    Every run of whitespace (spaces, tabs, newlines, ...) becomes a single
    space, and leading/trailing whitespace is removed.
    """
    ascii_digits = text.translate(BN_TO_EN)
    # split() without a separator breaks on whitespace runs and discards
    # leading/trailing whitespace, so re-joining with one space collapses
    # the text in a single pass.
    return " ".join(ascii_digits.split())


def clean_cell(cell: str | None) -> str:
    return normalize_text(cell) if cell else ""


def extract_jl_numbers(text: str) -> set[str]:
    """Return the candidate JL numbers found in *text*.

    Numbers that directly follow a Bengali or English "JL No" label are
    preferred. Only when no labelled number exists does the function fall
    back to every standalone 2-4 digit number in the text.
    """
    label_then_number = (
        r"(?:জে\.?\s*এল\.?\s*নং|J\.?\s*L\.?\s*No\.?)\s*[:\-]?\s*(\d{2,4})"
    )
    labelled = set(re.findall(label_then_number, text, re.IGNORECASE))
    if labelled:
        return labelled

    # No label matched: accept any standalone 2-4 digit number.
    standalone = re.findall(r"\b\d{2,4}\b", text)
    return set(standalone)


# Matches a 2-4 digit number (starting at a word boundary) that is followed by
# whitespace and then a character from the Bengali Unicode block
# (U+0980-U+09FF); the number is the only capture group.
# Example: "182 দলা" -> captures "182".
DAAG_LINE_PATTERN = re.compile(r"\b(\d{2,4})\s+[\u0980-\u09FF]", re.UNICODE)


def preprocess_image_for_ocr(img: Image.Image) -> Image.Image:
    """Return a grayscale ("L" mode) version of *img* for OCR.

    An image that is already in "L" mode is returned unchanged. No contrast
    enhancement is applied.
    """
    already_grayscale = img.mode == "L"
    return img if already_grayscale else img.convert("L")


def ocr_page(img: Image.Image) -> str:
    """OCR one page image with Tesseract and return the recognized text.

    The image is first passed through ``preprocess_image_for_ocr``. The text
    is returned exactly as Tesseract produced it (not normalized).
    """
    # Page segmentation mode 6, with the inverted-image pass switched off
    # (kept from the original as a small speed gain).
    tesseract_config = "--psm 6 -c tessedit_do_invert=0"
    prepared = preprocess_image_for_ocr(img)
    return pytesseract.image_to_string(
        prepared, lang=OCR_LANG, config=tesseract_config
    )


def extract_jl_from_ocr(pdf_path: str) -> set[str]:
    """OCR every page of a PDF in parallel and extract candidate JL numbers.

    Pages are rasterized with pdf2image and OCR'd concurrently in a thread
    pool. The per-page texts are joined in page order before the JL patterns
    are applied, so the result is deterministic and text from unrelated
    pages is never placed next to each other by scheduling accidents.

    Args:
        pdf_path: Path of the PDF file on disk.

    Returns:
        The JL numbers found by ``extract_jl_numbers`` in the combined OCR
        text. A page whose OCR fails is skipped (logged as a warning); if
        rasterization or anything else fails, the error is logged and an
        empty set is returned.
    """
    try:
        images = convert_from_path(
            pdf_path,
            dpi=DPI,
            thread_count=MAX_WORKERS,      # pdf2image internal parallelism
            fmt="jpeg",
        )

        ocr_texts = []
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            # Submit all pages up front so they are OCR'd concurrently.
            futures = [executor.submit(ocr_page, img) for img in images]
            # Collect in submission (page) order rather than completion
            # order; waiting on each future in turn does not reduce the
            # parallelism because every task is already running or queued.
            for future in futures:
                try:
                    ocr_texts.append(normalize_text(future.result()))
                except Exception as e:
                    # Best effort: one unreadable page must not discard the
                    # text recovered from the others.
                    logger.warning(f"OCR failed on one page: {e}")

        return extract_jl_numbers(" ".join(ocr_texts))

    except Exception as e:
        logger.error(f"OCR failed for {pdf_path}: {e}")
        return set()


def extract_daag_from_pdf_text(pdf_path: str) -> set[str]:
    """Collect candidate Daag numbers from a PDF's text layer via pdfplumber.

    For each page, two sources are combined:

    * the first cell of every row of every line-delimited table, when that
      cell (after normalization) consists solely of 2-4 digits;
    * every ``DAAG_LINE_PATTERN`` match in the page's normalized plain text.

    Any exception is logged, and whatever was collected up to that point is
    returned.
    """
    daag_numbers: set[str] = set()
    try:
        with pdfplumber.open(pdf_path) as document:
            for page in document.pages:
                # Detect tables from ruling lines in both directions.
                line_based = dict(
                    vertical_strategy="lines",
                    horizontal_strategy="lines",
                    snap_tolerance=3,
                    join_tolerance=3,
                )
                for table in page.extract_tables(line_based):
                    for row in table or []:
                        if not (row and row[0]):
                            continue
                        first_cell = clean_cell(row[0])
                        if re.fullmatch(r"\d{2,4}", first_cell) is not None:
                            daag_numbers.add(first_cell)

                # Plain-text scan of the same page.
                page_text = page.extract_text() or ""
                daag_numbers.update(
                    DAAG_LINE_PATTERN.findall(normalize_text(page_text))
                )
    except Exception as e:
        logger.error(f"pdfplumber failed for {pdf_path}: {e}")

    return daag_numbers


def extract_daag_numbers_ocr(ocr_text: str) -> set[str]:
    """Return the ``DAAG_LINE_PATTERN`` matches found in OCR output.

    The text is normalized first, and a "দাগ" that OCR split with whitespace
    ("দা গ") is re-joined before the pattern is applied.
    """
    normalized = normalize_text(ocr_text)
    rejoined = re.sub(r"দা\s*গ", "দাগ", normalized)
    matches = DAAG_LINE_PATTERN.findall(rejoined)
    return set(matches)


def download_pdf(url: str, folder: str) -> str:
    """Download *url* into *folder* and return the path of the saved file.

    The file is stored under a unique name created with ``tempfile.mkstemp``.
    Deriving the name directly from the URL is unsafe: two URLs with the same
    last path segment would overwrite each other, query strings would leak
    into the file name, and a URL ending in "/" yields an empty name. A
    sanitized piece of the URL's last path segment is kept as a prefix so
    log messages that mention the path stay recognizable.

    Args:
        url: Location of the PDF.
        folder: Existing directory in which to store the download.

    Returns:
        Path of the newly written file inside *folder*.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an error status.
        OSError: If the file cannot be created or written.
    """
    # Fetch first so a failed request leaves no empty file behind.
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    # Readable name hint: last path segment without query/fragment or
    # extension, reduced to filesystem-safe characters and bounded in length.
    last_segment = os.path.basename(url.split("?", 1)[0].split("#", 1)[0])
    stem = re.sub(r"[^A-Za-z0-9._-]", "_", os.path.splitext(last_segment)[0])
    stem = stem[:50] or "download"

    # mkstemp guarantees a name that does not collide with earlier downloads.
    fd, path = tempfile.mkstemp(prefix=f"{stem}_", suffix=".pdf", dir=folder)
    with os.fdopen(fd, "wb") as f:
        f.write(r.content)
    return path


def _search_pdfs(payload: SearchRequest) -> dict:
    """Download every requested PDF and search it for the JL/Daag numbers.

    This is the blocking part of the ``/extract`` endpoint (HTTP downloads,
    OCR, pdfplumber) and is meant to run in a worker thread.

    Args:
        payload: Parsed request body.

    Returns:
        A dict with the keys ``"found"`` and ``"not_found"`` (each holding
        sorted ``"JL_No"`` / ``"Daag_No"`` lists) and ``"errors"`` (a list of
        messages for steps that raised).
    """
    req_jl = set(payload.JL_No)
    req_daag = set(payload.Daag_No)
    found_jl: Set[str] = set()
    found_daag: Set[str] = set()
    errors = []

    # All downloads live in a temporary directory that is removed on exit.
    with tempfile.TemporaryDirectory() as tmpdir:
        # Download all PDFs first; a failed download is recorded and skipped.
        pdf_paths = []
        for url in payload.pdf_urls:
            try:
                pdf_paths.append(download_pdf(url, tmpdir))
            except Exception as e:
                msg = f"Failed to download {url}: {e}"
                logger.error(msg)
                errors.append(msg)

        for pdf_path in pdf_paths:
            # 1. JL numbers via OCR (the most expensive step).
            try:
                jl_from_ocr = extract_jl_from_ocr(pdf_path)
                found_jl.update(req_jl & jl_from_ocr)
            except Exception as e:
                msg = f"JL OCR failed for {pdf_path}: {e}"
                logger.error(msg)
                errors.append(msg)

            # 2. Daag numbers from the PDF text layer via pdfplumber.
            try:
                daag_from_pdf = extract_daag_from_pdf_text(pdf_path)
                found_daag.update(req_daag & daag_from_pdf)
            except Exception as e:
                msg = f"pdfplumber extraction failed for {pdf_path}: {e}"
                logger.error(msg)
                errors.append(msg)

            # 3. OCR fallback, only while some requested Daag numbers are
            #    still missing. The pages are OCR'd again here because
            #    extract_jl_from_ocr does not expose its OCR text.
            if req_daag - found_daag:
                try:
                    images = convert_from_path(pdf_path, dpi=DPI, fmt="jpeg")
                    ocr_text = " ".join(
                        normalize_text(ocr_page(img)) for img in images
                    )
                    found_daag.update(req_daag & extract_daag_numbers_ocr(ocr_text))
                except Exception as e:
                    logger.warning(f"Daag OCR fallback failed: {e}")

    return {
        "found": {
            "JL_No": sorted(found_jl),
            "Daag_No": sorted(found_daag),
        },
        "not_found": {
            "JL_No": sorted(req_jl - found_jl),
            "Daag_No": sorted(req_daag - found_daag),
        },
        "errors": errors,
    }


@app.post("/extract")
async def extract_land_data(payload: SearchRequest):
    """Search the given PDFs for the requested JL and Daag numbers.

    Returns the result dict when every requested number was found in at
    least one PDF.

    Raises:
        HTTPException: Status 422 with the same result dict as ``detail``
            when any requested JL or Daag number was not found.
    """
    # The search performs only blocking work (requests, OCR, pdfplumber).
    # Running it inline in an `async def` endpoint would stall the event
    # loop, and with it every other request, so it is handed to a worker
    # thread instead.
    response = await asyncio.to_thread(_search_pdfs, payload)

    not_found = response["not_found"]
    if not not_found["JL_No"] and not not_found["Daag_No"]:
        return response

    raise HTTPException(status_code=422, detail=response)



# @app.post("/quick_check")
# async def quick_check(request: Request):
#     return JSONResponse(
#         status_code=200,
#         content={
#             "found": {
#                 "JL_No": ["63", "106", "125"],
#                 "Daag_No": ["182", "261", "946"],
#             },
#             "not_found": {
#                 "JL_No": [],
#                 "Daag_No": [],
#             },
#         },
#     )