"""
Banglarbhumi Land Record PDF Extractor — Dynamic Layout Mapping (DLM)
Original extraction preserved + Safe Bengali Transliteration
"""

import re, json, sys, subprocess
from pathlib import Path
from difflib import SequenceMatcher


# ==============================
# BASIC UTILS
# ==============================
# Translation table mapping each Bengali digit to its ASCII counterpart.
BN_TO_EN = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")


def bn_to_en(t: str) -> str:
    """Return *t* with every Bengali digit replaced by the ASCII digit.

    All other characters are passed through unchanged.
    """
    return str.translate(t, BN_TO_EN)

# A plain decimal such as "0.1250": digits, one dot, digits. Used to decide
# whether the share ("ansha") / area cell of a table row holds a real value.
DECIMAL_RE  = re.compile(r"^\d+\.\d+$")
# A khatian-number-like token: only digits and slashes. Used when calibrating
# the owner column boundary in build_column_map.
KHATIAN_RE  = re.compile(r"^[\d/]+$")
# Word texts that are dropped before table rows / owner names are assembled.
SKIP_TEXTS  = {"ব্যাক্তি", "Nil", "Remarks", "Click", "Here", "(", ")", "/"}

# ==============================
# ✅ NAME NORMALIZATION
# ==============================
def normalize_name(name: str) -> str:
    """Return a canonical form of *name* for fuzzy comparison.

    The name is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, and runs of whitespace
    are collapsed to single spaces with none left at either end.

    Args:
        name: Raw name; ``None`` or an empty string is accepted.

    Returns:
        The normalized name, or ``""`` when *name* is falsy or contains
        nothing but punctuation/whitespace.
    """
    if not name:
        return ""

    spaced = re.sub(r"[^\w\s]", " ", name.lower())

    # Split/join AFTER punctuation has been turned into spaces; trimming
    # before that step would leave a stray edge space for inputs such as
    # "S.K." and skew similarity scores.
    return " ".join(spaced.split())


def name_match_score(name1: str, name2: str) -> float:
    """Compare two names and return their similarity in the range 0..1.

    Each name is first passed through ``transliterate_bengali_name`` and then
    ``normalize_name``; the two results are compared with
    ``difflib.SequenceMatcher`` and the ratio is rounded to two decimals.
    If either name normalizes to an empty string the score is ``0.0``.
    """
    left, right = (
        normalize_name(transliterate_bengali_name(raw))
        for raw in (name1, name2)
    )

    if left and right:
        similarity = SequenceMatcher(None, left, right).ratio()
        return round(similarity, 2)

    return 0.0

# ==============================
# ✅ NAME NORMALIZATION
# ==============================


# ==============================
# ✅ SAFE TRANSLITERATION ONLY
# ==============================
def transliterate_bengali_name(name: str) -> str:
    """Best-effort conversion of a Bengali name to title-cased Latin text.

    - A falsy *name* yields ``""``.
    - A name with no character in U+0980..U+09FF is only trimmed and
      title-cased.
    - Otherwise the name is run through ``indic_transliteration``
      (Bengali -> ITRANS), ``~`` and ``.`` are removed, and the result is
      trimmed and title-cased.
    - If the transliteration step raises for any reason (including the
      package not being importable), the trimmed input is returned as is.
    """
    if not name:
        return ""

    cleaned = name.strip()

    # Nothing from the Bengali block present: treat as already Latin.
    if re.search(r"[\u0980-\u09FF]", cleaned) is None:
        return cleaned.title()

    try:
        from indic_transliteration.sanscript import transliterate, BENGALI, ITRANS

        latin = transliterate(cleaned, BENGALI, ITRANS)

        # Drop the ITRANS marker characters that are unwanted in a name.
        for marker in ("~", "."):
            latin = latin.replace(marker, "")

        return latin.strip().title()

    except Exception:
        # Deliberate best-effort: never let transliteration break extraction.
        return cleaned


# ==============================
# WORD EXTRACTION
# ==============================
def get_words(pdf_path: str) -> list[dict]:
    """Extract every word of a PDF together with its bounding box.

    Runs ``pdftotext -bbox`` on *pdf_path* and parses the ``<word>`` elements
    of its output.

    Args:
        pdf_path: Path of the PDF file handed to ``pdftotext``.

    Returns:
        One dict per non-empty word with the keys ``text``, ``xmin``,
        ``ymin``, ``xmax``, ``ymax`` (floats) and ``page``. The output is
        split on ``<page ...>`` tags and the text before the first tag counts
        as chunk 0, so words of the first page carry ``page == 1``.

    Raises:
        RuntimeError: If ``pdftotext`` exits with a non-zero status.
    """
    # Local import keeps this block self-contained (html is stdlib).
    from html import unescape

    result = subprocess.run(
        ["pdftotext", "-bbox", pdf_path, "-"],
        capture_output=True, text=True, encoding="utf-8"
    )

    if result.returncode != 0:
        raise RuntimeError(f"pdftotext -bbox failed: {result.stderr}")

    # Compiled once instead of being re-looked-up for every page chunk.
    word_re = re.compile(
        r'<word xMin="([\d.]+)" yMin="([\d.]+)" xMax="([\d.]+)" yMax="([\d.]+)">([^<]*)</word>'
    )

    words = []

    for page_num, chunk in enumerate(re.split(r'<page\b[^>]*>', result.stdout)):
        for m in word_re.finditer(chunk):
            # The -bbox output is XHTML, so characters such as & < > " are
            # written as entities; decode them to recover the real text.
            text = unescape(m.group(5)).strip()

            if text:
                words.append({
                    "text": text,
                    "xmin": float(m.group(1)),
                    "ymin": float(m.group(2)),
                    "xmax": float(m.group(3)),
                    "ymax": float(m.group(4)),
                    "page": page_num,
                })

    return words


# ==============================
# HEADER EXTRACTION
# ==============================
def get_layout_text(pdf_path: str) -> str:
    """Return the text of a PDF as produced by ``pdftotext -layout``.

    Args:
        pdf_path: Path of the PDF file handed to ``pdftotext``.

    Returns:
        The full standard output of the command, decoded as UTF-8.

    Raises:
        RuntimeError: If ``pdftotext`` exits with a non-zero status. This
            mirrors ``get_words`` so a failed conversion is reported instead
            of silently producing an empty header.
    """
    r = subprocess.run(
        ["pdftotext", "-layout", pdf_path, "-"],
        capture_output=True, text=True, encoding="utf-8"
    )

    if r.returncode != 0:
        raise RuntimeError(f"pdftotext -layout failed: {r.stderr}")

    return r.stdout


# def extract_header(text: str) -> dict:
#     h = {}

#     m = re.search(r"জে\.এল\s+নং\s+([\d০-৯]+)", text)
#     if m:
#         h["jl_no"] = bn_to_en(m.group(1))

#     m = re.search(r"^\s{3,10}([\d০-৯/]{2,6})\s{2,}\S+\s+[\d.]+\s+Click Here", text, re.MULTILINE)
#     if m:
#         h["daag_no"] = bn_to_en(m.group(1))

#     m = re.search(r"মৌজাঃ\s*(\S+)", text)
#     if m:
#         h["mouza"] = m.group(1).strip()

#     m = re.search(r"ব্লকঃ\s*([A-Z][A-Z0-9\-]*)", text)
#     if m:
#         h["block"] = m.group(1).strip()

#     m = re.search(r"জেলাঃ\s*([A-Z]+)", text)
#     if m:
#         h["district"] = m.group(1).strip()

#     return h


def extract_header(text: str) -> dict:
    """Pull the header fields out of the layout-mode text of a record.

    Each field is located with its own regular expression; a field whose
    pattern does not match is simply absent from the result.

    Returns:
        A dict that may contain, in this order, ``jl_no``, ``daag_no``,
        ``mouza``, ``block``, ``district`` and ``total_land_acre``. Numeric
        fields have Bengali digits converted to ASCII digits.
    """
    # (result key, pattern, regex flags, convert Bengali digits?)
    field_specs = (
        ("jl_no", r"জে\.এল\s+নং\s+([\d০-৯]+)", 0, True),
        # "Click Here" at the end of the plot line is optional.
        (
            "daag_no",
            r"^\s{2,15}([\d০-৯/]{2,6})\s{2,}\S+\s+[\d.]+(?:\s+Click\s*Here)?",
            re.MULTILINE,
            True,
        ),
        ("mouza", r"মৌজাঃ\s*(\S+)", 0, False),
        ("block", r"ব্লকঃ\s*([A-Z][A-Z0-9\-]*)", 0, False),
        ("district", r"জেলাঃ\s*([A-Z]+)", 0, False),
        (
            "total_land_acre",
            r"জিমর\s+মাট\s+পিরমাণ\(একর\)\s*[:\-]?\s*([\d০-৯.]+)",
            0,
            True,
        ),
    )

    header = {}

    for key, pattern, flags, is_numeric in field_specs:
        found = re.search(pattern, text, flags)
        if found is None:
            continue

        captured = found.group(1)
        header[key] = bn_to_en(captured) if is_numeric else captured.strip()

    return header

# ==============================
# TOTAL LAND IN ACRES
# ==============================

def extract_total_land_from_words(words):
    """Locate the total land area using word positions.

    Every word whose text contains one of the two label spellings checked
    below is treated as a label. For each label, in list order:

    1. Words on the same page, within 3 units of the label's ``ymin`` and
       starting to the right of the label's ``xmax`` are scanned left to
       right.
    2. Failing that, the next seven words in list order are scanned.

    The first scanned text that is a plain decimal (after converting Bengali
    digits) is returned. ``""`` is returned when no label yields a value.
    """
    decimal = re.compile(r"^\d+\.\d+$")
    labels = ("পিরমাণ", "পরিমাণ")

    def first_decimal(candidates):
        # Return the first candidate text that is a decimal, else None.
        for cand in candidates:
            number = bn_to_en(cand["text"])
            if decimal.match(number):
                return number
        return None

    for pos, anchor in enumerate(words):
        if not any(label in anchor["text"] for label in labels):
            continue

        # Same-line candidates, ordered by their left edge.
        right_side = sorted(
            (
                other for other in words
                if other["page"] == anchor["page"]
                and abs(other["ymin"] - anchor["ymin"]) < 3
                and other["xmin"] > anchor["xmax"]
            ),
            key=lambda other: other["xmin"],
        )

        found = first_decimal(right_side)
        if found is None:
            # Fallback: the words that follow the label in extraction order.
            found = first_decimal(words[pos + 1:pos + 8])

        if found is not None:
            return found

    return ""


# ==============================
# COLUMN MAPPING
# ==============================
# Logical column name -> substring searched for in the words of the table
# header row (see build_column_map). The xmin of the first header word that
# contains the substring becomes that column's starting x position.
# NOTE(review): the "area" anchor is the same substring that
# extract_total_land_from_words treats as the total-land label — confirm the
# two lookups cannot pick up each other's word.
COLUMN_ANCHORS = {
    "khatian": "খতিয়ান",
    "owner": "রায়তের",
    "father": "পিতা",
    "ansha": "অংশ",
    "area": "পরিমাণ",
    "dakhaldaar": "দখলদার",
    "remarks": "মন্তব্য",
}


def build_column_map(words: list[dict]) -> dict | None:
    header_y = next((w["ymin"] for w in words if "খতিয়ান" in w["text"]), None)

    if header_y is None:
        return None

    header_words = [w for w in words if abs(w["ymin"] - header_y) < 12]

    col_x = {}

    for col, keyword in COLUMN_ANCHORS.items():
        for hw in header_words:
            if keyword in hw["text"] and col not in col_x:
                col_x[col] = hw["xmin"]
                break

    if len(col_x) < 4:
        return None

    # ---- IMPORTANT CALIBRATION (unchanged) ----
    khatian_nums = [
        w for w in words
        if w["ymin"] > header_y + 10
        and KHATIAN_RE.match(w["text"])
        and w["xmin"] < 70
    ]

    father_x = col_x.get("father", 999)

    owner_content = [
        w for w in words
        if w["ymin"] > header_y + 10
        and w["xmin"] > 50
        and w["xmin"] < father_x
        and w["text"] not in SKIP_TEXTS
        and not re.fullmatch(r'[\d./,\-()]+', w["text"])
    ]

    if khatian_nums and owner_content:
        max_khatian_xmax = max(w["xmax"] for w in khatian_nums)
        min_owner_x = min(w["xmin"] for w in owner_content)
        col_x["owner"] = (max_khatian_xmax + min_owner_x) / 2

    # boundaries
    sorted_cols = sorted(col_x.items(), key=lambda x: x[1])
    col_map = {}

    for i, (name, x_start) in enumerate(sorted_cols):
        x_end = sorted_cols[i + 1][1] if i + 1 < len(sorted_cols) else 9999
        col_map[name] = (x_start, x_end)

    return col_map


def col_of(word: dict, col_map: dict) -> str | None:
    xc = (word["xmin"] + word["xmax"]) / 2

    for col, (x0, x1) in col_map.items():
        if x0 <= xc < x1:
            return col

    return None


# ==============================
# ROW GROUPING
# ==============================
def group_into_rows(words: list[dict], tol: float = 2.0) -> list[list[dict]]:
    """Cluster words into visual rows.

    Words are ordered by page, then ``ymin``, then ``xmin``. A word joins the
    current row when it is on the same page as the row's first word and its
    ``ymin`` differs from that first word's by at most *tol*; otherwise it
    starts a new row.

    Returns:
        The rows in reading order; ``[]`` for empty input.
    """
    if not words:
        return []

    ordered = sorted(
        words,
        key=lambda item: (item["page"], item["ymin"], item["xmin"]),
    )

    grouped = []

    for item in ordered:
        if grouped:
            # Always compare against the first word of the open row.
            lead = grouped[-1][0]
            same_page = item["page"] == lead["page"]
            if same_page and abs(item["ymin"] - lead["ymin"]) <= tol:
                grouped[-1].append(item)
                continue

        grouped.append([item])

    return grouped


# ==============================
# ✅ ORIGINAL KHATIAN LOGIC (IMPORTANT)
# ==============================
def extract_khatian_entries(words: list[dict], col_map: dict) -> list[dict]:
    """Build one entry per khatian data row of the table.

    Args:
        words: Word dicts with ``text``, ``xmin``, ``xmax``, ``ymin`` and
            ``page`` keys.
        col_map: ``{column: (x_start, x_end)}`` as used by ``col_of``.

    Returns:
        A list of dicts with the keys ``khatian_no``, ``owner_name_bn``,
        ``owner_name``, ``father_husband_name_bn``, ``father_husband_name``,
        ``ansha`` and ``area_acres``. The ``*_bn`` values are the raw joined
        words; their counterparts are passed through
        ``transliterate_bengali_name``.
    """
    # y of the first word containing the khatian header text; 0 if absent.
    header_y = next((w["ymin"] for w in words if "খতিয়ান" in w["text"]), 0)

    # Keep only words more than 10 units below the header, minus noise tokens.
    # NOTE(review): the y filter is applied to every page using the single
    # header_y found above — confirm this is intended for multi-page PDFs.
    data_words = [
        w for w in words
        if w["ymin"] > header_y + 10
        and w["text"] not in SKIP_TEXTS
        and not w["text"].startswith("Nil")
    ]

    rows = group_into_rows(data_words, tol=2.0)

    def words_in_col(row_words, col):
        """Return the words of *row_words* that fall in column *col*."""
        return [w for w in row_words if col_of(w, col_map) == col]

    def text_in_col(row_words, col):
        """Return the column's words joined left to right with spaces."""
        ws = sorted(words_in_col(row_words, col), key=lambda w: w["xmin"])
        return " ".join(w["text"] for w in ws).strip()

    # ---- FIND DATA ROWS ----
    # A data row has a khatian number (digits, optionally with slashes) in
    # the khatian column AND a decimal in the share or the area column.
    data_rows = []

    for i, row in enumerate(rows):
        khatian_raw = text_in_col(row, "khatian")
        khatian_no = bn_to_en(khatian_raw)

        if not re.fullmatch(r"\d[\d/]*", khatian_no):
            continue

        ansha = bn_to_en(text_in_col(row, "ansha"))
        area = bn_to_en(text_in_col(row, "area"))

        if not (DECIMAL_RE.match(ansha) or DECIMAL_RE.match(area)):
            continue

        # Non-decimal share/area text is stored as "". row_idx is recorded
        # but not read anywhere in this function.
        data_rows.append({
            "row_idx": i,
            "row": row,
            "khatian_no": khatian_no,
            "ansha": ansha if DECIMAL_RE.match(ansha) else "",
            "area": area if DECIMAL_RE.match(area) else "",
            "data_y": row[0]["ymin"],
            "data_page": row[0]["page"],
        })

    # ---- EXTRACT NAMES (CRITICAL WINDOW LOGIC) ----
    # The names of a data row are the words on the same page lying strictly
    # between the previous data row (or the header, for the first row) and
    # the data row itself.
    entries = []

    for idx, dr in enumerate(data_rows):
        # NOTE(review): prev_data_y comes from the previous data row even if
        # that row is on another page, so y values of two different pages may
        # be compared here — confirm behaviour on multi-page records.
        prev_data_y = data_rows[idx - 1]["data_y"] if idx > 0 else header_y
        data_page = dr["data_page"]
        data_y = dr["data_y"]

        window = [
            w for w in data_words
            if w["page"] == data_page and prev_data_y < w["ymin"] < data_y
        ]

        # OWNER: taken from the window only, in top-to-bottom, left-to-right
        # order.
        owner_words = sorted(words_in_col(window, "owner"),
                             key=lambda w: (w["ymin"], w["xmin"]))

        owner_bn = " ".join(w["text"] for w in owner_words).strip()

        # FATHER: window words plus any father-column words that sit on the
        # data row itself.
        father_words = sorted(
            words_in_col(window, "father") + words_in_col(dr["row"], "father"),
            key=lambda w: (w["ymin"], w["xmin"])
        )

        father_bn = " ".join(w["text"] for w in father_words).strip()

        entries.append({
            "khatian_no": dr["khatian_no"],
            "owner_name_bn": owner_bn,
            "owner_name": transliterate_bengali_name(owner_bn),
            "father_husband_name_bn": father_bn,
            "father_husband_name": transliterate_bengali_name(father_bn),
            "ansha": dr["ansha"],
            "area_acres": dr["area"],
        })

    return entries


# ==============================
# MAIN
# ==============================
def extract_land_record(pdf_path: str) -> dict:
    """Extract the header fields and khatian table of one land-record PDF.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        On success, a dict with ``source_file``, ``jl_no``, ``daag_no``,
        ``mouza``, ``block``, ``district``, ``total_land_acre``,
        ``total_entries`` and ``khatian_entries``; missing header fields are
        ``""``. If the column layout cannot be determined, a dict with
        ``source_file``, ``error`` set to ``"DLM failed"`` and whatever
        header fields were found.
    """
    words = get_words(pdf_path)
    layout = get_layout_text(pdf_path)
    header = extract_header(layout)
    col_map = build_column_map(words)
    source_file = Path(pdf_path).name

    if col_map is None:
        return {
            "source_file": source_file,
            "error": "DLM failed",
            **header
        }

    entries = extract_khatian_entries(words, col_map)

    # The position-based lookup is preferred; when it finds nothing, fall
    # back to the value parsed from the layout text instead of discarding it.
    total_land = (
        extract_total_land_from_words(words)
        or header.get("total_land_acre", "")
    )

    return {
        "source_file": source_file,
        "jl_no": header.get("jl_no", ""),
        "daag_no": header.get("daag_no", ""),
        "mouza": header.get("mouza", ""),
        "block": header.get("block", ""),
        "district": header.get("district", ""),
        "total_land_acre": total_land,
        "total_entries": len(entries),
        "khatian_entries": entries,
    }


# ==============================
# CLI
# ==============================
if __name__ == "__main__":
    # Command-line entry: print the extracted record of the first PDF given
    # on the command line as pretty-printed JSON (non-ASCII kept verbatim).
    cli_args = sys.argv[1:]

    if len(cli_args) == 0:
        print("Usage: python extract_land.py file.pdf")
        sys.exit(1)

    record = extract_land_record(cli_args[0])
    print(json.dumps(record, ensure_ascii=False, indent=2))


