import camelot
import pdfplumber
import fitz  # PyMuPDF
import json
import os

# Input PDF and output directory. Both can be overridden through environment
# variables so the script can process another file without being edited; when
# a variable is unset or empty the original hard-coded value is used.
PDF_PATH = (
    os.environ.get("LAND_OCR_PDF_PATH")
    or "/var/www/html/land-ocr/input_pdfs/1764746873181-519NetureeSAINTHIA.pdf"
)
OUTPUT_DIR = os.environ.get("LAND_OCR_OUTPUT_DIR") or "output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Accumulates one dict per extracted table, across all pages.
all_tables_json = []

# ---------- Open PDF for marking ----------
# This PyMuPDF handle is used to draw table outlines onto the pages.
doc = fitz.open(PDF_PATH)

# ---------- Loop through pages ----------
# For every page: try Camelot's "lattice" flavor first, outline each table it
# finds on the PyMuPDF page, and fall back to pdfplumber when Camelot yields
# nothing usable (or fails). Results are appended to all_tables_json.

# The pdfplumber handle is opened lazily on the first page that needs the
# fallback and then reused, instead of re-opening the PDF for every page.
plumber_pdf = None

try:
    for page_number in range(len(doc)):
        page_index = page_number + 1  # 1-based page number for Camelot/output
        print(f"Processing page {page_index}")

        # ---------- Try Camelot (best for ruled tables) ----------
        # Tables are staged as (json_dict, bbox) pairs and only committed if
        # the whole Camelot pass succeeds, so a mid-pass failure cleanly
        # falls through to pdfplumber instead of leaving partial results.
        camelot_tables = []
        try:
            tables = camelot.read_pdf(
                PDF_PATH,
                pages=str(page_index),
                flavor="lattice",
            )

            for t_index, table in enumerate(tables):
                df = table.df
                if df.empty:
                    # No header row to read; skip rather than raise.
                    continue

                camelot_tables.append(
                    (
                        {
                            "page": page_index,
                            "table_index": t_index,
                            "headers": df.iloc[0].tolist(),
                            "rows": df.iloc[1:].values.tolist(),
                        },
                        table._bbox,
                    )
                )
        except Exception as e:
            # Best effort: any Camelot problem just triggers the fallback.
            print("Camelot failed:", e)
            camelot_tables = []

        if camelot_tables:
            page = doc[page_number]
            # Camelot reports the bbox in PDF space (origin bottom-left, y
            # growing upwards) while PyMuPDF draws in a top-left-origin
            # space. This matrix converts the former into the latter.
            to_mupdf = page.transformation_matrix

            for table_json, (x1, y1, x2, y2) in camelot_tables:
                all_tables_json.append(table_json)

                # ---- Draw bounding box ----
                try:
                    corner_a = fitz.Point(x1, y1) * to_mupdf
                    corner_b = fitz.Point(x2, y2) * to_mupdf
                    rect = fitz.Rect(corner_a, corner_b)
                    # The y-flip swaps top and bottom; restore x0<=x1, y0<=y1.
                    rect.normalize()
                    page.draw_rect(rect, color=(1, 0, 0), width=2)
                except Exception as e:
                    # Marking is cosmetic; never lose extracted data over it.
                    print("Could not mark table:", e)
        else:
            # ---------- Fallback: pdfplumber ----------
            if plumber_pdf is None:
                plumber_pdf = pdfplumber.open(PDF_PATH)

            fallback_tables = plumber_pdf.pages[page_number].extract_tables()
            for t_index, table in enumerate(fallback_tables):
                if not table:
                    # Guard against an empty table (no header row).
                    continue

                all_tables_json.append(
                    {
                        "page": page_index,
                        "table_index": t_index,
                        "headers": table[0],
                        "rows": table[1:],
                    }
                )
finally:
    if plumber_pdf is not None:
        plumber_pdf.close()

# ---------- Save JSON / marked PDF ----------
# Output locations inside OUTPUT_DIR.
json_path = os.path.join(OUTPUT_DIR, "tables.json")
marked_pdf_path = os.path.join(OUTPUT_DIR, "tables_marked.pdf")

try:
    # ensure_ascii=False keeps non-ASCII cell text readable in the JSON file.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(all_tables_json, f, ensure_ascii=False, indent=2)

    # ---------- Save marked PDF ----------
    doc.save(marked_pdf_path)
finally:
    # Release the PyMuPDF document even if writing either output fails.
    doc.close()

print("✅ Done")
print("JSON:", json_path)
print("Marked PDF:", marked_pdf_path)
