import easyocr
import re
import cv2

# Gender words that OCR frequently places right after a "Name" label; they
# must never be reported as the holder's name.
_GENDER_WORDS = frozenset({"male", "female", "transgender"})

# Value that follows a label: any run of non-letters (":", digits, line
# breaks) and then the name itself. The captured name is confined to ONE
# line -- only spaces/tabs may separate its words -- so text from the
# following OCR lines is never glued onto it.
_NAME_VALUE = r"[^A-Za-z]*([A-Za-z]+(?:[ \t]+[A-Za-z]+)*)"

# Voter ID: "Elector's Name : <English Name>"
_ELECTOR_NAME_RE = re.compile(r"Elector'?s\s+Name" + _NAME_VALUE, re.IGNORECASE)

# Aadhaar: "Name : <English Name>". The look-arounds stop "name" from
# matching inside a longer word.
_GENERIC_NAME_RE = re.compile(
    r"(?<![A-Za-z])Name(?![A-Za-z])" + _NAME_VALUE, re.IGNORECASE
)

# A line consisting solely of English letters and whitespace.
_ENGLISH_LINE_RE = re.compile(r"[A-Za-z\s]+")


def _select_name(lines):
    """Pick the most likely holder name from OCR text lines, or None."""
    text = "\n".join(lines)

    # The specific label is tried first: the generic pattern also matches
    # the "Name" inside "Elector's Name" (and "Father's Name"), so running
    # it first would make the elector pattern unreachable.
    for pattern in (_ELECTOR_NAME_RE, _GENERIC_NAME_RE):
        for match in pattern.finditer(text):
            candidate = match.group(1).strip()
            # A label followed by a gender word is skipped and the next
            # occurrence of the label is tried instead.
            if candidate.lower() not in _GENDER_WORDS:
                return candidate

    # Fallback: first English-only line with at least two words (a single
    # gender word can therefore never be selected here).
    for line in lines:
        line = line.strip()
        if _ENGLISH_LINE_RE.fullmatch(line) and len(line.split()) >= 2:
            return line
    return None


def extract_name(image_path, reader=None):
    """Extract the card holder's English name from an Aadhaar or Voter ID image.

    The image is run through OCR and the recognised text is searched, in
    this order:

    1. the value following an "Elector's Name" label (Voter ID),
    2. the value following a generic "Name" label (Aadhaar),
    3. the first line made only of English letters with at least two words.

    Gender words ("male", "female", "transgender") are never returned as a
    label value.

    Args:
        image_path: Image to scan; passed unchanged to ``reader.readtext``.
        reader: Optional object exposing ``readtext(image, detail=0)``,
            normally an ``easyocr.Reader``. When omitted, a new English-only
            reader is created for this call. Constructing a reader loads the
            OCR models, so callers scanning many images should build one
            reader and pass it in.

    Returns:
        The detected name as a string, or ``None`` when nothing suitable
        was found.
    """
    if reader is None:
        # Initialize EasyOCR reader for English only
        reader = easyocr.Reader(['en'])

    # detail=0 yields the recognised strings only (no boxes / confidences).
    results = reader.readtext(image_path, detail=0)
    return _select_name(results)

if __name__ == "__main__":
    import sys

    # Image to scan (Aadhaar or Voter card): the first command-line argument
    # when one is given, otherwise the default "id_card.jpeg".
    file_path = sys.argv[1] if len(sys.argv) > 1 else "id_card.jpeg"
    name = extract_name(file_path)
    if name:
        print("Extracted English Name:", name)
    else:
        print("Name could not be detected.")



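########################################################
# Disabled alternative implementation (pytesseract OCR with OpenCV
# preprocessing and PDF input). Everything below is commented out and
# does not run.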
# import cv2
# import pytesseract
# import re
# import os
# from pdf2image import convert_from_path

# # Optional: set Tesseract path for Windows
# # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# def convert_pdf_to_image(pdf_path):
#     pages = convert_from_path(pdf_path, dpi=300)
#     image_path = pdf_path.replace(".pdf", ".jpg")
#     pages[0].save(image_path, "JPEG")
#     return image_path

# def preprocess_image(image_path):
#     img = cv2.imread(image_path)
#     if img is None:
#         raise FileNotFoundError(f"Image not found or cannot be opened: {image_path}")
    
#     # Upscale for better OCR
#     img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    
#     # Convert to grayscale
#     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
#     # Adaptive thresholding
#     thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
#                                    cv2.THRESH_BINARY, 31, 10)
    
#     return thresh

# def extract_name(text):
#     # Robust regex for "Elector's Name"
#     match = re.search(r"Elector'?s Name[^A-Za-z]*([A-Za-z\s]+)", text, re.IGNORECASE)
#     if match:
#         return match.group(1).strip()
    
#     # Fallback: generic "Name"
#     match = re.search(r"Name[^A-Za-z]*([A-Za-z\s]+)", text, re.IGNORECASE)
#     if match:
#         return match.group(1).strip()
    
#     # Fallback: first English-only line
#     for line in text.splitlines():
#         line = line.strip()
#         if re.match(r"^[A-Za-z\s]+$", line) and len(line.split()) >= 2:
#             return line
#     return None

# def get_english_name(file_path):
#     if file_path.lower().endswith(".pdf"):
#         file_path = convert_pdf_to_image(file_path)
    
#     processed_img = preprocess_image(file_path)
#     text = pytesseract.image_to_string(processed_img, lang="eng")
    
#     name = extract_name(text)
#     if not name:
#         print("OCR Text:\n", text)  # Debug print
#     return name

# if __name__ == "__main__":
#     file_path = "voter.jpeg"  # or .pdf
#     name = get_english_name(file_path)
#     if name:
#         print("Extracted English Name:", name)
#     else:
#         print("Name could not be detected.")