o
    i|4                  	   @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ e	
ddZdd Zed	Zed
Zh dZde	de	fddZde	de	defddZde	de	fddZde	dee fddZde	de	fddZde	defddZdd Zd d!d"d#d$d%d&d'Zd(ee dedB fd)d*Zd+ed,ede	dB fd-d.Zd=d(ee d0edeee  fd1d2Zd(ee d,edee fd3d4Zde	defd5d6Z e!d7krej"d8d Z#e#se$d9 e%d8 e e#d Z&e$ej'e&d:d;d< dS dS )>u   
Banglarbhumi Land Record PDF Extractor — Dynamic Layout Mapping (DLM)
Original extraction preserved + Safe Bengali Transliteration
    N)Path)SequenceMatcheru   ০১২৩৪৫৬৭৮৯
0123456789c                 C   s
   |  tS )N)	translateBN_TO_EN)t r   */var/www/html/banglarbhumi/extract_land.pybn_to_en   s   
r
   
^\d+\.\d+$z^[\d/]+$>      ব্যাক্তিNilHereClickRemarks()/namereturnc                 C   s4   | sdS |    } tdd| } tdd| } | S )N z[^\w\s] z\s+)lowerstripresub)r   r   r   r	   normalize_name   s   r   name1name2c                 C   s:   t t| }t t|}|r|sdS ttd|| dS )z+
    Returns similarity score (0 to 1)
    g        N   )r   transliterate_bengali_nameroundr   ratio)r   r   n1n2r   r   r	   name_match_score%   s
   r%   c                 C   s   | sdS |   } tdd | D s|  S z#ddlm}m}m} || ||}|dddd}|  }| W S  tyC   |  Y S w )zg
    SAFE transliteration:
    - DOES NOT affect extraction
    - Only converts if Bengali present
    r   c                 s   s(    | ]}d |  kodkn  V  qdS )u   ঀu   ৿Nr   ).0chr   r   r	   	<genexpr>H   s   & z-transliterate_bengali_name.<locals>.<genexpr>r   )transliterateBENGALIITRANS~.)	r   anytitleindic_transliteration.sanscriptr)   r*   r+   replace	Exception)r   r)   r*   r+   resultr   r   r	   r    ;   s   
r    pdf_pathc                 C   s   t jdd| dgdddd}|jdkrtd|j g d}}td	|jD ]8}td
|D ]+}|	d
 }|rY||t|	dt|	dt|	dt|	d|d q.|d7 }q&|S )N	pdftotextz-bbox-Tutf-8capture_outputtextencodingr   zpdftotext -bbox failed: z<page\b[^>]*>zT<word xMin="([\d.]+)" yMin="([\d.]+)" xMax="([\d.]+)" yMax="([\d.]+)">([^<]*)</word>      r         )r:   xminyminxmaxymaxpage)
subprocessrun
returncodeRuntimeErrorstderrr   splitstdoutfinditergroupr   appendfloat)r4   r3   wordspage_numchunkmr:   r   r   r	   	get_words]   s2   



	rT   c                 C   s    t jdd| dgdddd}|jS )Nr5   z-layoutr6   Tr7   r8   )rE   rF   rK   )r4   rr   r   r	   get_layout_text   s
   
rV   r:   c                 C   s   i }t d| }|rt|d|d< t d| t j}|r&t|d|d< t d| }|r7|d |d< t d| }|rH|d |d	< t d
| }|rY|d |d< t d| }|rjt|d|d< |S )Nu(   জে\.এল\s+নং\s+([\d০-৯]+)r=   jl_nouB   ^\s{2,15}([\d০-৯/]{2,6})\s{2,}\S+\s+[\d.]+(?:\s+Click\s*Here)?daag_nou   মৌজাঃ\s*(\S+)mouzau$   ব্লকঃ\s*([A-Z][A-Z0-9\-]*)blocku   জেলাঃ\s*([A-Z]+)districtuU   জিমর\s+মাট\s+পিরমাণ\(একর\)\s*[:\-]?\s*([\d০-৯.]+)total_land_acre)r   searchr
   rM   	MULTILINEr   )r:   hrS   r   r   r	   extract_header   s6   r`   c                    s   t | D ]g\}d }d|v sd|v rkd d   fdd| D }t|dd	 d
}|D ]}t|d }td|rE|    S q1t|d t|d t| D ]}t| | d }td|rj|    S qTqdS )u&  
    Robust DLM-based extraction for:
    জিমর মাট পিরমাণ(একর)

    Strategy:
    1. Find keyword words containing 'পিরমাণ' or 'পরিমাণ'
    2. Look right-side / nearby words (same line priority)
    3. Pick first valid decimal number
    r:   u   পিরমাণ   পরিমাণrA   rD   c                    s@   g | ]}|d   krt |d  dk r|d d kr|qS )rD   rA   r>   r@   rB   abs)r&   x	base_pagebase_ywr   r	   
<listcomp>   s    z1extract_total_land_from_words.<locals>.<listcomp>c                 S      | d S Nr@   r   rd   r   r   r	   <lambda>       z/extract_total_land_from_words.<locals>.<lambda>keyr   r=      r   )	enumeratesortedr
   r   matchrangeminlen)rP   itxt	same_linesame_line_sortedrd   valjr   re   r	   extract_total_land_from_words   s*    r~      খতিয়ানu   রায়তেরu   পিতাu	   অংশra   u   দখলদারu   মন্তব্য)khatianownerfatheranshaarea
dakhaldaarremarksrP   c                    sV  t dd | D d d u rd S fdd| D }i }t D ]\}}|D ]}||d v r9||vr9|d ||<  nq%qt|dk rCd S fdd| D }|d	d
  fdd| D }|rz|rztdd |D }tdd |D }	||	 d |d< t| dd d}
i }t|
D ]\}\}}|d t|
k r|
|d  d nd}||f||< q|S )Nc                 s   $    | ]}d |d v r|d V  qdS r   r:   rA   Nr   r&   rh   r   r   r	   r(        " z#build_column_map.<locals>.<genexpr>c                    s$   g | ]}t |d    dk r|qS )rA      rb   r   header_yr   r	   ri     s   $ z$build_column_map.<locals>.<listcomp>r:   r@   r?   c                    s:   g | ]}|d   d krt |d r|d dk r|qS )rA   
   r:   r@   F   )
KHATIAN_RErt   r   r   r   r	   ri     s    r   i  c                    sT   g | ]&}|d  d kr(|d dkr(|d  k r(|d t vrtd|d s|qS )rA   r   r@   2   r:   z[\d./,\-()]+)
SKIP_TEXTSr   	fullmatchr   father_xr   r   r	   ri   (  s    c                 s       | ]}|d  V  qdS )rB   Nr   r   r   r   r	   r(   2      c                 s   r   )r@   Nr   r   r   r   r	   r(   3  r   r   r   c                 S   rj   )Nr=   r   rl   r   r   r	   rm   7  rn   z"build_column_map.<locals>.<lambda>ro   r=   i'  )	nextCOLUMN_ANCHORSitemsrw   getmaxrv   rs   rr   )rP   header_wordscol_xcolkeywordhwkhatian_numsowner_contentmax_khatian_xmaxmin_owner_xsorted_colscol_maprx   r   x_startx_endr   r   r	   build_column_map  s<   
	$r   wordr   c                 C   sL   | d | d  d }|  D ]\}\}}||  kr!|k r#|  S  qqd S )Nr@   rB   r   )r   )r   r   xcr   x0x1r   r   r	   col_ofA  s   r          @tolc                 C   s   | sg S t | dd d}g |d g}}|dd  D ](}|d |d d kr:t|d |d d  |kr:|| q|| |g}q|| |S )Nc                 S   s   | d | d | d fS )NrD   rA   r@   r   rh   r   r   r	   rm   R  s    z!group_into_rows.<locals>.<lambda>ro   r   r=   rD   rA   )rs   rc   rN   )rP   r   swrowscurrh   r   r   r	   group_into_rowsN  s   0

r   c                    s  t dd | D dfdd| D }t|dd} fdd	fd
d}g }t|D ]P\}}||d}t|}	td|	s?q+t||d}
t||d}t|
sXt|sXq+||||	t|
rd|
ndt|rl|nd|d d |d d d q+g }t|D ]q\}}|dkr||d  d n|d |d fdd|D }t	|ddd d}d
dd |D  }t	|d|d d d d d}d
d!d |D  }||d" |t||t||d |d d# q|S )$Nc                 s   r   r   r   r   r   r   r	   r(   e  r   z*extract_khatian_entries.<locals>.<genexpr>r   c                    s:   g | ]}|d   d kr|d t vr|d ds|qS )rA   r   r:   r   )r   
startswithr   r   r   r	   ri   g  s    z+extract_khatian_entries.<locals>.<listcomp>r   )r   c                    s    fdd| D S )Nc                    s   g | ]}t | kr|qS r   )r   r   )r   r   r   r	   ri   q  s    zAextract_khatian_entries.<locals>.words_in_col.<locals>.<listcomp>r   )	row_wordsr   )r   )r   r	   words_in_colp  s   z-extract_khatian_entries.<locals>.words_in_colc                    s.   t  | |dd d}ddd |D  S )Nc                 S   rj   rk   r   r   r   r   r	   rm   t  rn   z>extract_khatian_entries.<locals>.text_in_col.<locals>.<lambda>ro   r   c                 s   r   r:   Nr   r   r   r   r	   r(   u  r   z?extract_khatian_entries.<locals>.text_in_col.<locals>.<genexpr>)rs   joinr   )r   r   ws)r   r   r	   text_in_cols  s   z,extract_khatian_entries.<locals>.text_in_colr   z\d[\d/]*r   r   r   rA   rD   )row_idxrow
khatian_nor   r   data_y	data_pager=   r   r   c                    s8   g | ]}|d   kr|d   k rk rn n|qS )rD   rA   r   r   )r   r   prev_data_yr   r	   ri     s
    (r   c                 S      | d | d fS NrA   r@   r   r   r   r   r	   rm         z)extract_khatian_entries.<locals>.<lambda>ro   r   c                 s   r   r   r   r   r   r   r	   r(     r   r   r   c                 S   r   r   r   r   r   r   r	   rm     r   c                 s   r   r   r   r   r   r   r	   r(     r   r   )r   owner_name_bn
owner_namefather_husband_name_bnfather_husband_namer   
area_acres)r   r   rr   r
   r   r   
DECIMAL_RErt   rN   rs   r   r   r    )rP   r   
data_wordsr   r   	data_rowsrx   r   khatian_rawr   r   r   entriesidxdrwindowowner_wordsowner_bnfather_words	father_bnr   )r   r   r   r   r   r   r	   extract_khatian_entriesd  sj   







r   c              
   C   s   t | }t| }t|}t|}t|}|d u r"t| jdd|S t||}t| j|dd|dd|dd|dd|dd|t	||d		S )
Nz
DLM failed)source_fileerrorrW   r   rX   rY   rZ   r[   )	r   rW   rX   rY   rZ   r[   r\   total_entrieskhatian_entries)
rT   rV   r`   r   r~   r   r   r   r   rw   )r4   rP   layoutheaderr   
total_landr   r   r   r	   extract_land_record  s,   





r   __main__r=   z&Usage: python extract_land.py file.pdfFr   )ensure_asciiindent)r   )(__doc__r   jsonsysrE   pathlibr   difflibr   str	maketransr   r
   compiler   r   r   r   rO   r%   r    listdictrT   rV   r`   r~   r   r   r   r   r   r   __name__argvtargetsprintexitr3   dumpsr   r   r   r	   <module>   sJ     

"$".06$X!
