
    i|4              	          d Z ddlZddlZddlZddlZddlmZ ddlmZ e	
                    dd          Zd Z ej        d          Z ej        d	          Zh d
Zde	de	fdZde	de	defdZde	de	fdZde	dee         fdZde	de	fdZde	defdZd ZddddddddZd ee         dedz  fd!Zd"ed#ede	dz  fd$Zd0d ee         d&edeee                  fd'Zd ee         d#edee         fd(Zde	defd)Z e!d*k    r[ej"        d+d         Z#e#s e$d,            ej%        d+            e e#d                   Z& e$ ej'        e&d-d./                     dS dS )1u   
Banglarbhumi Land Record PDF Extractor — Dynamic Layout Mapping (DLM)
Original extraction preserved + Safe Bengali Transliteration
    N)Path)SequenceMatcheru   ০১২৩৪৫৬৭৮৯
0123456789c                 6    |                      t                    S )N)	translateBN_TO_EN)ts    */var/www/html/banglarbhumi/extract_land.pybn_to_enr      s    ;;x       
^\d+\.\d+$z^[\d/]+$>      ব্যাক্তি/NilHereClickRemarks()namereturnc                     | sdS |                                                                  } t          j        dd|           } t          j        dd|           } | S )N z[^\w\s] z\s+)lowerstripresub)r   s    r
   normalize_namer      sS     r::<<D6*c4((D6&#t$$DKr   name1name2c                     t          t          |                     }t          t          |                    }|r|sdS t          t          d||                                          d          S )z+
    Returns similarity score (0 to 1)
    g        N   )r   transliterate_bengali_nameroundr   ratio)r    r!   n1n2s       r
   name_match_scorer)   %   sl     
2599	:	:B	2599	:	:B R sr2..4466:::r   c                    | sdS |                                  } t          d | D                       s|                                 S 	 ddlm}m}m}  || ||          }|                    dd                              dd          }|                                 }|                                S # t          $ r | cY S w xY w)zg
    SAFE transliteration:
    - DOES NOT affect extraction
    - Only converts if Bengali present
    r   c              3   6   K   | ]}d |cxk    odk    nc V  dS )u   ঀu   ৿N ).0chs     r
   	<genexpr>z-transliterate_bengali_name.<locals>.<genexpr>H   s>      99bx2))))))))999999r   r   )transliterateBENGALIITRANS~.)	r   anytitleindic_transliteration.sanscriptr0   r1   r2   replace	Exception)r   r0   r1   r2   results        r
   r$   r$   ;   s      r::<<D 99D99999 zz||RRRRRRRRRRtWf55 R((00b99||~~   s   A(B0 0B?>B?pdf_pathc                    t          j        dd| dgddd          }|j        dk    rt          d|j                   g d}}t          j        d	|j                  D ]}t          j        d
|          D ]}|	                    d          
                                }|r|                    |t          |	                    d                    t          |	                    d                    t          |	                    d                    t          |	                    d                    |d           |dz  }|S )N	pdftotextz-bbox-Tutf-8capture_outputtextencodingr   zpdftotext -bbox failed: z<page\b[^>]*>zT<word xMin="([\d.]+)" yMin="([\d.]+)" xMax="([\d.]+)" yMax="([\d.]+)">([^<]*)</word>      r#         )rB   xminyminxmaxymaxpage)
subprocessrun
returncodeRuntimeErrorstderrr   splitstdoutfinditergroupr   appendfloat)r;   r:   wordspage_numchunkmrB   s          r
   	get_wordsr\   ]   sN   ^	gx-$  F
 AEfmEEFFF!8E*FM::  c
 
 	 	A 771::##%%D  !!''!**--!!''!**--!!''!**--!!''!**--$     	ALr   c                 H    t          j        dd| dgddd          }|j        S )Nr=   z-layoutr>   Tr?   r@   )rM   rN   rS   )r;   rs     r
   get_layout_textr_      s4    	i3/$	 	 	A 8Or   rB   c                    i }t          j        d|           }|r%t          |                    d                    |d<   t          j        d| t           j                  }|r%t          |                    d                    |d<   t          j        d|           }|r*|                    d                                          |d<   t          j        d|           }|r*|                    d                                          |d	<   t          j        d
|           }|r*|                    d                                          |d<   t          j        d|           }|r%t          |                    d                    |d<   |S )Nu(   জে\.এল\s+নং\s+([\d০-৯]+)rE   jl_nouB   ^\s{2,15}([\d০-৯/]{2,6})\s{2,}\S+\s+[\d.]+(?:\s+Click\s*Here)?daag_nou   মৌজাঃ\s*(\S+)mouzau$   ব্লকঃ\s*([A-Z][A-Z0-9\-]*)blocku   জেলাঃ\s*([A-Z]+)districtuU   জিমর\s+মাট\s+পিরমাণ\(একর\)\s*[:\-]?\s*([\d০-৯.]+)total_land_acre)r   searchr   rU   	MULTILINEr   )rB   hr[   s      r
   extract_headerrj      sw   
A 		=tDDA *aggajj))'
 		M
	 	A
 	 ,

++) 		,d33A (WWQZZ%%'''
 		94@@A (WWQZZ%%'''
 		/66A +

((*** 		`	 	A 	 4'

33
Hr   c           
        	
 t          |           D ]\  }

d         }d|v sd|v rщ
d         	
d         	
fd| D             }t          |d           }|D ]2}t          |d                   }t          j        d	|          r|c c S 3t          |d
z   t          |dz   t          |                               D ]8}t          | |         d                   }t          j        d	|          r|c c S 9dS )u&  
    Robust DLM-based extraction for:
    জিমর মাট পিরমাণ(একর)

    Strategy:
    1. Find keyword words containing 'পিরমাণ' or 'পরিমাণ'
    2. Look right-side / nearby words (same line priority)
    3. Pick first valid decimal number
    rB   u   পিরমাণ   পরিমাণrI   rL   c                     g | ]>}|d          k    r0t          |d         z
            dk     r|d         d         k    <|?S )rL   rI   rF   rH   rJ   abs)r-   x	base_pagebase_yws     r
   
<listcomp>z1extract_total_land_from_words.<locals>.<listcomp>   sc       V9	))&	F*++a//fI&	))  *))r   c                     | d         S NrH   r,   rp   s    r
   <lambda>z/extract_total_land_from_words.<locals>.<lambda>   s
    qy r   keyr   rE      r   )	enumeratesortedr   r   matchrangeminlen)rX   itxt	same_linesame_line_sortedrp   valjrq   rr   rs   s           @@@r
   extract_total_land_from_wordsr      s]    %    1i  3&&*>#*E*EvYF&	I        I  &i5H5HIII%  qy))8M3// JJJJJ 1q5#a!eSZZ"8"899  uQx/008M3// JJJJJ 2r      খতিয়ানu   রায়তেরu   পিতাu	   অংশrl   u   দখলদারu   মন্তব্য)khatianownerfatheranshaarea
dakhaldaarremarksrX   c                    t          d | D             d           d S fd| D             }i }t                                          D ]%\  }}|D ]}||d         v r||vr|d         ||<    n&t          |          dk     rd S fd| D             }|                    dd          fd	| D             }|r?|r=t          d
 |D                       }t          d |D                       }	||	z   dz  |d<   t          |                                d           }
i }t          |
          D ]8\  }\  }}|dz   t          |
          k     r|
|dz            d         nd}||f||<   9|S )Nc              3   :   K   | ]}d |d         v |d         V  dS r   rB   rI   Nr,   r-   rs   s     r
   r/   z#build_column_map.<locals>.<genexpr>  5      VV11HAfI1U1UQvY1U1U1U1UVVr   c                 L    g | ] }t          |d          z
            dk     |!S )rI      rn   r-   rs   header_ys     r
   rt   z$build_column_map.<locals>.<listcomp>  s4    GGG!AfI,@(A(AB(F(FA(F(F(Fr   rB   rH   rG   c                     g | ]?}|d          dz   k    r.t                               |d                   r|d         dk     =|@S )rI   
   rB   rH   F   )
KHATIAN_REr~   r   s     r
   rt   z$build_column_map.<locals>.<listcomp>  s]       V9x"}$$QvY'' %fINN 	
 NNr   r   i  c                     g | ]U}|d          dz   k    rD|d         dk    r8|d         k     r,|d         t           v8t          j        d|d                   S|VS )rI   r   rH   2   rB   z[\d./,\-()]+)
SKIP_TEXTSr   	fullmatch)r-   rs   father_xr   s     r
   rt   z$build_column_map.<locals>.<listcomp>(  s{       V9x"}$$fINNfI  fIZ''_ai88 (	 	
 (''r   c              3   &   K   | ]}|d          V  dS )rJ   Nr,   r   s     r
   r/   z#build_column_map.<locals>.<genexpr>2  s&      ??Qqy??????r   c              3   &   K   | ]}|d          V  dS )rH   Nr,   r   s     r
   r/   z#build_column_map.<locals>.<genexpr>3  s&      ;;!F);;;;;;r   r#   r   c                     | d         S )NrE   r,   rw   s    r
   rx   z"build_column_map.<locals>.<lambda>7  s
    ad r   ry   rE   i'  )	nextCOLUMN_ANCHORSitemsr   getmaxr   r}   r|   )rX   header_wordscol_xcolkeywordhwkhatian_numsowner_contentmax_khatian_xmaxmin_owner_xsorted_colscol_mapr   r   x_startx_endr   r   s                   @@r
   build_column_mapr     s   VVVVVX\]]HtGGGGuGGGLE&,,..  W 	 	B"V*$$E)9)9Zc

5zzA~~t     L yy3''H      M  > >??,?????;;];;;;;*[8A=g NN;;;KG'44 ) )?D')*Q[1A1A)A)AAE"1%%t %(Nr   wordr   c                     | d         | d         z   dz  }|                                 D ]\  }\  }}||cxk    r|k     rn |c S d S )NrH   rJ   r#   )r   )r   r   xcr   x0x1s         r
   col_ofr   A  sf    
v,f
%	*B   Xb"====b=====JJJ  4r          @tolc                 h   | sg S t          | d           }g |d         g}}|dd          D ]p}|d         |d         d         k    r>t          |d         |d         d         z
            |k    r|                    |           X|                    |           |g}q|                    |           |S )Nc                 0    | d         | d         | d         fS )NrL   rI   rH   r,   rs   s    r
   rx   z!group_into_rows.<locals>.<lambda>R  s    ai6AfI%F r   ry   r   rE   rL   rI   )r}   ro   rV   )rX   r   swrowscurrs   s         r
   group_into_rowsr   N  s     		FF	G	G	GBRUG#DV  V9Av&&3qy3q6&>/I+J+Jc+Q+QJJqMMMMKK#CCKKKr   c                 d   t          d | D             d          fd| D             }t          |d          }fdfd}g }t          |          D ]\  }} ||d          }t          |          }	t	          j        d	|	          s7t           ||d
                    }
t           ||d                    }t                              |
          st                              |          s|                    |||	t                              |
          r|
ndt                              |          r|nd|d         d         |d         d         d           g }t          |          D ]5\  }}|dk    r||dz
           d         n|d         |d         fd|D             }t           |d          d           }d
                    d |D                                                       }t           |d           |d         d          z   d           }d
                    d |D                                                       }|                    |d         |t          |          |t          |          |d
         |d         d           7|S )Nc              3   :   K   | ]}d |d         v |d         V  dS r   r,   r   s     r
   r/   z*extract_khatian_entries.<locals>.<genexpr>e  r   r   r   c                     g | ]=}|d          dz   k    r,|d         t           v |d                             d          ;|>S )rI   r   rB   r   )r   
startswithr   s     r
   rt   z+extract_khatian_entries.<locals>.<listcomp>g  s_       V9x"}$$fIZ''&	$$U++ ( 	
'''r   r   )r   c                 $    fd| D             S )Nc                 <    g | ]}t          |          k    |S r,   )r   )r-   rs   r   r   s     r
   rt   zAextract_khatian_entries.<locals>.words_in_col.<locals>.<listcomp>q  s-    BBBaq'(:(:c(A(A(A(A(Ar   r,   )	row_wordsr   r   s    `r
   words_in_colz-extract_khatian_entries.<locals>.words_in_colp  s!    BBBBB9BBBBr   c                     t           | |          d           }d                    d |D                                                       S )Nc                     | d         S rv   r,   r   s    r
   rx   z>extract_khatian_entries.<locals>.text_in_col.<locals>.<lambda>t  s
    &	 r   ry   r   c              3   &   K   | ]}|d          V  dS rB   Nr,   r   s     r
   r/   z?extract_khatian_entries.<locals>.text_in_col.<locals>.<genexpr>u  s&      ..a&	......r   )r}   joinr   )r   r   wsr   s      r
   text_in_colz,extract_khatian_entries.<locals>.text_in_cols  sR    LLC006I6IJJJxx..2.....44666r   r   z\d[\d/]*r   r   r   rI   rL   )row_idxrow
khatian_nor   r   data_y	data_pagerE   r   r   c                 X    g | ]&}|d          k    |d         cxk     rk     !n n|'S )rL   rI   r,   )r-   rs   r   r   prev_data_ys     r
   rt   z+extract_khatian_entries.<locals>.<listcomp>  sW     
 
 
yI%%+&	*J*J*J*JF*J*J*J*J*J *J*J*Jr   r   c                 "    | d         | d         fS NrI   rH   r,   r   s    r
   rx   z)extract_khatian_entries.<locals>.<lambda>  s    AfIqy+A r   ry   r   c              3   &   K   | ]}|d          V  dS r   r,   r   s     r
   r/   z*extract_khatian_entries.<locals>.<genexpr>  s&      ;;!AfI;;;;;;r   r   r   c                 "    | d         | d         fS r   r,   r   s    r
   rx   z)extract_khatian_entries.<locals>.<lambda>  s    1V9ai0 r   c              3   &   K   | ]}|d          V  dS r   r,   r   s     r
   r/   z*extract_khatian_entries.<locals>.<genexpr>  s&      ==1QvY======r   r   )r   owner_name_bn
owner_namefather_husband_name_bnfather_husband_namer   
area_acres)r   r   r|   r   r   r   
DECIMAL_REr~   rV   r}   r   r   r$   )rX   r   
data_wordsr   r   	data_rowsr   r   khatian_rawr   r   r   entriesidxdrwindowowner_wordsowner_bnfather_words	father_bnr   r   r   r   r   s    `                  @@@@@r
   extract_khatian_entriesr   d  sv   VVVVVXYZZH     J :3///DC C C C C7 7 7 7 7
 ID//  3!k#y11k**
|K44 	S'2233C0011  '' 	:+;+;D+A+A 	$(..u55=UU2&,,T22:DD!fVnQ
 
 	 	 	 	 GY''    R69Aggia(228{O	H
 
 
 
 
 
!
 
 
 \\&'::!A!AC C C 88;;{;;;;;AACC L**\\"U)X-N-NN00
 
 

 HH=======CCEE	\*%4X>>&/#=i#H#H[V*
 
 	 	 	 	 Nr   c           
         t          |           }t          |           }t          |          }t          |          }t	          |          }|t          |           j        dd|S t          ||          }t          |           j        |                    dd          |                    dd          |                    dd          |                    dd          |                    dd          |t          |          |d		S )
Nz
DLM failed)source_fileerrorra   r   rb   rc   rd   re   )	r   ra   rb   rc   rd   re   rf   total_entrieskhatian_entries)
r\   r_   rj   r   r   r   r   r   r   r   )r;   rX   layoutheaderr   
total_landr   s          r
   extract_land_recordr     s   hEX&&FF##Fu%%G.u55J >>.!
 
 
 	
 &eW55G  >>.ZZ,,zz)R00ZZ,,ZZ,,

:r22) \\&
 
 
r   __main__rE   z&Usage: python extract_land.py file.pdfFr#   )ensure_asciiindent)r   )(__doc__r   jsonsysrM   pathlibr   difflibr   str	maketransr   r   compiler   r   r   r   rW   r)   r$   listdictr\   r_   rj   r   r   r   r   r   r   r   __name__argvtargetsprintexitr:   dumpsr,   r   r
   <module>r     s?   
 !                                     # # # # # # ==9<HH! ! ! bj''
bj%%
\\\

     ;C ; ; ; ; ; ;,S S    D T
    Hc c    D( ( ( ( ( (\) ) )` '$ && 3DJ 34$; 3 3 3 3l  t     4: E Dd<L    ,R4: R Rd R R R Rp# $    B zhqrrlG 6777  ,,F	E*$*V%
:
:
:;;;;; r   