
    i|4              	          S r SSKrSSKrSSKrSSKrSSKJr  SSKJr  \	R                  SS5      rS r\R                  " S5      r\R                  " S	5      r1 S
krS\	S\	4S jrS\	S\	S\4S jrS\	S\	4S jrS\	S\\   4S jrS\	S\	4S jrS\	S\4S jrS rSSSSSSSS.rS \\   S\S-  4S! jrS"\S#\S\	S-  4S$ jrS/S \\   S%\S\\\      4S& jjrS \\   S#\S\\   4S' jrS\	S\4S( jr \!S):X  aT  \RD                  S*S r#\#(       d  \$" S+5        \RJ                  " S*5        \ " \#S   5      r&\$" \RN                  " \&S,S-S.95        gg)0u   
Banglarbhumi Land Record PDF Extractor — Dynamic Layout Mapping (DLM)
Original extraction preserved + Safe Bengali Transliteration
    N)Path)SequenceMatcheru   ০১২৩৪৫৬৭৮৯
0123456789c                 ,    U R                  [        5      $ N)	translateBN_TO_EN)ts    */var/www/html/banglarbhumi/extract_land.pybn_to_enr      s    ;;x      
^\d+\.\d+$z^[\d/]+$>   ()/NilHereClickRemarks   ব্যাক্তিnamereturnc                     U (       d  gU R                  5       R                  5       n [        R                  " SSU 5      n [        R                  " SSU 5      n U $ )N z[^\w\s] z\s+)lowerstripresub)r   s    r   normalize_namer       sE    ::<D66*c4(D66&#t$DKr   name1name2c                     [        [        U 5      5      n[        [        U5      5      nU(       a  U(       d  g[        [        SX#5      R	                  5       S5      $ )z#
Returns similarity score (0 to 1)
g        N   )r    transliterate_bengali_nameroundr   ratio)r!   r"   n1n2s       r   name_match_scorer*   %   sI     
259	:B	259	:BRr.446::r   c                 P   U (       d  gU R                  5       n [        S U  5       5      (       d  U R                  5       $  SSKJnJnJn  U" XU5      nUR                  SS5      R                  SS5      nUR                  5       nUR                  5       $ ! [         a    U s $ f = f)zW
SAFE transliteration:
- DOES NOT affect extraction
- Only converts if Bengali present
r   c              3   L   #    U  H  nS Us=:*  =(       a    S:*  Os  v   M     g7f)u   ঀu   ৿N ).0chs     r   	<genexpr>-transliterate_bengali_name.<locals>.<genexpr>H   s      9Dbx2))))Ds   "$r   )transliterateBENGALIITRANS~.)	r   anytitleindic_transliteration.sanscriptr2   r3   r4   replace	Exception)r   r2   r3   r4   results        r   r%   r%   ;   s     ::<D 9D999zz|RRtf5 R(00b9||~ s   AB B%$B%pdf_pathc                 h   [         R                  " SSU S/SSSS9nUR                  S:w  a  [        SUR                   35      e/ Sp2[
        R                  " S	UR                  5       H  n[
        R                  " S
U5       H  nUR                  S5      R                  5       nU(       d  M+  UR                  U[        UR                  S5      5      [        UR                  S5      5      [        UR                  S5      5      [        UR                  S5      5      US.5        M     US-  nM     U$ )N	pdftotextz-bbox-Tutf-8capture_outputtextencodingr   zpdftotext -bbox failed: z<page\b[^>]*>zT<word xMin="([\d.]+)" yMin="([\d.]+)" xMax="([\d.]+)" yMax="([\d.]+)">([^<]*)</word>      r$         )rD   xminyminxmaxymaxpage)
subprocessrun
returncodeRuntimeErrorstderrr   splitstdoutfinditergroupr   appendfloat)r=   r<   wordspage_numchunkmrD   s          r   	get_wordsr^   ]   s
   ^^	gx-$F
 A5fmm_EFF!8*FMM:c
A 771:##%Dt !!''!*-!!''!*-!!''!*-!!''!*-$ 
  	A# ;& Lr   c                 P    [         R                  " SSU S/SSSS9nUR                  $ )Nr?   z-layoutr@   TrA   rB   )rO   rP   rU   )r=   rs     r   get_layout_textra      s.    	i3/$	A 88Or   rD   c                 
   0 n[         R                  " SU 5      nU(       a  [        UR                  S5      5      US'   [         R                  " SU [         R                  5      nU(       a  [        UR                  S5      5      US'   [         R                  " SU 5      nU(       a"  UR                  S5      R                  5       US'   [         R                  " SU 5      nU(       a"  UR                  S5      R                  5       US	'   [         R                  " S
U 5      nU(       a"  UR                  S5      R                  5       US'   [         R                  " SU 5      nU(       a  [        UR                  S5      5      US'   U$ )Nu(   জে\.এল\s+নং\s+([\d০-৯]+)rG   jl_nouB   ^\s{2,15}([\d০-৯/]{2,6})\s{2,}\S+\s+[\d.]+(?:\s+Click\s*Here)?daag_nou   মৌজাঃ\s*(\S+)mouzau$   ব্লকঃ\s*([A-Z][A-Z0-9\-]*)blocku   জেলাঃ\s*([A-Z]+)districtuU   জিমর\s+মাট\s+পিরমাণ\(একর\)\s*[:\-]?\s*([\d০-৯.]+)total_land_acre)r   searchr   rW   	MULTILINEr   )rD   hr]   s      r   extract_headerrl      s3   
A 			=tDAaggaj)'
 			M
	A
 	
+) 			,d3AWWQZ%%''
 			94@AWWQZ%%''
 			/6A
((** 			`	A 	'
3
Hr   c           
      <   [        U 5       GH  u  pUS   nSU;   d  SU;   d  M  US   nUS   nU  Vs/ s H5  nUS   U:X  d  M  [        US   U-
  5      S:  d  M%  US   US   :  d  M3  UPM7     nn[        US	 S
9nU H3  n[        US   5      n	[        R
                  " SU	5      (       d  M/  U	s  s  $    [        US-   [        US-   [        U 5      5      5       H5  n
[        X
   S   5      n	[        R
                  " SU	5      (       d  M1  U	s  s  $    GM
     gs  snf )u
  
Robust DLM-based extraction for:
জিমর মাট পিরমাণ(একর)

Strategy:
1. Find keyword words containing 'পিরমাণ' or 'পরিমাণ'
2. Look right-side / nearby words (same line priority)
3. Pick first valid decimal number
rD   u   পিরমাণ   পরিমাণrK   rN   rH   rJ   rL   c                     U S   $ NrJ   r-   xs    r   <lambda>/extract_total_land_from_words.<locals>.<lambda>   s    qyr   keyr   rG      r   )		enumerateabssortedr   r   matchrangeminlen)rZ   iwtxtbase_y	base_pagerr   	same_linesame_line_sortedvaljs              r   extract_total_land_from_wordsr      s6    % i  3&*>#*EvYF&	I ! aV9	) &	F*+a/  fI&	) 5    &i5HI%qy)88M3//J & 1q5#a!eSZ"89ux/088M3//J :3 !< )s   DD
D)D   খতিয়ানu   রায়তেরu   পিতাu	   অংশrn   u   দখলদারu   মন্তব্য)khatianownerfatheranshaarea
dakhaldaarremarksrZ   c                    [        S U  5       S 5      nUc  g U  Vs/ s H  n[        US   U-
  5      S:  d  M  UPM     nn0 n[        R                  5        H'  u  pVU H  nXgS   ;   d  M  XT;  d  M  US   XE'     M%     M)     [	        U5      S:  a  g U  Vs/ s H=  nUS   US-   :  d  M  [
        R                  US   5      (       d  M0  US   S:  d  M;  UPM?     nnUR                  S	S
5      n	U  Vs/ s HY  nUS   US-   :  d  M  US   S:  d  M  US   U	:  d  M'  US   [        ;  d  M6  [        R                  " SUS   5      (       a  MW  UPM[     n
nU(       a5  U
(       a.  [        S U 5       5      n[        S U
 5       5      nX-   S-  US'   [        UR                  5       S S9n0 n[        U5       H.  u  nu  nnUS-   [	        U5      :  a
  XS-      S   OSnUU4UU'   M0     U$ s  snf s  snf s  snf )Nc              3   B   #    U  H  nS US   ;   d  M  US   v   M     g7fr   rD   rK   Nr-   r.   r   s     r   r0   #build_column_map.<locals>.<genexpr>  "     V11HAfI1UYQvY   rK      rD   rJ   rI   
   F   r   i  2   z[\d./,\-()]+c              3   *   #    U  H	  oS    v   M     g7f)rL   Nr-   r   s     r   r0   r   2  s     ?,Qy,   c              3   *   #    U  H	  oS    v   M     g7f)rJ   Nr-   r   s     r   r0   r   3  s     ;]F)]r   r$   r   c                     U S   $ )NrG   r-   rq   s    r   rs   "build_column_map.<locals>.<lambda>7  s    adr   ru   rG   i'  )nextry   COLUMN_ANCHORSitemsr~   
KHATIAN_REr{   get
SKIP_TEXTSr   	fullmatchmaxr}   rz   rx   )rZ   header_yr   header_wordscol_xcolkeywordhwkhatian_numsfather_xowner_contentmax_khatian_xmaxmin_owner_xsorted_colscol_mapr   r   x_startx_ends                      r   build_column_mapr     s7   VVX\]H$Gu!AfI,@(AB(FAuLGE&,,.BV*$)9Z
  / 5zA~ aV9x"}$ 	
QvY' 	
 fIN 	
5   yy3'H aV9x"}$ 	
fIN 	
 fI  	
 fIZ'	 	

 _ai8 	
5   ?,??;];;*8A=g N;KG'4?D')*Q[1A)AE"1%t %( 5 N[ HsF   G*G*G/*G/	G/G/2G4G4G4G4+G4G4wordr   c                     U S   U S   -   S-  nUR                  5        H  u  nu  pEXBs=::  a  U:  d  M   Us  $   M     g )NrJ   rL   r$   )r   )r   r   xcr   x0x1s         r   col_ofr   A  sG    
v,f
%	*B Xb=b=J  ) r   tolc                    U (       d  / $ [        U S S9n/ US   /pCUSS   HT  nUS   US   S   :X  a.  [        US   US   S   -
  5      U::  a  UR                  U5        M@  UR                  U5        U/nMV     UR                  U5        U$ )Nc                     U S   U S   U S   4$ )NrN   rK   rJ   r-   r   s    r   rs   !group_into_rows.<locals>.<lambda>R  s    ai6AfI%Fr   ru   r   rG   rN   rK   )rz   ry   rX   )rZ   r   swrowscurr   s         r   group_into_rowsr   N  s    		F	GBRUG#VV9Av&3qy3q6&>/I+Jc+QJJqMKK#C  	KKKr   c                 4  ^^ [        S U  5       S5      nU  Vs/ s H=  nUS   US-   :  d  M  US   [        ;  d  M   US   R                  S5      (       a  M;  UPM?     nn[        USS9nU4S	 jmU4S
 jn/ n[	        U5       H  u  pU" U	S5      n
[        U
5      n[        R                  " SU5      (       d  M7  [        U" U	S5      5      n[        U" U	S5      5      n[        R                  U5      (       d  [        R                  U5      (       d  M  UR                  UU	U[        R                  U5      (       a  UOS[        R                  U5      (       a  UOSU	S   S   U	S   S   S.5        M     / n[	        U5       GH  u  nnUS:  a
  XS-
     S   OUnUS   nUS   nU Vs/ s H&  nUS   U:X  d  M  UUS   s=:  a  U:  d  M   O  M$  UPM(     nn[        T" US5      S S9nSR                  S U 5       5      R                  5       n[        T" US5      T" US   S5      -   S S9nSR                  S U 5       5      R                  5       nUR                  US   U[        U5      U[        U5      US   US   S.5        GM
     U$ s  snf s  snf ) Nc              3   B   #    U  H  nS US   ;   d  M  US   v   M     g7fr   r-   r   s     r   r0   *extract_khatian_entries.<locals>.<genexpr>e  r   r   r   rK   r   rD   r          @)r   c                 X   > U  Vs/ s H  n[        UT5      U:X  d  M  UPM     sn$ s  snf r   )r   )	row_wordsr   r   r   s      r   words_in_col-extract_khatian_entries.<locals>.words_in_colp  s)    $B9aq'(:c(A9BBBs   ''c                 r   > [        T" X5      S S9nSR                  S U 5       5      R                  5       $ )Nc                     U S   $ rp   r-   r   s    r   rs   >extract_khatian_entries.<locals>.text_in_col.<locals>.<lambda>t  s    &	r   ru   r   c              3   *   #    U  H	  oS    v   M     g7frD   Nr-   r   s     r   r0   ?extract_khatian_entries.<locals>.text_in_col.<locals>.<genexpr>u  s     .2a&	2r   )rz   joinr   )r   r   wsr   s      r   text_in_col,extract_khatian_entries.<locals>.text_in_cols  s3    L06IJxx.2..4466r   r   z\d[\d/]*r   r   r   rN   )row_idxrow
khatian_nor   r   data_y	data_pagerG   r   r   r   c                     U S   U S   4$ NrK   rJ   r-   r   s    r   rs   )extract_khatian_entries.<locals>.<lambda>  s    AfIqy+Ar   ru   r   c              3   *   #    U  H	  oS    v   M     g7fr   r-   r   s     r   r0   r     s     ;{!fI{r   r   r   c                     U S   U S   4$ r   r-   r   s    r   rs   r     s    1V9ai0r   c              3   *   #    U  H	  oS    v   M     g7fr   r-   r   s     r   r0   r     s     =1vYr   r   )r   owner_name_bn
owner_namefather_husband_name_bnfather_husband_namer   
area_acres)r   r   
startswithr   rx   r   r   r   
DECIMAL_REr{   rX   rz   r   r   r%   )rZ   r   r   r   
data_wordsr   r   	data_rowsr   r   khatian_rawr   r   r   entriesidxdrprev_data_yr   r   windowowner_wordsowner_bnfather_words	father_bnr   s    `                       @r   extract_khatian_entriesr   d  s   VVXYZH aV9x"}$ 	
fIZ' 	
 &	$$U+ 	
5   :3/DC7
 ID/!#y1k*
||K44S'23C01  '':+;+;D+A+A$(..u55U2&,,T22D!fVnQ
 	 "0 GY'R69Agia(28{O	H "
!!yI% *5&	*JF*J *J z 	 
 \&':!AC 88;{;;AAC *\"U)X-NN0

 HH===CCE	\*%4X>&/#=i#H[V*
 	1 (D N_d
s-   JJJJ)J:JJJc           
         [        U 5      n[        U 5      n[        U5      n[        U5      n[	        U5      nUc  [        U 5      R                  SS.UE$ [        X5      n[        U 5      R                  UR                  SS5      UR                  SS5      UR                  SS5      UR                  SS5      UR                  SS5      U[        U5      US	.	$ )
Nz
DLM failed)source_fileerrorrc   r   rd   re   rf   rg   )	r   rc   rd   re   rf   rg   rh   total_entrieskhatian_entries)
r^   ra   rl   r   r   r   r   r   r   r~   )r=   rZ   layoutheaderr   
total_landr   s          r   extract_land_recordr    s    hEX&FF#Fu%G.u5J >..!
 
 	
 &e5G  >..ZZ,zz)R0ZZ,ZZ,

:r2) \&
 
r   __main__rG   z&Usage: python extract_land.py file.pdfFr$   )ensure_asciiindent)r   )(__doc__r   jsonsysrO   pathlibr   difflibr   str	maketransr	   r   compiler   r   r   r    rY   r*   r%   listdictr^   ra   rl   r   r   r   r   r   r   r  __name__argvtargetsprintexitr<   dumpsr-   r   r   <module>r     s  
 !    # ==9<H! jj'
jj%
\

  ;C ; ; ;,S S D T
 Hc c D( ( (\)` '$ &&3DJ 34$; 3l  t 4: E Dd<L ,R4: R Rd Rp# $ B zhhqrlG67 ,F	$**V%
:; r   