
    2/jSY                         d Z ddlZddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZ ddlmZ dZdZd	Zd
ZdZ ej        ed           dedefdZd Zd Zd Zd Zd Zedk    r e             dS dS )u~  
match_pipeline.py

HYBRID LAND MATCHING PIPELINE
=============================

Business logic preserved.
Only extraction engine upgraded.

NEW FLOW:

formatted_output.json
    ↓
Download PDFs
    ↓
main_orchestrator.py
    ↓
DL Pipeline
    ↓
If low confidence → DLM fallback
    ↓
Final extraction JSON
    ↓
Matching Logic (UNCHANGED)
    ↓
Grouped Result JSON

    N)Path)datetime)ThreadPoolExecutoras_completed)process_land_recordzformatted_output.jsongrouped_result	downloads   <   T)exist_oknamereturnc                     | sdS |                                                                  } t          j        dd|           } t          j        dd|           } | S )N z[^\w\s] z\s+)lowerstripresub)r   s    ,/var/www/html/banglarbhumi/match_pipeline.pynormalize_namer   @   sS     r::<<D6*c4((D6&#t$$DK    c                     ddl m} t          |           }t          |          }|r|sdS t           |d ||                                          d          S )Nr   )SequenceMatcher   )difflibr   r   roundratio)name1name2r   n1n2s        r   name_match_scorer#   Q   st    ''''''			B			B R qb"%%++--	  r   c                    	 t          j        | t                    }|                                 | d| d}t          j                            t          |          }t          |d          5 }|	                    |j
                   d d d            n# 1 swxY w Y   |S # t          $ r9}t          d|             t          t          |                     Y d }~d S d }~ww xY w)N)timeout_z.pdfwbu   ❌ PDF download failed: )requestsgetREQUEST_TIMEOUTraise_for_statusospathjoinDOWNLOAD_DIRopenwritecontent	Exceptionprintstr)urlrel_ididxresponsefilenamer-   fes           r   download_pdfr=   e   s<   <#
 
 

 	!!###((s(((w||
 

 $ 	&GGH$%%%	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	&    /#//000c!ffttttts<   A,B" .B	B" BB" BB" "
C%,.C  C%c                    g }g }g }| D ]/}|                     dd          }|r|                    |           0d}|D ]_}d}	d }
|D ]}t          ||          }||	k    r|}	|}
|	dk    r|                    ||
|	d           ||	z  }J|                    |           `|r|t          |          z  }nd}||t	          |d          dS )N
owner_namer   r   ffffff?)expectedmatched_withscorer   )matchedmissingco_farmer_match_score)r)   appendr#   lenr   )extracted_entriesexpected_namesrD   rE   extracted_namesentryownertotal_scorerA   
best_score
best_matchextrC   final_scores                 r   match_cofarmersrS      sP   
 GGO
 # * *		,++ 	*""5)))
 K" % %

" 		! 		!C$ E
 z!!"
 
NN$ *#     :%KK NN8$$$$
  !C$7$77 !&{A!6!6  r   c           	         |                      d          }t          d           t          d|            t          d           |                      dg           }|                      dg           }|                      dg           }|                      dg           }|d	g g d
g d}|s
dg|d<   ||fS g }t          |d          D ]\  }}	t          d|            t          |	||          }
|
s,	 t	          |
          }|d                             |
|                     di                                d          |                     di                                d          d           |                     dg           }|                    |           |                     dd          }|r||vr|d                             d           |                     dd          }|r||vr|d                             d           D# t          $ rE}t          dt          |                      |d                             d           Y d }~d }~ww xY wt          ||          }|d         |d<   |d                             |d                    |d         |d<   |d         d k    rd|d         vrd|d         vrd!|d"<   t          t                              |d                             |d<   ||fS )#Nr7   z
============================zProcessing REL_ID: z============================JL_NoDaag_Noz
co-farmerspdf_urlsREJECTEDr   )r7   statusrD   rE   rF   	documentspdf_missingrE      )startu   
📄 Downloading PDF rZ   metadatapipeline_useddl_confidence)pdfpipeliner`   khatian_entriesjl_nor   jl_mismatchdaag_nodaag_mismatchu   ❌ Extraction failed: extraction_failedrD   rF   r@   ACCEPTEDrY   )r)   r4   	enumerater=   r   rG   extendr3   r5   rS   listdictfromkeys)rowr7   expected_jlexpected_daagrJ   rW   finalall_entriesr8   r6   pdf_path
extractionentriesextracted_jlextracted_daagr<   match_results                    r   process_single_recordrz      s   WWXF	
+,,,	
(
(
()))	
)***'''2&&KGGIr**MWW\2..Nwwz2&&H !" E  )?iu}
 Kha000 P PS---...
 
  	C	
 -X66J+%%&NN  #o&&!+" " #o&&
' 
' 
 
 
 !nn! G
 w'''
 &>> L
  {22)$++%   (^^ N
  !66)$++'    	 	 	4CFF44555)###       		 # L
 $I.E)	)L3444%1&E
!" 	%&$..y!1115#333$h eI&'' E) 5=s   3DH


I:IIc            	      H   t          j                     } t          t          dd          5 }t          j        |          }d d d            n# 1 swxY w Y   t          dt          |           d           i }t          t                    5 fd|D             }t          |          D ]}||         }	 |
                                \  }}||t          |          <   t          d|            G# t          $ rK}	t          d	|                    d
                      t          t          |	                     Y d }	~	d }	~	ww xY w	 d d d            n# 1 swxY w Y   t          j                                        d          }
t"           d|
 d}t          |dd          5 }t          j        ||dd           d d d            n# 1 swxY w Y   t          d|            t'          t          j                     | z
  d          }t          d| d           d S )Nrzutf-8)encodingu   
🚀 Loaded z records)max_workersc                 H    i | ]}                     t          |          |S  )submitrz   ).0ro   executors     r   
<dictcomp>zmain.<locals>.<dictcomp>  sD     
 
 

 	 OO%  	
 
 
r   u   ✅ Completed REL_ID: u   ❌ Failed REL_ID: r7   z%Y%m%d_%H%M%Sr&   z.jsonwr   F)indentensure_asciiu   
💾 Saved: u   
⏱ Total Time: s)timer0   
INPUT_FILEjsonloadr4   rH   r   MAX_WORKERSr   resultr5   r3   r)   r   nowstrftimeOUTPUT_PREFIXdumpr   )
start_timer;   rowsgrouped_resultsfuturesfuturero   r7   r   r<   	timestampoutput_file
total_timer   s                @r   mainr   l  s=   J
 
j#	0	0	0 Ay||               

.3t99
.
.
.///O
 

 
 
  	
 
 
 

 
 
 
 #7++ 	 	F&/C!'/5F,5V55       =#''(*;*;==   c!ff	                             J '' I
 ++9+++  
k3	1	1	1 
Q			
 	
 	
 	

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

(;
(
()))	j 	 J
 

,z
,
,
,-----s`   AAA(E8;C43E4
E	>AE?EE		EE E)GGG__main__)__doc__r,   r   r   r   r(   pathlibr   r   concurrent.futuresr   r   main_orchestratorr   r   r   r/   r   r*   makedirsr5   r   r#   r=   rS   rz   r   __name__r   r   r   <module>r      s^  
 : 
			 				                ? ? ? ? ? ? ? ?
 2 1 1 1 1 1
 %
  L4 ( ( ( (
 
 
 
 
 
"  (  FC C CTS S StO. O. O.j zDFFFFF r   