
    TAi                    (   S r SSKJr  SSKrSSKrSSKrSSKrSSKrSSKrSSK	r	SSK
r
SSKrSSKJr  SSKJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJr  SSKrSSK J!r!  SSK"J#r#  SSK"J$r$  SSK"J%r%  SSK"J&r&  SSK"J'r'  SSK"J(r(  SSK"J)r)  SSK"J*r*  SSK"J+r+  SSK,J-r-  SSK.J/r/  SSK.J0r0  SSK1J2r2  SSK1J3r3  SSK4J5r5  SSK6J7r7  \8" \\-   \-   5      r9\9Ru                  S 5        S! r;S" r<SQS# jr=/ S$Qr>\>/ S%Q-   r?\>/ S&Q-   r@\?\?\@\?\@-   S'.rASRS( jrBSRS) jrC " S* S+5      rDSSS, jrESTS- jrFSUS. jrGS/ rH          SVS0 jrIS1 rJS2 rKS3 rLS4 rMS5 rNS6 rOS7 rPS8 rQS9 rRS: rSSWS; jrTSWS< jrUSXS= jrVSYS> jrWS? rXS@ 4       SZSA jjrYSXSB jrZS[SC jr[S\SD jr\ S\       S]SE jjr]  S^           S_SF jjr^  S`SG jr_  S`SH jr`          SaSI jra SbSJ jrbSK rcSL rdScSM jre       SdSN jrfSeSO jrg    SfSP jrhg)gz1General helper utilities to parse the pdf tables.    )annotationsN)groupby)
itemgetter)Path)Any)Callable)urlparse)uses_netloc)uses_params)uses_relative)Request)urlopen)PDFPageAggregator)LAParams)LTAnno)LTChar)LTContainer)LTImage)LTItem)
LTTextLine)LTTextLineHorizontal)LTTextLineVertical)PDFDocument)PDFPageInterpreter)PDFResourceManager)PDFPage)PDFTextExtractionNotAllowed)	PDFParser)StrByteType c                \     [        U 5      R                  [        ;   $ ! [         a     gf = f)zCheck to see if a URL has a valid protocol.

Parameters
----------
url : str or unicode

Returns
-------
isurl : bool
    If url has a valid protocol return True otherwise False.

F)	parse_urlscheme_VALID_URLS	Exception)urls    I/var/www/html/land-ocr/venv/lib/python3.13/site-packages/camelot/utils.pyis_urlr(   3   s.    ~$$33 s    
++c                    SnU (       aW  U[         R                  " [        R                  [        R                  -   [        R
                  -   5      -  nU S-  n U (       a  MW  U$ )zGenerate a random string .

Parameters
----------
length : int
    The length of the string to return.

Returns
-------
string
    returns a random string
r       )randomchoicestringdigitsascii_lowercaseascii_uppercase)lengthrets     r'   random_stringr3   F   sW     C
v}}MMF222V5K5KK
 	
 	!	 &
 J    c                &   [        S5       S3n[        R                  " SSS9 nSSS.n[        U S	U5      n[	        U5      nUR                  5       R                  5       nUS
:w  a  [        S5      eUR                  UR                  5       5        S	S	S	5        [        R                  R                  [        R                  R                  WR                  5      U5      n[        R                   " UR                  U5        U$ ! , (       d  f       Nx= f)zDownload file from specified URL.

Parameters
----------
url : str

Returns
-------
filepath : Union[StrByteType, Path]
    Temporary filepath.

   z.pdfwbF)deletezMozilla/5.0z<gzip;q=1.0, deflate;q=0.9, br;q=0.8, compress;q=0.7, *;q=0.1)z
User-AgentzAccept-EncodingNzapplication/pdfzFile format not supported)r3   tempfileNamedTemporaryFiler   r   infoget_content_typeNotImplementedErrorwritereadospathjoindirnamenameshutilmove)r&   filenamefheadersrequestobjcontent_typefilepaths           r'   download_urlrN   \   s      "#4(H		$	$T%	8A (]
 #tW-gxxz224,,%&ABB	
 
9 ww||BGGOOAFF3X>H
KK!O 
9	8s   A,D
D)	flag_sizemargins
split_text
strip_texttable_areastable_regionsbackend)columnsedge_tolrow_tol
column_tol)process_background
line_scale	copy_text
shift_textline_tol	joint_tolthreshold_blocksizethreshold_constant
iterations
resolutionuse_fallback)streamnetworklatticehybridc           	         [         U   n[        U R                  5       5      R                  [        U5      5      nU(       a3  [	        SR                  SR                  [        U5      5      U5      5      eg)zValidates input keyword arguments.

Parameters
----------
kwargs : [type]
    [description]
flavor : str, optional
    [description], by default "lattice"

Raises
------
ValueError
    [description]
z"{} cannot be used with flavor='{}',N)flavor_to_kwargssetkeys
difference
ValueErrorformatrB   sorted)kwargsflavorparser_kwargsisecs       r'   validate_inputrv      sa     %V,Mv{{}((]);<D0778NPVW
 	
 r4   c                    [         U   n[        U R                  5       5      nU H  nXB;  d  M
  U R                  U5        M     U $ )zRemove extra key - value pairs from a kwargs dictionary.

Parameters
----------
kwargs : [type]
    [description]
flavor : str, optional
    [description], by default "lattice"

Returns
-------
[type]
    [description]

)rk   listrm   pop)rr   rs   rt   kwargs_keyskeys        r'   remove_extrar|      s@      %V,Mv{{}%K#JJsO  Mr4   c                  $    \ rS rSrSrS rS rSrg)TemporaryDirectory   zAA class method that will be used to create temporary directories.c                    [         R                  " 5       U l        [        R                  " [
        R                  U R                  5        U R                  $ )zJEnter the temporary directory .

Returns
-------
[type]
    [description]
)r9   mkdtemprD   atexitregisterrE   rmtree)selfs    r'   	__enter__TemporaryDirectory.__enter__   s6     $$&	 	tyy1yyr4   c                    g)zCalled when the client exits.

Parameters
----------
exc_type : [type]
    [description]
exc_value : [type]
    [description]
traceback : [type]
    [description]
N )r   exc_type	exc_value	tracebacks       r'   __exit__TemporaryDirectory.__exit__   s     	r4   )rD   N)__name__
__module____qualname____firstlineno____doc__r   r   __static_attributes__r   r4   r'   r~   r~      s    Kr4   r~   c                    [        5        nU(       a  X-   n [        R                  R                  X 5      nSSS5        U$ ! , (       d  f       W$ = f)zGenerate a new path within a temporary directory.

Parameters
----------
filename : str
extension : str

Returns
-------
file_path_in_temporary_dir : str

N)r~   r@   rA   rB   )rG   	extensiontemp_dirrA   s       r'   build_file_path_in_temp_dirr      sE     
	+Hww||H/ 
 K	 
	 Ks   +A  
Ac                
    X-   $ )zTranslate x2 by x1.

Parameters
----------
x1 : float
    The offset to apply.
x2 : float
    The original y-coordinate.

Returns
-------
float
    The translated y-coordinate.

r   )x1x2s     r'   	translater     s      7Nr4   c                
    X-  $ )zScale a given value by a factor.

Parameters
----------
value : float
    The value to scale.
factor : float
    The scaling factor.

Returns
-------
float
    The scaled value.
r   )valuefactors     r'   scaler     s     >r4   c                   U u  p#pEUu  pgn[        X&5      n[        [        [        U* U5      5      U5      n[        XF5      n[        [        [        U* U5      5      U5      n[        U5      [        U5      [        U5      [        U5      4n	U	$ )a/  Translate and scale pdf coordinate space to image coordinate space.

Parameters
----------
k : tuple
    Tuple (x1, y1, x2, y2) representing table bounding box where
    (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate
    space.
factors : tuple
    Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
    first two elements are scaling factors and pdf_y is height of
    pdf.

Returns
-------
knew : tuple
    Tuple (x1, y1, x2, y2) representing table bounding box where
    (x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate
    space.

)r   absr   int)
kfactorsr   y1r   y2scaling_factor_xscaling_factor_ypdf_yknews
             r'   	scale_pdfr   &  s    , NBB07-	r	$B	s9eVR()+;	<B	r	$B	s9eVR()+;	<BGSWc"gs2w/DKr4   c                   Uu  pEn0 nU R                  5        H  nUu  pp[        X5      n	[        [        [        U* U
5      5      U5      n
[        X5      n[        [        [        U* U5      5      U5      n[	        X   6 u  pU Vs/ s H  n[        X5      PM     nnU Vs/ s H#  n[        [        [        U* U5      5      U5      PM%     nnS[        [	        UU5      5      0XyXU4'   M     / nU Hx  n[        US   U5      [        US   U5      p[        [        [        U* US   5      5      U5      [        [        [        U* US   5      5      U5      pUR                  XX45        Mz     / nU Hx  n[        US   U5      [        US   U5      p[        [        [        U* US   5      5      U5      [        [        [        U* US   5      5      U5      pUR                  XX45        Mz     UUU4$ s  snf s  snf )a?  Translate and scale image coordinate space to PDF coordinate space.

Parameters
----------
tables : dict
    A dictionary with table boundaries as keys (tuples of four floats)
    and a list of intersections (list of tuples of two floats) as values.
v_segments : list
    A list of vertical line segments, where each segment is a tuple
    of four floats (x1, y1, x2, y2).
h_segments : list
    A list of horizontal line segments, where each segment is a tuple
    of four floats (x1, y1, x2, y2).
factors : tuple
    A tuple (scaling_factor_x, scaling_factor_y, img_y) where the
    first two elements are scaling factors and img_y is the height of
    the image.

Returns
-------
Tuple[Dict[Tuple[float, float, float, float], Dict[str, List[Tuple[float, float]]]],
      List[Tuple[float, float, float, float]],
      List[Tuple[float, float, float, float]]]
    A tuple containing:
    - tables_new: A new dictionary with scaled table boundaries and joints.
    - v_segments_new: A new list of scaled vertical segments.
    - h_segments_new: A new list of scaled horizontal segments.
jointsr      r*      )rm   r   r   r   ziprx   append)tables
v_segments
h_segmentsr   r   r   img_y
tables_newr   r   r   r   r   j_xj_yj
j_x_scaled
j_y_scaledv_segments_newvh_segments_newhs                         r'   scale_imager   F  s   L 18-J[[]2(3y%,-/?@2(3y%,-/?@ 	?:=>#QeA0#
>RUVRUQeC	5&! 457GHRU
V d3z:67(

#$ " Nqt-.ad<L0MB#i!-.0@A#i!-.0@A  	rr./  Nqt-.ad<L0MB#i!-.0@A#i!-.0@A  	rr./  ~~557 ?Vs   G2!*G7c                   Sn[        U Vs/ s H)  oDR                  5       R                  5       (       d  M'  UPM+     sn5      n[        U Vs/ s H)  oDR                  5       R                  5       (       d  M'  UPM+     sn5      nXV:  a-  [        S U  5       5      n[        S U  5       5      nXx:  a  SOSnU$ s  snf s  snf )a  Get text rotation.

Detects if text in table is rotated or not using the current
transformation matrix (CTM) and returns its orientation.

Parameters
----------
horizontal_text : list
    List of PDFMiner LTTextLineHorizontal objects.
vertical_text : list
    List of PDFMiner LTTextLineVertical objects.
ltchar : list
    List of PDFMiner LTChar objects.

Returns
-------
rotation : string
    '' if text in table is upright, 'anticlockwise' if
    rotated 90 degree anticlockwise and 'clockwise' if
    rotated 90 degree clockwise.

r    c              3  v   #    U  H/  oR                   S    S:  =(       a    UR                   S   S:  v   M1     g7fr*   r   r   Nmatrix.0ts     r'   	<genexpr>get_rotation.<locals>.<genexpr>  s.     KUa;AHHQK!O;U   79c              3  v   #    U  H/  oR                   S    S:  =(       a    UR                   S   S:  v   M1     g7fr   r   r   s     r'   r   r     s.     OAHHQK!O?a?r   anticlockwise	clockwise)lenget_textstripsum)	charshorizontal_textvertical_textrotationr   hlenvlenr   r   s	            r'   get_rotationr     s    . H?C?ajjl.@.@.B?CDD=A=aJJL,>,>,@=ABD{KUKK	OOO&/&??[O DAs   &B7B7&B<6B<c                   U S   U S   4nU S   U S   4nU Vs/ s HI  nUS   US   S-
  :  d  M  US   US   S-   :  d  M%  US   S-
  US   s=::  a  US   S-   ::  d  MC  O  MG  UPMK     nnU Vs/ s HI  nUS   US   S-
  :  d  M  US   US   S-   :  d  M%  US   S-
  US   s=::  a  US   S-   ::  d  MC  O  MG  UPMK     nnXh4$ s  snf s  snf )a  Return all line segments present inside a bounding box.

Parameters
----------
bbox : tuple
    Tuple (x1, y1, x2, y2) representing a bounding box where
    (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
    space.
v_segments : list
    List of vertical line segments.
h_segments : list
    List of vertical horizontal segments.

Returns
-------
v_s : list
    List of vertical line segments that lie inside table.
h_s : list
    List of horizontal line segments that lie inside table.

r   r*   r   r   r   )	bboxr   r   lbrtr   v_sr   h_ss	            r'   segments_in_bboxr     s7   , q'47	B
q'47	B AQ4"Q%!) 	
 !!r!uqy 0 	
57UQY!A$5S"Q%RS)5S 	
5S 	
   AQ4"Q%!) 	
 !!r!uqy 0 	
57UQY!A$5S"Q%RS)5S 	
5S 	
  
 8O
s:   CCCC#C/C	C	C	5C	9C	c                    U R                   U R                  U R                   U R                  -   S-  U R                  U R                  U R                  U R                  -   S-  S.$ )zACalculate the coordinates of each alignment for a given textline.       @)leftrightmiddlebottomtopcenter)x0r   y0r   )textlines    r'   get_textline_coordsr     sT     ;;,3++{{;;,3 r4   c                    U R                  S5      u  pp4[        U5      n[        U5      n[        U5      n[        U5      n[        X5      [        X$5      [        X5      [        X$5      4$ )zDeserialize bbox from string ("x1,y1,x2,y2") to tuple (x1, y1, x2, y2).

Parameters
----------
bbox_str : str
    Serialized bbox with comma separated coordinates, "x1,y1,x2,y2".

Returns
-------
bbox : tuple
    Tuple (x1, y1, x2, y2).

rj   )splitfloatminmax)bbox_strr   r   r   r   s        r'   bbox_from_strr     sY     ^^C(NBB	rB	rB	rB	rBKRc"k3r;??r4   c                    U u  p#pEUu  pgpX&s=:  =(       a    U:  Os  =(       d    X(s=:  =(       a    U:  Os  =(       a/    X7s=:  =(       a    U:  Os  =(       d    X9s=:  =(       a    U:  $ s  $ )a  Check if boundingboxes overlap.

Parameters
----------
bbox1 : tuple
    Tuple (x1, y1, x2, y2) representing a bounding box where
    (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
    space.
bbox2 : tuple
    Tuple (x1, y1, x2, y2) representing a bounding box where
    (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
    space.

Returns
-------
bool
    Returns True if two bounding boxes overlap
r   )
bbox1bbox2left1bottom1right1top1left2bottom2right2top2s
             r'   bboxes_overlapr     sf    & &+"UV%*"UV##V#A)@)@&)@ 		!	!T	!=w'<'<'<'<r4   c           
         U Vs/ s HC  n[        XR                  UR                  UR                  UR                  45      (       d  MA  UPME     nnU$ s  snf )aZ  Return all text objects which overlap or are within a bounding box.

Parameters
----------
bbox : tuple
    Tuple (x1, y1, x2, y2) representing a bounding box where
    (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
    space.
textlines : List of PDFMiner text objects.

Returns
-------
t_bbox : list
    List of PDFMiner text objects.

)r   r   r   r   r   )r   	textlinesr   t_bboxs       r'   textlines_overlapping_bboxr    sC    " #UAnTDD!$$add;S&TaFUM Vs   A A	Ac                   U S   U S   4nU S   U S   4nU Vs/ s Hw  nUS   S-
  UR                   UR                  -   S-  s=::  a  US   S-   ::  d  M8  O  M<  US   S-
  UR                  UR                  -   S-  s=::  a  US   S-   ::  d  Mq  O  Mu  UPMy     nnU Vs1 s H  oDiM     nnU H{  nUR	                  5        Hd  nXx:X  a  M
  [        Xx5      (       d  M  [        U5      n	U	S:X  d  [        Xx5      U	-  S:  d  MA  [        X5      (       d  MS  UR                  U5        Mf     M}     [        U5      n
U
$ s  snf s  snf )a  Return all text objects in a bounding box.

Return the text objects which lie at least 80% inside a bounding box
across both dimensions.

Parameters
----------
bbox : tuple
    Tuple (x1, y1, x2, y2) representing a bounding box where
    (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
    space.
text : List of PDFMiner text objects.

Returns
-------
t_bbox : list
    List of PDFMiner text objects that lie inside table, discarding the overlapping ones

r   r*   r   r   r   g?)r   r   r   r   copybbox_intersect	bbox_areabbox_intersection_areabbox_longerdiscardrx   )r   textr   r   r   r  restbabbba_areaunique_boxess              r'   text_in_bboxr  -  sW   ( q'47	B
q'47	B Aa519+8r!uqy8 	
8 	
 qEAI!$$+,91	9 	
 : 	
   v!AvD))+Bxb%%#B-a<$:2$BW$LPS#S"2**R(   :L+ s#   7D8D81D8D8D8D=c                    0 n[        X5      US'   [        X5      US'   US   R                  S S9  US   R                  S S9  U$ )a  Return all text objects present inside a bounding box.

split between horizontal and vertical text.

Parameters
----------
bbox : tuple
    Tuple (x1, y1, x2, y2) representing a bounding box where
    (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
    space.
horizontal_text : List of PDFMiner text objects.
vertical_text : List of PDFMiner text objects.

Returns
-------
t_bbox : dict
    Dict of lists of PDFMiner text objects that lie inside table, with one
    key each for "horizontal" and "vertical"

horizontalverticalc                4    U R                   * U R                  4$ N)r   r   xs    r'   <lambda>'text_in_bbox_per_axis.<locals>.<lambda>r  s    addUADDMr4   r{   c                4    U R                   U R                  * 4$ r  )r   r   r  s    r'   r  r  s  s    144!$$-r4   )r  sort)r   r   r   r  s       r'   text_in_bbox_per_axisr  [  sZ    ( F'>F<%d:F:
<"9:
: 78Mr4   c                    [        U S   UR                  5      [        U S   UR                  5      [        U S   UR                  5      [        U S   UR
                  5      4$ )zAExpand (if needed) a bbox so that it fits the parameter textline.r   r*   r   r   )r   r   r   r   r   r   )r   r   s     r'   expand_bbox_with_textliner!  w  sV     	DGX[[!DGX[[!DGX[[!DGX[[!	 r4   c                    [        U 5      S:X  a  gU S   R                  U S   R                  U S   R                  U S   R                  4nU SS  H  n[        X5      nM     U$ )a5  Return the smallest bbox containing all the text objects passed as a parameters.

Parameters
----------
textlines : List of PDFMiner text objects.

Returns
-------
bbox : tuple
    Tuple (x1, y1, x2, y2) representing a bounding box where
    (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
    space.
r   Nr*   )r   r   r   r   r   r!  )r  r   tls      r'   bbox_from_textlinesr$    s`     9~aLOOYq\__ilooy|ODm(2 Kr4   c                .   [        U R                  UR                  5      n[        U R                  UR                  5      n[        U R                  UR                  5      n[        U R
                  UR
                  5      nXB:  d  XS:  a  gXB-
  X5-
  -  nU$ )a	  Return area of the intersection of the bounding boxes of two PDFMiner objects.

Parameters
----------
ba : PDFMiner text object
bb : PDFMiner text object

Returns
-------
intersection_area : float
    Area of the intersection of the bounding boxes of both objects

        )r   r   r   r   r   r   )r  r  x_lefty_topx_righty_bottomintersection_areas          r'   r	  r	    sy     FruuE"%%G255"%% H8+ )e.>?r4   c                h    U R                   U R                  -
  U R                  U R                  -
  -  $ )zReturn area of the bounding box of a PDFMiner object.

Parameters
----------
bb : PDFMiner text object

Returns
-------
area : float
    Area of the bounding box of the object

r   r   r   r   )r  s    r'   r  r    s'     EEBEEMbeebeem,,r4   c                    U R                   UR                  :  =(       aY    UR                   U R                  :  =(       a9    U R                  UR                  :  =(       a    UR                  U R                  :  $ )zReturn True if the bounding boxes of two PDFMiner objects intersect.

Parameters
----------
ba : PDFMiner text object
bb : PDFMiner text object

Returns
-------
overlaps : bool
    True if the bounding boxes intersect

r-  r  r  s     r'   r  r    sI     55BEE>RbeeruunR"%%RBEERUUNRr4   c                   / nU R                  S S9  U  Hn  nU(       a  US   S   U-   UR                  :  a)  UR                  UR                  UR                  /5        ML  [	        US   S   UR                  5      US   S'   Mp     U$ )a  Make a list of disjunct cols boundaries for a list of text objects.

Parameters
----------
tls : list of PDFMiner text object.

min_gap : minimum distance between columns. Any elements closer than
    this threshold are merged together.  This is to prevent spaces between
    words to be misinterpreted as boundaries.

Returns
-------
boundaries : list
    List x-coordinates for cols.
    [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]


c                    U R                   $ r  )r   )r#  s    r'   r  )find_columns_boundaries.<locals>.<lambda>  s    BEEr4   r  r*   )r  r   r   r   r   )tlsmin_gapcols_boundsr#  s       r'   find_columns_boundariesr7    s    & KHH!H"B 2W <ruu Druu~.!$[_Q%7!?KOA	 
 r4   c                   ^  T (       d  / $ [        T 5      S:  a  / $ [        [        U 4S j[        S[        T 5      5      5      5      nUR	                  ST S   S   5        UR                  T S   S   5        U$ )a  Find split lines given a list of boundaries between rows or cols.

Boundaries:     [ a ]         [b]     [   c   ]  [d]
Splits:         |        |         |            |  |

Parameters
----------
boundaries : list
    List of tuples of x- (for columns) or y- (for rows) coord boundaries.
    These are the (left, right most) or (bottom, top most) coordinates.

Returns
-------
anchors : list
    List of coordinates representing the split points, each half way
    between boundaries
r*   c                2   > TU S-
     S   TU    S   -   S-  $ )Nr*   r   r   r   )idx
boundariess    r'   r  +boundaries_to_split_lines.<locals>.<lambda>  s&    C!G,Q/*S/!2DDKr4   r   r3  )r   rx   maprangeinsertr   )r;  anchorss   ` r'   boundaries_to_split_linesrA    sz    & 	 :	 K!S_%	
G NN1jmA&' NN:b>!$%Nr4   c                    U $ r  r   r  s    r'   r  r    s    Qr4   c                \   [        U5      nUS:X  a  gX" US   5      ::  a  gX" US   5      :  a  US-
  $ SUS-
  pTXE:  a%  XE-   S-  nU" X   5      nXp:  a  US-   nOUnXE:  a  M%  US:X  a  gXC:X  a  US-
  $ [        U" X   5      U -
  5      [        U" XS-
     5      U -
  5      :  a  U$ US-
  $ )ax  Find the index of the closest point in sorted_list.

Parameters
----------
point : Any
    The reference sortable element to search.
sorted_list : List[Any]
    A sorted list of elements.
fn : Callable[[Any], Any], optional
    Optional accessor function, by default lambda x: x

Returns
-------
Optional[int]
    The index of the closest point, or None if the list is empty.
r   Nr3  r*   r   )r   r   )pointsorted_listfnnr   r   midmid_vals           r'   get_index_closest_pointrJ    s    & 	KA 	Av ;q>"";r?##1u QU%
,|![%&?7DE , qyy1u 2k 5()C;ax3H0IE0Q,RRaxr4   c                h    U R                   U R                  -
  UR                   UR                  -
  :  $ )a  Return True if the bounding box of the first PDFMiner object is longer or equal to the second.

Parameters
----------
ba : PDFMiner text object
bb : PDFMiner text object

Returns
-------
longer : bool
    True if the bounding box of the first object is longer or equal

)r   r   r/  s     r'   r
  r
  U  s'     EEBEEMruuruu}--r4   c                    / nU  HZ  nU(       d  UR                  U5        M  US   n[        R                  " XCUS9(       a  XC-   S-  nXBS'   MI  UR                  U5        M\     U$ )zMerge lines which are within a tolerance.

By calculating a moving mean, based on their x or y axis projections.

Parameters
----------
ar : list
line_tol : int, optional (default: 2)

Returns
-------
ret : list

r3  )abs_tolr   )r   mathisclose)arr^   r2   atemps        r'   merge_close_linesrS  f  s]     CJJqMr7D||DX6C'B

1  Jr4   c           	         U(       d  U $ [         R                  " SSR                  [        [         R                  U5      5       S3SU [         R
                  S9nU$ )zStrip any characters in `strip` that are present in `text`.

Parameters
----------
text : str
    Text to process and strip.
strip : str, optional (default: '')
    Characters that should be stripped from `text`.

Returns
-------
stripped : str
[r    ])flags)resubrB   r=  escapeUNICODE)r  r   strippeds      r'   
text_stripr]    sL     vvRWWSE*+,A.D

H Or4   c           	        / nUS:X  aW  U  Vs/ s HI  n[        U[        5      (       a  M  UR                  5       [        R                  " UR
                  SS94PMK     nnOhUS:X  aW  U  Vs/ s HI  n[        U[        5      (       a  M  UR                  5       [        R                  " UR                  SS94PMK     nnO[        S5      e0 nU H&  u  pgUR                  U/ 5      R                  U5        M(     [        U5      S:  a  [        UR                  5       5      n/ n	UR                  5        HZ  u  pzSR                  U
5      R                  5       nU(       d  M-  Xx:X  a  U	R                  SU S	35        MI  U	R                  U5        M\     SR                  U	5      nOSR                  S
 U 5       5      n[!        X5      $ s  snf s  snf )a  Flag super/subscripts.

Flag super/subscripts in text by enclosing them with <s></s>.
May give false positives.

Parameters
----------
textline : List[LTChar | LTAnno]
    List of objects implementing the LTCharProtocol.
direction : str
    Direction of the PDFMiner LTTextLine object.
strip_text : str, optional (default: '')
    Characters that should be stripped from a string before
    assigning it to a cell.

Returns
-------
str
    The processed string with flagged super/subscripts.
r  r6   )decimalsr  z;Invalid direction provided. Use 'horizontal' or 'vertical'.r*   r    z<s>z</s>c              3  *   #    U  H	  u  pUv   M     g 7fr  r   )r   r  _s      r'   r   !flag_font_size.<locals>.<genexpr>  s     0a74$as   )
isinstancer   r   nproundheightwidthro   
setdefaultr   r   r   rm   itemsrB   r   r]  )r   	directionrR   dr   size_groupsr  sizemin_sizeflistr   combined_charsfstrings                r'   flag_font_sizerr    s   0 "$AL  
a( ;QZZ\288AHHq9: 	
 

 
j	  
a( :QZZ\288AGGa89 	
 
 VWW +-K
tR(//5  ;!{'')*&,,.KDWWU^113N~#LL3~&6d!;<LL0 / ''%.''0a00g**I

s   G3G*G3Gc                    / nUR                   nUR                  5       (       a  SSUR                  5       4/$ US:X  a  [        XU5      nOUS:X  a  [	        XU5      n[        XSX$5      nU$ )a  Split textline into substrings if it spans across multiple rows/columns.

Parameters
----------
table : camelot.core.Table
    The table structure containing rows and columns.
textline : LTTextLine
    PDFMiner LTTextLine object.
direction : str
    Direction of the PDFMiner LTTextLine object, either "horizontal" or "vertical".
flag_size : bool
    Whether to highlight a substring using <s></s> if its size differs from the rest of the string.
strip_text : str
    Characters to strip from a string before assigning it to a cell.

Returns
-------
List[tuple[int, int, str]]
    A list of tuples of the form (idx, text) where idx is the index of row/column
    and text is an LTTextLine substring.
r3  r  r  )r   is_emptyr   _process_horizontal_cut_process_vertical_cut_group_and_process_chars)tabler   rj  rO   rR   cut_textr   grouped_charss           r'   split_textliner{    st    8 8:H==DR**,-..L *5DA	j	 ($?,X)XMr4   c                   / n[        U R                  5       VVs/ s H#  u  pEUS   US   ::  d  M  US   US   ::  d  M!  UPM%     nnn[        U R                  5       VVs/ s H,  u  pxUS   US   US   -   S-  s=::  a  US   ::  d  M&  O  M*  UPM.     n	nnU	(       d  U$ U	S   nU V
s/ s HB  oR                  U   U
   R                  (       d  M%  XR                  U   U
   R
                  4PMD     sn
=(       d"    US   U R                  U   S   R
                  4/nUR                   H  nU R                  U   nU H  n[        U[        5      (       al  US   UR                  UR                  -   S-  s=::  a	  US   ::  a?  O  O<UR                  UR                  -   S-  US   ::  a  UR                  XS   U45          M  [        U[        5      (       d  M  UR                  XS   U45        M     M     U$ s  snnf s  snnf s  sn
f )z(Process horizontal cuts of the textline.r   r   r*   r   r3  )	enumeratecolsrowscellsr   r   _objsrc  r   r   r   r   r   r   r   )rx  r   r   ry  ir  	x_overlapr   rr_idxcx_cutsrK   rowcuts                  r'   ru  ru    s    8:H

++daqttAw47aPQd?+    

++daqtQ$q'8IQ7N/VRSTURV/V/V+ 
  aA+4+4aAq8I8O8O!KKN1  !9 1Q<Q+..
/	0  ~~jjmC3''Fsvv!3=s1v=VVcff_)SV3FC 01C((FC 01   O7-   G8
G8G8 %G>	G>G>($H!Hc                   / n[        U R                  5       VVs/ s H#  u  pEUS   US   ::  d  M  US   US   ::  d  M!  UPM%     nnn[        U R                  5       VVs/ s H,  u  pxUS   US   US   -   S-  s=::  a  US   ::  d  M&  O  M*  UPM.     n	nnU	(       d  U$ U	S   nU V
s/ s HB  oR                  U
   U   R                  (       d  M%  XR                  U
   U   R
                  4PMD     sn
=(       d"    US   U R                  S   U   R
                  4/nUR                   H  nU R                  U   nU H  n[        U[        5      (       al  US   UR                  UR                  -   S-  s=::  a	  US   ::  a?  O  O<UR                  UR
                  -   S-  US   :  a  UR                  US   X45          M  [        U[        5      (       d  M  UR                  US   X45        M     M     U$ s  snnf s  snnf s  sn
f )z&Process vertical cuts of the textline.r*   r   r   r   r3  )r}  r  r~  r  r   r   r  rc  r   r   r   r   r   r   )rx  r   r   ry  r   y	y_overlapr  r  c_idxr  y_cutsrK   colr  s                  r'   rv  rv  .  s    8:H

++daqttAw47aPQd?+    

++daqtQ$q'8IQ7N/VRSTURV/V/V+ 
  aA+4+4aAq8I8P8P!KKN1  !9 1Q<R+..
/	0  ~~jjmC3''Fsvv!3=s1v=VVcff_)SV3Q 01C((Q 01   O7r  c                   / n[        U [        SS5      5       H  u  pV[        U5      nU(       a7  UR                  US   US   [	        U Vs/ s H  oS   PM	     snX#S945        MN  / n	U H%  nU	R                  US   R                  5       5        M'     UR                  US   US   [        SR                  U	5      U5      45        M     U$ s  snf )a  
Group characters and process them based on size flag.

Parameters
----------
cut_text : list of tuples
    Each tuple consists of (x0, y0, character), where x0 and y0 are
    coordinates and character can be an instance of LTChar or LTAnno.

flag_size : bool
    A flag indicating whether to group by font size.

direction : str
    Direction for processing the text (e.g., 'horizontal' or 'vertical').

strip_text : str
    Characters to strip from the text.

Returns
-------
list of tuples
    Each tuple consists of (x0, y0, processed_text), where processed_text
    is the grouped and processed text based on the specified conditions.
r   r*   r   rR   r    )r   r   rx   r   rr  r   r]  rB   )
ry  rO   rj  rR   rz  r{   r   
chars_listr   gcharss
             r'   rw  rw  Q  s    < 13Mh
1a(89
%[
  FF"'12z!1z2I Fadmmo.     QQBGGFOZ!HI% :,  3s   Cc           
        S/S-  u  pg[        [        U R                  5      5       GH  nUR                  UR                  -   S-  U R                  U   S   :  d  M6  UR                  UR                  -   S-  U R                  U   S   :  d  Mh  / n	U R
                   H  n
U
S   UR                  ::  a  U
S   UR                  :  a{  U
S   UR                  ::  a  UR                  OU
S   nU
S   UR                  :  a  UR                  OU
S   nU	R                  [        X-
  5      [        U
S   U
S   -
  5      -  5        M  U	R                  S5        M     [        [        [        S U	5      5      5      S:X  ax  UR                  5       R                  S5      nUR                  UR                  4nU R
                  S   S   U R
                  S   S   4n[        R                  " U SU S	U 3SS
9  UnU	R!                  [#        U	5      5      n  O   US:X  a  / S4$ [%        XXg5      nU(       a  ['        XX$US9U4$ U(       a  Xg[)        UR*                  X%S94/U4$ Xg[-        UR                  5       U5      4/U4$ )a  
Get indices of the table cell.

Get the index of a table cell where a given text object lies by
comparing their y and x-coordinates.

Parameters
----------
table : camelot.core.Table
    The table structure containing rows and columns.
t : object
    PDFMiner LTTextLine object.
direction : string
    Direction of the PDFMiner LTTextLine object.
split_text : bool, optional (default: False)
    Whether or not to split a text line if it spans across multiple cells.
flag_size : bool, optional (default: False)
    Whether to highlight a substring using <s></s> if its size is different
    from the rest of the string.
strip_text : str, optional (default: '')
    Characters that should be stripped from a string before assigning it to a cell.

Returns
-------
list
    List of tuples of the form (r_idx, c_idx, text) where r_idx and c_idx
    are row and column indices, respectively.
float
    Assignment error, percentage of text area that lies outside a cell.
    +-------+
    |       |
    |   [Text bounding box]
    |       |
    +-------+
r3  r   r   r   r*   c                    U S:g  $ )Nr3  r   r  s    r'   r  !get_table_index.<locals>.<lambda>  s    br4   
 z does not lie in column range )
stacklevel      ?)rO   rR   r  )r>  r   r  r   r   r~  r   r   r   r   rx   filterr   r   warningswarnindexr   calculate_assignment_errorr{  rr  r  r]  )rx  r   rj  rQ   rO   rR   r  r  r  lt_col_overlapr  r   r   r  
text_range	col_rangeerrors                    r'   get_table_indexr    sY   L 4!8LE3uzz?#DD144K3Aq!11qttadd{c6IEJJM

M 7  NZZQ4144<AaDADDL#$Q4144<144QqTD$%aDADDLADDadE"))#dl*;c!A$1+>N*NO"))"-   40.ABCqHzz|))$/ddADD\
"ZZ]1-uzz"~a/@A	fAj\)G	{S  E"((^)<=E- $. {3w&q>E)Z 	
 	
 >!''9TU
 	 z!**,
CDEuLLr4   c                   S/S-  u  pEpgU R                   UR                  U   S   :  a(  [        U R                   UR                  U   S   -
  5      nU R                  UR                  U   S   :  a(  [        U R                  UR                  U   S   -
  5      nU R                  UR
                  U   S   :  a(  [        U R                  UR
                  U   S   -
  5      nU R                  UR
                  U   S   :  a(  [        U R                  UR
                  U   S   -
  5      n[        U R                  U R                  -
  5      S:X  a  SO![        U R                  U R                  -
  5      n[        U R                   U R                  -
  5      S:X  a  SO![        U R                   U R                  -
  5      n	X-  n
XU-   -  XU-   -  -   U
-  nU$ )a  
Calculate the assignment error for the given text object.

Parameters
----------
t : object
    PDFMiner LTTextLine object.
table : camelot.core.Table
    The table structure containing rows and columns.
r_idx : int
    Row index where the text object is located.
c_idx : int
    Column index where the text object is located.

Returns
-------
float
    The calculated assignment error.
r      r*   r&  r  )r   r  r   r   r   r~  r   )r   rx  r  r  	y0_offset	y1_offset	x0_offset	x1_offsetr  r  charear  s               r'   r  r    s   ( 34q.I)ttejj""uzz%0334	ttejj""uzz%0334	ttejj""uzz%0334	ttejj""uzz%0334	144!$$;3&Cqtt,<A144!$$;3&Cqtt,<AUFy()ay3H.IJfTELr4   c                    Sn Sn[        U  Vs/ s H  o3S   PM	     sn5      U:w  a  [        S5      eU  H-  nUS   [        US   5      -  nUS    H  nX$SU-
  -  -  nM     M/     U$ s  snf ! [         a    Sn U$ f = f)a^  Compute Accuracy.

Calculates a score based on weights assigned to various
parameters and their error percentages.

Parameters
----------
error_weights : list
    Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
    where pn is the weight assigned to list of errors en.
    Sum of pn should be equal to 100.

Returns
-------
score : float

d   r   z&Sum of weights should be equal to 100.r*   )r   ro   r   ZeroDivisionError)error_weights	score_valscoreewweighterror_percentages         r'   compute_accuracyr    s    $ I	."1./9<EFFBUSAZ'F$&qE 1'7#788 %*   L /  Ls"   A- A(A	A- (A- -A=<A=c                    SnSnU  Hc  n[        U[        5      (       d  M  U[        U5      -  nU H5  n[        U[        5      (       d  M  UR	                  5       S:X  d  M0  US-  nM7     Me     US:X  a  gSX-  -  nU$ )zCalculates the percentage of empty strings in a two-dimensional list.

Parameters
----------
d : list
    A two-dimensional list (list of lists) containing strings.

Returns
-------
whitespace : float
    Percentage of empty cells.
r   r    r*   r&  r  )rc  rx   r   strr   )rk  
whitespacetotal_elementsr  r   whitespace_percentages         r'   compute_whitespacer  #  s     JN ac!f$Na%%!'')r/!OJ     :#>?  r4   c                   [        U S5       n[        U5      n	[        U	5      n
U
R                  (       d  [	        SU  35      e[        UUUUUUUS9n[        5       n[        XS9n[        X5      n[        [        R                  " U
5      S5      nUc  [        eUR                  U5        UR                  5       nUR                  S   nUR                  S   nUU4nUU4sSSS5        $ ! , (       d  f       g= f)a  Return a PDFMiner LTPage object and page dimension of a single page pdf.

To get the definitions of kwargs, see
https://pdfminersix.rtfd.io/en/latest/reference/composable.html.

Parameters
----------
filename : string
    Path to pdf file.
line_overlap : float
char_margin : float
line_margin : float
word_margin : float
boxes_flow : float
detect_vertical : bool
all_texts : bool

Returns
-------
layout : object
    PDFMiner LTPage object.
dim : tuple
    Dimension of pdf page in the form (width, height).

rbz Text extraction is not allowed: )line_overlapchar_marginline_marginword_margin
boxes_flowdetect_vertical	all_texts)laparamsNr   r   )openr   r   is_extractabler   r   r   r   r   nextr   create_pagesprocess_page
get_resultr   )rG   r  r  r  r  r  r  r  rH   parserdocumentr  rsrcmgrdeviceinterpreterpagelayoutrg  rf  dims                       r'   get_page_layoutr  I  s    F 
h	1v&&&-28*=  %###!+
 %&"7>(9G((2D9<--  &""$AQfos{7 
		s   CC))
C7c                    / n U  H^  n[        U[        5      (       a  UR                  U5        M+  [        U[        5      (       d  MB  [	        U5      nUR                  U5        M`     U$ ! [         a     U$ f = f)zGet charachter objects from a pdf layout.

Recursively parses pdf layout to get a list of PDFMiner LTChar

Parameters
----------
layout : object
    PDFMiner LTContainer object.

Returns
-------
result : list
    List of LTChar text objects.

)rc  r   r   r   get_char_objectsextendAttributeError)r  char_object
child_chars       r'   r  r    st      DG'6**G$G[11-g6
J'  K  Ks   A A*  A* *
A87A8c                r   / n/ n/ n/ n U  GH  n[        U[        5      (       a  UR                  U5        OM[        U[        5      (       a  UR                  U5        O&[        U[        5      (       a  UR                  U5        [        U[
        5      (       a  UR                  U5        M  [        U[        5      (       d  M  [        U5      u  pgpUR                  U5        [        U5      nUR                  U5        UR                  U5        UR                  U	5        GM     XX44$ ! [         a     Nf = f)a  Parse a PDF layout to get objects.

Recursively parses pdf layout to get a list of
PDFMiner LTImage, LTTextLineHorizontal, LTTextLineVertical objects.

Parameters
----------
layout : object
    PDFMiner LTContainer object
        ( LTPage, LTTextLineHorizontal, LTTextLineVertical).

Returns
-------
result : tuple
    Include List of LTImage objects, list of LTTextLineHorizontal objects
    and list of LTTextLineVertical objects

)rc  r   r   r   r   r   r   get_image_char_and_text_objectsr  r  r  )
r  imager  r   r   r  child_imager  child_horizontal_textchild_vertical_texts
             r'   r  r    s   . EDOMG'7++W%G%9::&&w/G%788$$W-'6**G$G[113G< T)> [)-g6
J'&&'<=$$%89# ( 66  s   B5D) A"D) )
D65D6)r&   r  returnzStrByteType | Path)rg   r  )r   r   r   r   r  r   )r   r   r   r   r  r   )
r   zBdict[tuple[float, float, float, float], list[tuple[float, float]]]r   'list[tuple[float, float, float, float]]r   r  r   ztuple[float, float, float]r  ztuple[dict[tuple[float, float, float, float], dict[str, list[tuple[float, float]]]], list[tuple[float, float, float, float]], list[tuple[float, float, float, float]]])r  r   )r  bool)r  )rD  r   rE  z	list[Any]rF  zCallable[[Any], Any]r  z
int | None)r   )r    )r   zlist[LTChar | LTAnno]rj  r  rR   r  r  r  )Fr    )rx  r   r   r   rj  r  rO   r  rR   r  r  list[tuple[int, int, str]])r  &list[tuple[int, int, LTChar | LTAnno]])
ry  r  rO   r  rj  r  rR   r  r  r  )FFr    )rk  zlist[list[str]]r  r   )      ?r  r  g?r  TT)r  zLTContainer[Any]r  zlist[LTChar])r  zLTContainer[LTItem]r  zXtuple[list[LTImage], list[LTChar], list[LTTextLineHorizontal], list[LTTextLineVertical]])ir   
__future__r   r   rN  r@   r+   rX  rE   r-   r9   r  	itertoolsr   operatorr   pathlibr   typingr   r   urllib.parser	   r"   r
   r   r   urllib.requestr   r   numpyrd  pdfminer.converterr   pdfminer.layoutr   r   r   r   r   r   r   r   r   pdfminer.pdfdocumentr   pdfminer.pdfinterpr   r   pdfminer.pdfpager   r   pdfminer.pdfparserr   pypdf._utilsr   rl   r$   r  r(   r3   rN   common_kwargstext_kwargslattice_kwargsrk   rv   r|   r~   r   r   r   r   r   r   r   r   r   r   r  r  r  r!  r$  r	  r  r  r7  rA  rJ  r
  rS  r]  rr  r{  ru  rv  rw  r  r  r  r  r  r  r  r   r4   r'   <module>r     s   7 "   	  	          . $ $ & " "  0 $ " " ' # " & 0 . , 1 1 $ 8 ( $ -+-;<   B &,> NN "  N*	 
06 @(&$@M6NM67M6 8M6 (	M6
M6`B"J	@,4*+\8.4- S":)Z DO66&6,@66r.":: HJ>+#>+03>+AD>+>+J ((( ( 	(
 (  (V + F + F6466 6 	6
  6t HJPMf#L@#!P >B:171717r4   