
    TAin>                         S r SSKrSSKrSSKrSSKrSSKJr  SSKJ	r	  SSKJ
r
  SSKJr  SSKJr  SS	KJr   " S
 S5      r " S S\5      rg)zDDefines a base parser. As well as generic methods for other parsers.    N   )Table)bbox_from_str)compute_accuracy)compute_whitespace)get_table_index)text_in_bboxc                       \ rS rSrSr        SS jrS rS rS rS r	S	 r
\S
 5       rS rS rS rS rS rS rSrg)
BaseParser   zDefines a base parser.Nc
                     Xl         X l        X0l        0 U l        S U l        X@l        XPl        X`l        Xpl        Xl	        S U l
        S U l        SU l        0 U l        U	(       d  S U l        g g )Ni,  )idtable_regionstable_areastable_bbox_parsescolumns	copy_text
split_text
strip_text
shift_text	flag_sizerootnamet_bbox
resolutionparse_details)
self	parser_idr   r   r   r   r   r   r   debugs
             P/var/www/html/land-ocr/venv/lib/python3.13/site-packages/camelot/parsers/base.py__init__BaseParser.__init__   sl     *&!#"$$$" !%D     c                 J    [        U R                  R                  5       S SS9$ )zeReturn a list of table bounding boxes sorted by position .

Returns
-------
[type]
    [description]
c                     U S   $ )N    xs    r   <lambda>)BaseParser.table_bboxes.<locals>.<lambda>>   s    1Q4r"   T)keyreverse)sortedr   keysr   s    r   table_bboxesBaseParser.table_bboxes6   s#     d,,113QUVVr"   c	                 x   Xl         Xl        X l        X0l        X@l        XPl        X`l        Xpl        U R                  u  U l        U l	        [        R                  R                  U R                   5      u  U l        n	U R                  b3  U R                  U R                  S'   U R                   U R                  S'   gg)zPrepare the page for parsing.Nr   r   )filenamelayout_kwargslayout
dimensionspageimageshorizontal_textvertical_text	pdf_width
pdf_heightospathsplitextr   r   r   r   )
r   r3   r5   r6   page_idxr8   r9   r:   r4   __s
             r   prepare_page_parseBaseParser.prepare_page_parse@   s     !*$	.**.//'GG,,T]];r)262D2DD/040@0@D}- *r"   c                     / nU R                   c  UR                  U5        U$ U R                    H)  n[        [        U5      U5      nUR                  U5        M+     U$ )zIf regions have been specified, filter textlines to these regions.

Parameters
----------
textlines : list
    list of textlines to be filtered

Returns
-------
filtered_textlines : list of textlines within the regions specified

)r   extendr	   r   )r   	textlinesfiltered_textlines
region_strregion_texts        r   _apply_regions_filter BaseParser._apply_regions_filter[   se      %%%i0
 "! #00
*=+DiP"))+6 1 "!r"   c                     U R                   (       dl  [        R                  R                  U R                  5      nU R
                  (       a  [        R                  " U S3SS9  g[        R                  " SU 3SS9  gg)zDetect image only documents and warns.

Returns
-------
has_no_text : bool
    Whether the document doesn't have any text at all.
z8 is image-based, camelot only works on text-based pages.r%   )
stacklevelzNo tables found on r   TF)r9   r=   r>   basenamer   r8   warningswarn)r   r   s     r   _document_has_no_text BaseParser._document_has_no_textq   sm     ##ww''6H{{j !> >    3H:>1Mr"   c                 ^    [        X45      nU R                  Ul        US-   Ul        X%l        U$ )a  Initialize new table object, ready to be populated.

Parameters
----------
table_idx : int
    Index of this table within the pdf page analyzed
bbox : set
    bounding box of this table within the pdf page analyzed
cols : list
    list of coordinate boundaries tuples (left, right)
rows : list
    list of coordinate boundaries tuples (bottom, top)

Returns
-------
table : camelot.core.Table

r%   )r   r7   order_bbox)r   	table_idxbboxcolsrowstables         r   _initialize_new_table BaseParser._initialize_new_table   s.    & d!YY
!mr"   c                     U$ )z
Reduces index of a text object if it lies within a spanning cell.

Only useful for some parsers (e.g. Lattice), base method is a
noop.
r&   )tidxr   s      r   _reduce_indexBaseParser._reduce_index   s	     
r"   c                    / nS H  nU R                   U    H  n[        UUUU R                  U R                  U R                  S9u  pV[        U5      S:  d  MB  US   SS S:w  d  MP  UR                  U5        [        U 5      R                  XU R                  S9nU H  u  pxn	XR                  U   U   l        M     M     M     U$ )zCompute parse errors for the table .

Parameters
----------
table : camelot.core.Table

Returns
-------
Tuple
    Parse errors
)vertical
horizontal)r   r   r   r   Nr   )re   )r   )r   r   r   r   r   lenappendtyper`   r   cellstext)
r   rZ   
pos_errors	directionr^   indiceserrorr_idxc_idxrj   s
             r   compute_parse_errorsBaseParser.compute_parse_errors   s     
 4I[[+!0#"nn#" w<!#qz"1~1"))%0"&t*":":!t #; # 3:.E$=AKK.u5: 3: , 4$ r"   c                     [        5       eNNotImplementedError)r   rW   	user_colss      r   _generate_columns_and_rows%BaseParser._generate_columns_and_rows       !##r"   c                     [        5       ert   ru   )r   rV   rW   rX   rY   kwargss         r   _generate_tableBaseParser._generate_table   rz   r"   c                     [        5       ert   ru   r/   s    r   _generate_table_bboxBaseParser._generate_table_bbox   rz   r"   c           
         U R                  5       (       a  / $ U R                  5         / n[        U R                  5       5       H  u  p#U R                  bM  U R                  U   S:w  a:  U R                  U   R                  S5      nU Vs/ s H  n[        U5      PM     nnOSnU R                  X45      u  pgpU R                  X#XgXS9n
UR                  U
5        M     U$ s  snf )z!Extract tables from the document.N ,)v_sh_s)
rQ   r   	enumerater0   r   splitfloatrx   r}   rg   )r   _tablesrV   rW   rw   crX   rY   r   r   rZ   s              r   extract_tablesBaseParser.extract_tables   s    %%''I 	!!#():):)<=OI||'DLL,Cr,I
 !LL399#>	/89y!U1Xy	9	 	#'#B#B4#S D(($#(WENN5!  >  :s   
C c           	      L   U R                   Ul        U R                  Ul        UR                  U R                  ;   a  U R                  UR                     Ul        O[        SUR                   S35        / / / / 4$ U R                  Ul        U R                  U5      n[        SU//5      Ul
        U R                  b  UR                  U R                  5        UR                  n[        R                  " U5      Ul        UR                   R"                  Ul        [%        U5      Ul        U R(                  U R*                  4Ul        / nUR/                  U R0                   Vs/ s H0  oUR2                  UR4                  UR6                  UR8                  4PM2     sn5        UR/                  U R:                   Vs/ s H0  oUR2                  UR4                  UR6                  UR8                  4PM2     sn5        XAl        U R0                  U R:                  -   Ul        gs  snf s  snf )*Record data about the origin of the table.zWarning: Bounding box z  not found in table_bbox_parses.d   N) r   flavorr3   rU   r   parseprintr   rq   r   accuracyr   copy_spanning_textdatapd	DataFramedfshaper   
whitespacer;   r<   pdf_sizerE   r9   x0y0x1y1r:   _textrF   )r   rZ   rk   r   r   r^   s         r   record_parse_metadata BaseParser.record_parse_metadata   s   ww;;$00000=EK
 (5UV r2r>!"00..u5
)C+<*=>>>%$$T^^4zz<<%hhnn-d3..$//:8L8LM8L1ttQTT144.8LMN8J8JK8J1ttQTT144.8JKL..1C1CC NKs   "7H97H!)r   r   r6   r3   r   r9   r   r8   r5   r4   r7   r   r<   r;   r   r   r   r   r   r   r   r   r   r:   )NNNFr   NFF)__name__
__module____qualname____firstlineno____doc__r    r0   rB   rJ   rQ   r[   staticmethodr`   rq   rx   r}   r   r   r   __static_attributes__r&   r"   r   r   r      st     
  &DWA6",*2  !F$$$8 Dr"   r   c                      ^  \ rS rSrSr          SU 4S jjr\SS j5       r\SS j5       r\S 5       r	\S 5       r
\S 5       rS	 rS
 rU 4S jrSrU =r$ )TextBaseParseri  z Base class for all text parsers.c           
      |   > [         TU ]  UUUUUUUS9  X@l        U R                  5         Xl        Xl        Xl        g)z:Initialize the text base parser class with default values.)r   r   r   r   r   r   N)superr    r   _validate_columnsedge_tolrow_tol
column_tol)r   r   r   r   r   r   r   r   r   r   r   r   r|   	__class__s                r   r    TextBaseParser.__init__  sN      	'#!! 	 	
   $r"   c                    Sn/ n/ nU R                  S S9  U  Vs/ s H)  oUR                  5       R                  5       (       d  M'  UPM+     nnU Hp  nUc  UR                  nOL[        R
                  " X%R                  US9(       d(  UR                  [        US S95        / nUR                  nUR                  U5        Mr     UR                  [        US S95        U$ s  snf )a  
Group PDFMiner text objects into rows vertically within a tolerance.

Parameters
----------
text : list
    List of PDFMiner text objects.
row_tol : int, optional (default: 2)

Returns
-------
rows : list
    Two-dimensional list of text objects grouped into rows.

Nc                 4    U R                   * U R                  4$ rt   )r   r   r'   s    r   r)   ,TextBaseParser._group_rows.<locals>.<lambda>M  s    !$$r"   )r+   abs_tolc                     U R                   $ rt   r   r^   s    r   r)   r   W  s    qttr"   c                     U R                   $ rt   r   r   s    r   r)   r   ]  s    qttr"   )sortget_textstripr   mathiscloserg   r-   )rj   r   row_yrY   tempr^   non_empty_texts          r   _group_rowsTextBaseParser._group_rows9  s    " 		-	.%)BTZZ\-?-?-A!TBA
 }\\%w?F4^<= KKN   	F4^45! Cs   &CCc                 `   / nU  GH$  nU(       d  UR                  U5        M  US   nUS:  al  US   US   ::  d!  [        R                  " US   US   US9(       a,  [        US   US   5      n[	        US   US   5      nXe4US'   M  UR                  U5        M  US:  d  M  US   US   ::  aj  [        R                  " US   US   [        U5      S9(       a  UR                  U5        M  [        US   US   5      n[	        US   US   5      nXe4US'   GM  UR                  U5        GM'     U$ )a  Merge column boundaries if they overlap or lie within a tolerance.

Parameters
----------
cl : list
    List of column x-coordinate tuples.
column_tol : int, optional (default: 0)

Returns
-------
merged : list
    List of merged column x-coordinate tuples.

re   r   r%   r   )rg   r   r   maxminabs)clr   mergedhigherlowerupper_boundlower_bounds          r   _merge_columnsTextBaseParser._merge_columns`  s$     Ff%r
?ayE!H,q	58Z1 '*%(F1I&>&)%(F1I&>&1%?r
f-!^ayE!H,<<q	58S_U"MM&1*-eAhq	*BK*-eAhq	*BK*5)CF2Jf-/ 0 r"   c                    U  Vs/ s H'  n[        S U 5       5      [        S U 5       5      /PM)     nn[        S[        U5      S-
  5       H$  nXE   nXES-      nUS   US   -   S-  =US'   US'   M&     XS   S'   X$S   S'   U$ s  snf )aA  
Make row coordinates continuous.

For the row to "touch"
we split the existing gap between them in half.

Parameters
----------
rows_grouped : list
    Two-dimensional list of text objects grouped into rows.
text_y_max : int
text_y_min : int

Returns
-------
rows : list
    List of continuous row y-coordinate tuples.

c              3   8   #    U  H  oR                   v   M     g 7frt   )r   .0r^   s     r   	<genexpr>,TextBaseParser._join_rows.<locals>.<genexpr>  s     !q!q   c              3   8   #    U  H  oR                   v   M     g 7frt   )r   r   s     r   r   r     s     &7QttQr   r   r%   r   re   )r   r   rangerf   )rows_grouped
text_y_max
text_y_minrrow_boundariesitop_row
bottom_rows           r   
_join_rowsTextBaseParser._join_rows  s    , CO
BNQS!q!!3&7Q&7#78, 	 
 q#n-12A$'G'A.J*1!*z!}*D)IIGAJA 3  *q! *r1
s   .Bc                 z   U(       a  [         R                  XS9nU Vs/ s H  n[        U5      PM     nnU VVs/ s H>  n[        U5      [        U5      :X  d  M  U  H  oUR                  UR
                  4PM     M@     nnnU R                  [         R                  [        U5      5      5        U $ s  snf s  snnf )aG  Adds columns to existing list.

By taking into account
the text that lies outside the current column x-coordinates.

Parameters
----------
cols : list
    List of column x-coordinate tuples.
text : list
    List of PDFMiner text objects.
ytol : int

Returns
-------
cols : list
    Updated list of column x-coordinate tuples.

)r   )	r   r   rf   r   r   r   rE   r   r-   )rX   rj   r   r   elementsr^   new_colss          r   _add_columnsTextBaseParser._add_columns  s    * !--d-DD(,-1AH-&*&*c!fH.Eq!qttqd   KK55fX6FGH .s   B2B7%B7c                 H   [        U 5      n [        S[        U 5      5       Vs/ s H  o0U   S   XS-
     S   -   S-  PM     n nU R                  SU5        U R	                  U5        [        S[        U 5      S-
  5       Vs/ s H  o0U   XS-      4PM     n nU $ s  snf s  snf )zMakes column coordinates continuous.

Parameters
----------
cols : list
    List of column x-coordinate tuples.
text_x_min : int
text_y_max : int

Returns
-------
cols : list
    Updated list of column x-coordinate tuples.

r%   r   r   )r-   r   rf   insertrg   )rX   
text_x_min
text_x_maxr   s       r   _join_columnsTextBaseParser._join_columns  s    " d|;@CI;NO;Naadq5k!n,1;NOAz"J05aTQ0GH0G1a$1u+&0GH	 P Is   BBc                     U R                   bF  U R                  b8  [        U R                   5      [        U R                  5      :w  a  [        S5      eg g g )Nz1Length of table_areas and columns should be equal)r   r   rf   
ValueErrorr/   s    r   r    TextBaseParser._validate_columns  sM    'DLL,D4##$DLL(99 !WXX : -E'r"   c                 l    U R                  XX45      nUR                  5       nU R                  U5        U$ rt   )r[   set_all_edgesr   )r   rV   rW   rX   rY   r|   rZ   s          r   r}   TextBaseParser._generate_table  s5    **9DG##%""5)r"   c                 2   > [         TU ]  U5        SUl        g)r   N)r   r   	_segments)r   rZ   r   s     r   r   $TextBaseParser.record_parse_metadata  s    %e,r"   )r   r   r   r   )
NNNFFr   2   r   r   F)r   )r   )r   r   r   r   r   r    r   r   r   r   r   r   r   r}   r   r   __classcell__)r   s   @r   r   r     s    *
 %> $ $L ( (T  >  :  .Y
 r"   r   )r   r   r=   rO   pandasr   corer   utilsr   r   r   r   r	   r   r   r&   r"   r   <module>r      sC    J  	    ! $ & #  CD CDLZZ Zr"   