
    Ցi-                        S SK r S SKrS SKrS SKrS SKJr  S SKJr  S SKJ	r	J
r
JrJrJr  S SKrS SKJr  S SKJrJrJr  SSKJrJrJr  S	 r SDS
\\\\4      S\\\\4      S\S\4S jjrS\
\\\
   4   S\S\
\\\
   4   4S jrS r S r!S r"SES jr#SFS jr$S r%S r&S r' " S S\5      r( " S S\5      r)Sr*Sr+S r,S!r-S"r.S#r/S$r0\Rb                  " \0 S%\0 S&3\Rd                  S'9r3S(\4S) jr4S* r5S+\)4S, jr6S-\S\4S. jr7S/\4S0 jr8S(\S\\S4   4S1 jr9 SGS(\S2\S3\S\\\\\4   S4   4S4 jjr:    SHS5\S6\S7\S2\S8\S\4S9 jjr;S: r<\Rb                  " S;\Rz                  5      r>\Rb                  " S<\Rz                  5      r?\Rb                  " S=5      r@\Rb                  " S=5      rAS>\R                  S\
\\4   4S? jrCS@\SA\SB\S\\\
\\4   4   4SC jrDg)I    N)Counter)deepcopy)AnyDictListTupleUnion)Image)	BaseModelcomputed_fieldmodel_validator   )calculate_bbox_areacalculate_overlap_ratio"calculate_projection_overlap_ratioc                 J    U R                   (       d  U R                  S5      n U $ )Nr   )is_validbuffer)polys    n/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddlex/inference/pipelines/paddleocr_vl/uilts.py
make_validr   "   s    =={{1~K    polygon1polygon2modereturnc                     SSK Jn  U" U 5      nU" U5      n[        U5      n[        U5      nUR	                  U5      R
                  nUR                  U5      R
                  nUS:X  a  Xg-  $ US:X  a$  [        UR
                  UR
                  5      nXh-  $ US:X  a$  [        UR
                  UR
                  5      n	Xi-  $ [        SU 35      e! [         a    [        S5      ef = f)ah  
Calculate the overlap ratio between two polygons.

Args:
    polygon1 (List[Tuple[int, int]]): First polygon represented as a list of points.
    polygon2 (List[Tuple[int, int]]): Second polygon represented as a list of points.
    mode (str, optional): Overlap calculation mode. Defaults to "union".

Returns:
    float: Overlap ratio value between 0 and 1.
r   )PolygonzPlease install Shapely library.unionsmalllargezUnknown mode: )
shapely.geometryr   ImportErrorr   intersectionarear   minmax
ValueError)
r   r   r   r   poly1poly2r$   r   
small_area
large_areas
             r   calculate_polygon_overlap_ratior-   (   s     =, HEHEuEuE%%e,11LKK##Ew##	UZZ0
((	UZZ0
((>$011#  =;<<=s   C C&layout_det_reslayout_shape_modec                    [        U 5      nUS    Vs/ s H  o3S   S:w  d  M  UPM     nn[        5       n[        [        U5      5       GH  nXF   S   u  pxpX-
  X-
  pUS:  d  US:  a  UR	                  U5        [        US-   [        U5      5       GH7  nXe;   d  X;   a  M  [        XF   S   XM   S   S5      nXF   S   S:X  d  XM   S   S:X  a@  US	:  a:  XF   S   S:X  a  UR	                  U5        XM   S   S:X  a  UR	                  U5        M}  US
:  d  M  US:w  a'  SXF   ;   a  [        XF   S   XM   S   S5      nUS
:  a  M  [        XF   S   5      n[        XM   S   5      nXF   S   XM   S   1nU1 Sk-  (       a   [        U5      S:  a  SU;  d  U1 Sk::  a  GM  UU:  a  UR	                  U5        GM&  UR	                  U5        GM:     GM     [        U5       VVs/ s H  u  nnUU;  d  M  UPM     snnUS'   U$ s  snf s  snnf )a  
Remove overlapping boxes from layout detection results based on a given overlap ratio.

Args:
    layout_det_res (Dict[str, List[Dict]]): Layout detection result dict containing a 'boxes' list.

Returns:
    Dict[str, List[Dict]]: Filtered dict with overlapping boxes removed.
boxeslabel	reference
coordinate      r    inline_formula      ?gffffff?rectpolygon_points>   sealchartimagetabler>   )	r   setrangelenaddr   r-   r   	enumerate)r.   r/   layout_det_res_filteredboxr1   dropped_indexesix1y1x2y2whjoverlap_ratiopoly_overlap_ratio
box_area_i
box_area_jlabelsidxs                       r   filter_overlap_boxesrU   N   sX    '~6.w77w<;;V7 
  eO3u:,/w1q5AE"q1uc%j)A#q';3&(>M !%558G$(88 3&x(,<<'++A.x(,<<'++A.s"$.3Cux3O)H!12EH=M4NPW*& *C/ 0,1GH
0,1GH
(7+UXg->???CKRSOf, ; 1 !+#''*#''*K * X &e,(,S?0J,(G$ #"g`(s   HH"H3Hc                 p    [        U [        R                  5      (       a  U $ [        R                  " U 5      $ )z
Convert the input to a PIL Image.

Args:
    img (PIL.Image or numpy.ndarray): Input image.

Returns:
    PIL.Image: PIL Image object.
)
isinstancer
   	fromarrayimgs    r   to_pil_imager[      s)     #u{{##
??3r   c                 p    [        U [        R                  5      (       a  [        R                  " U 5      $ U $ )z
Convert the input to a numpy array.

Args:
    img (PIL.Image or numpy.ndarray): Input image.

Returns:
    numpy.ndarray: Numpy array image.
)rW   r
   nparrayrY   s    r   to_np_arrayr_      s(     #u{{##xx}Jr   c                     U  Vs/ s H  n[        U5      R                  PM     nnU  Vs/ s H  n[        U5      R                  PM     nn[        U5      n[	        U5      nXE4$ s  snf s  snf )z
Calculate width (max of all) and height (sum) for a vertical merge of images.

Args:
    images (List[PIL.Image or np.ndarray]): List of images.

Returns:
    Tuple[int, int]: (width, height) of merged image.
)r[   widthheightr'   sum)imagesrZ   widthsheightsrL   rM   s         r   calc_merged_whrg      sc     288#l3%%F839:6C|C ''6G:FAGA4K	 9:s
   A$A)c                    U (       d  g[        U 5      S:X  a  [        U S   5      $ [        U[        5      (       a  U/[        U 5      S-
  -  n[        U5      [        U 5      S-
  :w  a  [	        S5      e[        U S   5      n[        S[        U 5      5       H  n[        X   5      nXS-
     n[        UR                  UR                  5      nUR                  UR                  -   n[        R                  " SXx4S5      n	US:X  a#  XsR                  -
  S-  n
XuR                  -
  S-  nO&US	:X  a  XsR                  -
  n
XuR                  -
  nOS=pU	R                  X:S45        U	R                  X[UR                  45        U	nM     [        U5      $ )
a  
Merge images vertically with given alignment.

Args:
    images (List[PIL.Image or np.ndarray]): List of images to merge.
    aligns (str or List[str]): Alignment(s) for each merge step ('center', 'right', 'left').

Returns:
    np.ndarray: Merged image as numpy array.
Nr6   r   z,The length of aligns must be len(images) - 1RGB   rk   rk   centerr   right)rA   r_   rW   strr(   r[   r@   r'   ra   rb   r
   newpaste)rd   alignsr/   mergedrG   img2alignrL   rM   new_imgrH   rJ   s               r   merge_imagesrv      s`    
6{a6!9%%&#S[1_-
6{c&kAo%GHH&)$F1c&k"FI&1udjj)MMDKK'))EA6?;Hll"q(Bjj.Q&Bg\\!BZZBKBf1g&d/0! #" vr   c                 x	  ^^)^* / n0 n[        U 5       H&  u  pVUS   T;   a  XdU'   M  UR                  XV45        M(     / n/ n/ n	/ n
S m)U)4S jnU4S jn[        U5       GH  u  nu  pVU(       d
  U/nU/n	/ n
M  X=S-
     u  pUS   nUS   nUS   nUS   n[        UUS5      nUS:H  =(       ai    US	:H  =(       a]    UU:H  =(       aQ    US   US
   :  =(       a?    US   US   :  =(       a-    US   US
   -
  [        US
   US   -
  US
   US   -
  5      S-  :  nUS:  =(       a    US;   =(       a    UU:H  =(       a~    US   US   :  =(       al    [	        US   US   -
  5      [        US   US   -
  US   US   -
  5      S-  :  =(       a/    T)" US   US   5      T)" US
   US
   5      -  =(       a	    U" X^U 5      nU(       a  SnOU(       a
  U" UU5      nOSnU(       d  U(       a6  UR                  U5        U	R                  U5        U
R                  U5        GM  UR                  X45        U/nU/n	/ n
GM     U(       a  UR                  X45        / nU H1  u  nn[        U5      [        U5      nnUR                  UUUU45        M3     / n[        5       m*SnU[        U 5      :  Ga  SnU GH  u  nnnnUU:X  d  M  [        U*4S jU 5       5      (       d  M-  SnU Vs/ s H
  oU   S   PM     nnU(       a  UO/ n [        U5      u  n!n"U!S:w  a  U"U!-  O
[        S5      n#U#S:  a[  [        U5       HK  u  n$n%U U%   R                  5       nU U%   S   US'   SUS'   UR                  U5        T*R                  U%5        MM     Oy[        UU U5      n&[        U5       H]  u  n$n%U U%   R                  5       nU$S:X  a  U&OSUS'   U$S:X  a  U OSUS'   US   US'   UR                  U5        T*R                  U%5        M_     / n'[        US-   U5       H  n(U(U;   d  M  U'R                  U(5        M     U' H(  n(UR                  UU(   5        T*R                  U(5        M*     US-   n  O   U(       a  GM  XT;   a*  UT*;  a$  UR                  XE   5        T*R                  U5        US-  nU[        U 5      :  a  GM  U$ s  snf )a/  
Merge blocks based on alignment and overlap logic, except for those with labels in non_merge_labels.

Args:
    blocks (List[Dict]): List of block dicts.
    non_merge_labels (List[str]): Block labels that should not be merged.

Returns:
    List[Dict]: List of processed (and possibly merged) blocks.
r2   c                 "    [        X-
  5      S:*  $ )N   abs)a1a2s     r   
is_aligned merge_blocks.<locals>.is_aligned  s    27|q  r   c                 Z   > T" U S   US   5      (       a  gT" U S   US   5      (       a  gg)Nr   leftr   rm   rl    )
block_bbox	prev_bboxr~   s     r   get_alignment#merge_blocks.<locals>.get_alignment  s6    jmYq\22
1y|44r   c                 2  > X!   S   nX    S   n[        US   US   5      n[        US   US   5      n[        US   US   5      n[        US   US   5      nXVXx/n	[        U5       H-  u  pXU4;   d	  US   T;  a  M  US   n[        X5      S:  d  M-    g   g)	NrE   r   r6   r      r2   TF)r&   r'   rC   r   )	block_idxprev_idxblocksr   r   rH   rI   rJ   rK   min_boxrT   other_block
other_bboxnon_merge_labelss                r   overlapwith_other_box+merge_blocks.<locals>.overlapwith_other_box  s    $U+	&u-
1z!}-1z!}-1z!}-1z!}-2" )& 1C8,,w'/??$U+J&w;a? !2 r   r6   rE   
horizontalr   textr   r   g333333?)r   r8   rl   NFc              3   ,   >#    U  H	  oT;  v   M     g 7fNr   ).0rG   used_indicess     r   	<genexpr>merge_blocks.<locals>.<genexpr>a  s     #Q=a\$9=s   TrZ   infmerge_alignsgroup_id)rC   appendr   r'   r{   r&   r?   rA   allrg   floatcopyrB   rv   r@   )+r   r   r/   blocks_to_mergenon_merge_blocksrT   blockmerged_groupscurrent_groupcurrent_indicescurrent_alignsr   r   rG   r   
prev_blockr   
prev_labelr   block_labeliou_his_crossis_updown_align
align_modegroup_rangesgroup_indicesrq   startendresult_blocksgroup_foundimgsr   rL   rM   aspect_ratiorN   r   
merged_imginsert_listn_idxr~   r   s+    `                                       @@r   merge_blocksr      sf    O'
>--$)S!""C<0	 ( MMON!& %_5<C"GM"eON.1u5u%	(
5\
Gn2:y,WQJ Tv%Tz)T 1	!,T 1	!,	T
 1	!,)A,1-z!}z!}/LMPSST 	 AI 
=x'
=z)
= 11-
= JqMIaL01)A,1-z!}z!}/LMPSST	
= :a=)A,7Z]IaL9:
= &cV< 	 !J&z9=JJ  '""3'!!*-  /!BC"GM"eONg 6h o>?L!.v']);sUC?@ "/ M5L
C
F
1=-E3ve|#Q=#Q Q Q"2?@-Qq	%(-@)/vR%d+1()Qq1uE%L1$(1-(@9 &y 1 6 6 8'-i'8'?e04n-%,,U3$((3 )A ".dLBS!TJ(1-(@9 &y 1 6 6 856!Vze@AQDn-,9!,<j)%,,U3$((3 )A !"519c2E 00#**51 3 )E!(()9%)@A $$U+ ) Ag? 2>@ "s,'>  !1!67S!qO F
P G As   !R7c                   ^ SSK mSU4S jjnU Vs/ s H  n[        U5      PM     snu  pVpxXu-
  n	X-
  n
U R                  5       nTR                  XU4Xx4SSS9  TR                  nSnU" X,[        X5      SS	9u  pn[        S
[        R                  " X-  5      5      nXYU-
  S-  -   nXjU-   S-  -   nTR                  UUUU4UUSUTR                  S9  U$ s  snf )a$  
Fill a rectangular area in the image with a white background and write the given token string.

Args:
    image (np.ndarray): Image to paint on.
    box (tuple): (x1, y1, x2, y2) coordinates of rectangle.
    token_str (str): Token string to write.

Returns:
    np.ndarray: Modified image.
r   N?c                    > Su  pEUnXT-
  S:  a>  XE-   S-  nTR                  XUSS9u  u  pn
XU-  :  a  XU-  :  a  UnUnOUnXT-
  S:  a  M>  UWW	4$ )N)g?
   g{Gz?r   r6   )	thickness)getTextSize)r   fontFacesquare_size
fill_ratior   rm   optimal_scalemidrL   rM   _cv2s              r   get_optimal_font_scale+paint_token.<locals>.get_optimal_font_scale  s     lT!<1$CqIIFQA++*4L0L # lT! a""r   rj   )colorr      )r   r6   r   )r   r   r   )lineType)r   )r   intr   	rectangleFONT_HERSHEY_SIMPLEXr&   r'   mathfloorputTextLINE_AA)r=   rE   	token_strr   vrH   rI   rJ   rK   box_wbox_hrZ   fontthickness_scale_ratio
font_scaletext_wtext_hfont_thicknesstext_xtext_yr   s                       @r   paint_tokenr     s    #  '**cc!fc*NBBGEGE
**,CMM#Bx"BMO ##D!7U*s"J DJJz'IJKN 6>a''F6>a''FKK	  	 J; +s   Cc                 *   S nSSK nUR                  S5        0 nUu  pgp/ n
U" [        U5      5      nUR                  U5        [	        U5       H  u  pUS   u  pnnX:  d  M  X:  d  M  UU::  d  M$  UU	::  d  M,  U
R                  U5        [        UU-
  UU-
  5      S:  a  MU  X-
  X-
  UU-
  UU-
  /nS[        X   5      -   S-   n[        U UU5      n US	   UU'   M     [	        U5       VVs/ s H  u  nnUU
;   d  M  US	   PM     nnnXU4$ s  snnf )
a  
Replace figures in a table area with tokens, return new image and token map.

Args:
    table_block_img (np.ndarray): Table image.
    table_box (list): Table bounding box [x_min, y_min, x_max, y_max].
    figures (List[Dict]): List of figure dicts (must contain 'coordinate', 'path').

Returns:
    Tuple[np.ndarray, Dict[str, str], List[str]]:
        - New table image,
        - Token-to-img HTML map,
        - List of figure paths dropped.
c                     1 Skn/ nSn[        U5      U :  aC  [        [        U5      5      U-  (       d  UR                  U5        US-  n[        U5      U :  a  MC  U$ )N>   019r   r6   )rA   r?   rn   r   )numexclude_digitsseqrG   s       r   gen_random_map0tokenize_figure_of_table.<locals>.gen_random_map  sT    (#hnAK.0

1FA #hn 
r   r   Ni   r4      [F]path)	randomseedrA   shufflerC   r   r&   rn   r   )table_block_img	table_boxfiguresr   r   	token_maptable_x_mintable_y_mintable_x_maxtable_y_max
drop_idxes
random_map	figure_idfigurefigure_x_minfigure_y_minfigure_x_maxfigure_y_maxdraw_boxr   rG   fdrop_figuress                          r   tokenize_figure_of_tabler    sE     
KKI9B6KkJG-J
NN:&w/	AGAU>L,'+++i(<,.|0KLrQ**{*{*	H s:#899C?I)/8YOO#)&>Ii + 0, +4G*<P*<$!QZIAfI*<LP|33 Qs   .D?	Dc                 F   ^^ UU4S jnSn[         R                  " XCU 5      $ )z
Replace tokens in a string with their HTML image equivalents.

Args:
    table_res_str (str): Table string with tokens.
    figure_token_map (dict): Mapping from tokens to HTML img tags.

Returns:
    str: Untokenized string.
c                   > U R                  S5      nSU S3nTR                  X R                  S5      5      nT	R                  US 5      nUc  U R                  S5      $ / nUR                  SR                  UR	                  SS5      R	                  SS	5      5      5        SR                  U5      nUR                  S:w  a  UR                  nUS
U-   S
-   -  nU$ )Nr6   r   r   r   z<img src="{}" alt="Image"" />z-
 
 

)groupgetr   formatreplacejoincontent)
matchtoken_idtokenimg_path	img_blockimg_tags
image_infoocr_contentfigure_token_mapimage_path_to_obj_maps
           r   repl(untokenize_figure_of_table.<locals>.repl  s    ;;q>XJa #''{{1~>)--h=	;;q>!HOO/66$$UB/77cB
 8,J  B&'//f{2V;;
r   z
\[F(\d+)\])resub)table_res_strr  r  r  patterns    ``  r   untokenize_figure_of_tabler#    s     ( G66'//r   c                       \ rS rSr% SrSr\\S'   Sr\\S'   \\S'   \\S'   \\S'   \\S	'   \	\S
'   Sr
\\S'   Sr\\S'   Sr\\S'   \" SS9\S\S\4S j5       5       rSrg)	TableCelli'  aK  
TableCell represents a single cell in a table.

Attributes:
    row_span (int): Number of rows spanned.
    col_span (int): Number of columns spanned.
    start_row_offset_idx (int): Start row index.
    end_row_offset_idx (int): End row index (exclusive).
    start_col_offset_idx (int): Start column index.
    end_col_offset_idx (int): End column index (exclusive).
    text (str): Cell text content.
    column_header (bool): Whether this cell is a column header.
    row_header (bool): Whether this cell is a row header.
    row_section (bool): Whether this cell is a row section.
r6   row_spancol_spanstart_row_offset_idxend_row_offset_idxstart_col_offset_idxend_col_offset_idxr   Fcolumn_header
row_headerrow_sectionbefore)r   datar   c                 
   [        U[        5      (       am  SU;   a  U$ US   R                  SS5      n[        U5      (       d<  UR	                  SS5      nU(       a  U H  nX$S   S-   -  nM     UR                  5       nX!S'   U$ )z
Create TableCell from dict, extracting 'text' property correctly.

Args:
    data (Any): Input data.

Returns:
    Any: TableCell-compatible dict.
r   bboxr  r	  text_cell_bboxesNr  )rW   r   r  rA   popstrip)clsr0  r   
text_cellsels        r   from_dict_formatTableCell.from_dict_formatC  s     dD!!~<##GR0Dt99!XX&8$?
(7c 11 )zz|Lr   r   N)__name__
__module____qualname____firstlineno____doc__r&  r   __annotations__r'  rn   r,  boolr-  r.  r   classmethodr   r9  __static_attributes__r   r   r   r%  r%  '  s      HcHc
IM4JK(#C C   $r   r%  c                   x    \ rS rSr% Sr/ r\\   \S'   Sr	\
\S'   Sr\
\S'   \\S\\\      4S j5       5       rS	rg
)	TableDatai]  z
TableData holds a table's cells, row and column counts, and provides a grid property.

Attributes:
    table_cells (List[TableCell]): List of table cells.
    num_rows (int): Number of rows.
    num_cols (int): Number of columns.
table_cellsr   num_rowsnum_colsr   c                 D   [        U R                  5       VVs/ s H:  n[        U R                  5       Vs/ s H  n[        SUUS-   UUS-   S9PM     snPM<     nnnU R                   H  n[        [        UR                  U R                  5      [        UR                  U R                  5      5       HY  n[        [        UR                  U R                  5      [        UR                  U R                  5      5       H
  nXCU   U'   M     M[     M     U$ s  snf s  snnf )zn
Returns a 2D grid of TableCell objects for the table.

Returns:
    List[List[TableCell]]: Table as 2D grid.
r	  r6   )r   r(  r)  r*  r+  )
r@   rG  rH  r%  rF  r&   r(  r)  r*  r+  )selfrG   rN   
table_datacells        r   gridTableData.gridk  s   ( 4==)
 * t}}-	 .A )*'(1u)*'(1u .	 * 	 
 $$DD--t}}=D++T]]; 114==A//?A (,qM!$		 % -	
s   DDDDr   N)r;  r<  r=  r>  r?  rF  r   r%  r@  rG  r   rH  r   propertyrM  rC  r   r   r   rE  rE  ]  sU     $&Ki%HcHcd4	?+   r   rE  z<nl>z<fcel>z<ecel>z<lcel>z<ucel>z<xcel>z+(?:<fcel>|<ecel>|<nl>|<lcel>|<ucel>|<xcel>)z.*?(?=z|$))flagssc           	      $   SSR                  [        [        [        [        [
        [        /5      -   S-   n[        R                  " X5      n[        R                  " X5      nU Vs/ s H  oDR                  5       (       d  M  UPM     nnX#4$ s  snf )z
Extract OTSL tags and text parts from the input string.

Args:
    s (str): OTSL string.

Returns:
    Tuple[List[str], List[str]]: (tokens, text_parts)
(|))r  OTSL_NL	OTSL_FCEL	OTSL_ECEL	OTSL_LCEL	OTSL_UCEL	OTSL_XCELr  findallsplitr5  )rQ  r"  tokens
text_partsr  s        r   otsl_extract_tokens_and_textr`    sz     	
))WiIy)T
U	V
	 
 ZZ#F'%J%/AZE;;=%ZJA Bs   &BBc                   ^ [         m[        R                  " UU4S j5       VVs/ s H  u  p#U(       a  M  [        U5      PM     nnn/ nSnSnU(       Ga%  [	        S U 5       5      nU H:  n	[        U	5      U:  d  M  U	R                  [        5        [        U	5      U:  a  M&  M<     / n
SnU H  n	U	 H  nU
R                  U5        U[        U 5      :  d  M%  X   U:X  d  M/  US-  nU[        U 5      :  d  ME  X   [         [        [        [        [        [        4;  d  Mm  U
R                  X   5        US-  nM     U
R                  [         5        U[        U 5      :  d  M  X   [         :X  d  M  US-  nM     U
n S nS n[        U 5       GH=  u  nnSnU[        [        4;   a  SnSnSnU[        :w  a	  XS-      nSnUU-   [        U 5      :  a  XU-      OSnSnUS-   [        U5      :  a  U[        XFS-      5      :  a
  XFS-      U   nU[        [        4;   a  UU" XGS-   U[        [        /5      -  nU[        [        4;   a  UU" XGUS-   [        [        /5      -  nUR                  [        UR                  5       UUUUU-   UUU-   S	95        U[        [        [        [        [        4;   a  US-  nU[         :X  d  GM6  US-  nSnGM@     XT4$ s  snnf )
z
Parse OTSL text and tags into TableCell objects and tag structure.

Args:
    texts (List[str]): List of tokens and text.
    tokens (List[str]): List of OTSL tags.

Returns:
    Tuple[List[TableCell], List[List[str]]]: (table_cells, split_row_tokens)
c                    > U T:H  $ r   r   )z
split_words    r   <lambda>"otsl_parse_texts.<locals>.<lambda>  s	    Zr   r   c              3   8   #    U  H  n[        U5      v   M     g 7fr   rA   r   rows     r   r   #otsl_parse_texts.<locals>.<genexpr>  s     <+;Cs3xx+;   r6   c                 x    SnUnX   U   U;   a*  US-  nUS-  nU[        X   5      :  a  U$ X   U   U;   a  M*  U$ Nr   r6   rh  )r^  c_idxr_idxwhich_tokensspan
c_idx_iters         r   count_right%otsl_parse_texts.<locals>.count_right  sY    
mJ'<7!OJAIDS//	 mJ'<7
 r   c                 t    SnUnX   U   U;   a(  US-  nUS-  nU[        U 5      :  a  U$ X   U   U;   a  M(  U$ rn  rh  )r^  ro  rp  rq  rr  
r_idx_iters         r   
count_down$otsl_parse_texts.<locals>.count_down  sX    
 '<7!OJAIDS[(	  '<7
 r   r	  r   )r   r&  r'  r(  r)  r*  r+  )rV  	itertoolsgroupbylistr'   rA   r   rX  rW  rY  rZ  r[  rC   r%  r5  )textsr^  xysplit_row_tokensrF  rp  ro  max_colsrj  	new_textstext_idxr  rt  rx  rG   r   	cell_textr&  r'  right_offsetnext_right_cellnext_bottom_cellrd  s                          @r   otsl_parse_textsr    s    J %%f.GHHDA 	QH  
 KEE <+;<<#Cc(X%

9% c(X% $ 	#C  'c%j(U_-EMH#e*,!!!!!I 2 "((9 A  W%#e*$G)CA# $$  U#4	Iy))HHLy !a%L	  ,-|+;c%j+H,&'b   "qy3/003/	:;;'7	'B5'I$9i"88K$aiI8N   Iy#99J$UQYI8N  "*%%).',x'7).',x'7
 Iy)Y	JJQJE7?QJEEU $V ((Is
   KKrK  c           
      V   U R                   nU R                  n[        U R                  5      S:X  a  gSnU R                  n[        U5       H  nUS-  n[        U5       H  nXE   U   nUR                  UR                  pUR                  UR                  pX:w  d  X:w  a  MD  [        R                  " UR                  R                  5       5      nUR                  (       a  SOSnU nUS:  a	  USU S3-  nU
S:  a	  US	U
 S3-  nUS
U SU SU S3-  nM     US-  nM     SU S3nU$ )zv
Export TableData to HTML table.

Args:
    table_data (TableData): TableData object.

Returns:
    str: HTML string.
r   r	  z<tr>thtdr6   z
 rowspan=""z
 colspan="<>z</z</tr>z<table>z</table>)rG  rH  rA   rF  rM  r@   r&  r(  r'  r*  htmlescaper   r5  r,  )rK  nrowsncolsbodyrM  rG   rN   rL  rowspanrowstartcolspancolstartr  celltagopening_tags                  r   export_to_htmlr  $  s>    EE
:!!"a'D??D5\uA"gajD!%0I0IX!%0I0IX}kk$))//"34G"00ddG$IK{G9A66{G9A66a}AgYb	;;D  	! " TF(#DKr   otsl_strc                   ^ [        U [        5      (       d   eU R                  5       n [        U ;  a	  U [        -   $ U R	                  [        5      n/ nU H  nU(       d  M  [
        R                  U5      nU(       d  M*  [        U5      nSn[        U5       H&  u  pxUR                  [        5      (       d  M!  US-   nM(     UR                  XEUS.5        M     U(       d  [        $ U(       a  [        S U 5       5      OSn	U(       a  [        S U 5       5      OSn
U	n[        X5      n[        S5      nUn[        XS-   5       H#  m[        U4S jU 5       5      nX:  d  M  UnTnM%     / nU HQ  nUS   n[        U5      nUU:  a  US	U nO[         /UU-
  -  nUU-   nUR                  S
R#                  U5      5        MS     [        R#                  U5      [        -   $ )z
Pad OTSL string to a square (rectangular) format, ensuring each row has equal number of cells.

Args:
    otsl_str (str): OTSL string.

Returns:
    str: Padded OTSL string.
r   r6   )	raw_cells	total_lenmin_lenc              3   *   #    U  H	  oS    v   M     g7f)r  Nr   ri  s     r   r   %otsl_pad_to_sqr_v2.<locals>.<genexpr>i  s     >Xcy>X   c              3   *   #    U  H	  oS    v   M     g7fr  Nr   ri  s     r   r   r  j  s     =HSK(Hr  r   c              3   F   >#    U  H  n[        US    T-
  5      v   M     g7fr  rz   )r   rj  ra   s     r   r   r  q  s#      S(3S%5%=!>!>(s   !r  Nr	  )rW   rn   r5  rV  r]  OTSL_FIND_PATTERNr\  rA   rC   
startswithrW  r   r'   r   r@   rc   rX  r  )r  linesrow_dataliner  r  r  rG   cell_strglobal_min_widthmax_total_lensearch_start
search_endmin_total_costoptimal_widthcurrent_total_costrepaired_linesrj  cellscurrent_len	new_cellspaddingra   s                         @r   otsl_pad_to_sqr_v2r  I  s    h$$$$~~Hh'!!NN7#EH%--d3		N	$Y/KA""9--a% 0 	#P	
  BJs>X>>PQAIC=H==qM#L%5J5\NM|!^4  S( SS./N!M	 5 NK %j&n}-I k][%@AGIbggi01  <<''11r   otsl_contentc                     [        U 5      n [        U 5      u  p[        X!5      u  p4[        [	        U5      U(       a  [        S U 5       5      OSUS9n[        U5      $ )z
Convert OTSL-v1.0 string to HTML. Only 6 tags allowed: <fcel>, <ecel>, <nl>, <lcel>, <ucel>, <xcel>.

Args:
    otsl_content (str): OTSL string.

Returns:
    str: HTML table.
c              3   8   #    U  H  n[        U5      v   M     g 7fr   rh  ri  s     r   r   'convert_otsl_to_html.<locals>.<genexpr>  s     ;*:3c#hh*:rl  r   )rG  rH  rF  )r  r`  r  rE  rA   r'   r  )r  r^  mixed_textsrF  r  rK  s         r   convert_otsl_to_htmlr    s_     &l3L6|DF$4[$I!K%&?O#;*:;;UVJ
 *%%r   c                     [        U 5      n[        SUS-  S-   5       H!  nX-  S:X  d  M  U SU nX1U-  -  U :X  d  M  Us  $    g)z
Find the shortest substring that repeats to form the entire string.

Args:
    s (str): Input string.

Returns:
    str or None: Shortest repeating substring, or None if not found.
r6   r   r   N)rA   r@   )rQ  nrG   	substrings       r   !find_shortest_repeating_substringr    sT     	AA1a1fqj!5A:"1IF#q(  	 "
 r   r  min_repeatsc                 0   [        [        U 5      U-  US-
  S5       Hv  nX* S nU R                  XB-  5      (       d  M"  SnU nUR                  U5      (       a#  USU*  nUS-  nUR                  U5      (       a  M#  [        U 5      XS-  -
  nU SU XE4s  $    g)a  
Detect if string ends with a repeating phrase.

Args:
    s (str): Input string.
    min_len (int): Minimum length of unit.
    min_repeats (int): Minimum repeat count.

Returns:
    Tuple[str, str, int] or None: (prefix, unit, count) if found, else None.
r6   r   Nr   )r@   rA   endswith)rQ  r  r  rG   unitcounttemp_sstart_indexs           r   find_repeating_suffixr    s     3q6k*GaK<v::d())EF//$''!
 //$'' a&EI.K\k?D// = r   r  line_thresholdchar_threshold	min_countc                    [        U 5      U:  a  U $ U R                  5       nU(       d  U $ SU;  aF  [        U5      S:  a7  [        USSS9nU(       a%  Uu  pxn	[        U5      U	-  [        U5      S-  :  a  U$ SU;  a?  [        U5      U:  a0  [        U5      nU(       a  [        U5      [        U5      -  n	X:  a  U$ U R	                  S5       V
s/ s H)  oR                  5       (       d  M  U
R                  5       PM+     nn
U(       d  U $ [        U5      nX:  a  U $ [        U5      nUR                  S5      S   u  pX:  a
  X-  S	:  a  U$ U $ s  sn
f )
as  
Detect and truncate character-level, phrase-level, or line-level repetition in content.

Args:
    content (str): Input text.
    line_threshold (int): Min lines for line-level truncation.
    char_threshold (int): Min repeats for char-level truncation.
    min_len (int): Min length for char-level check.

Returns:
    Union[str, str]: (truncated_content, info_string)
r
  d      ry   )r  r  r8   r6   r   g?)rA   r5  r  r  r]  r   most_common)r  r  r  r  r  stripped_contentsuffix_matchprefixrepeating_unitr  r  r  total_linesline_countsmost_common_lines                  r   truncate_repetitive_contentr    sU   & 7|i}} ##,<(=(C,-=qVWX,8)FE>"U*S1A-BS-HH ##,<(=(G:;KL()S-@@E&%% '.mmD&9J&9dZZ\\TZZ\&9EJe*K#%.K)55a8;E$7C#?N Ks   E+Ec                 f   SS K n[        U R                  5      S:X  a  UR                  XR                  5      nOU R                  5       nUR                  [        R                  :w  a  UR                  [        R                  5      nUR                  5       nUR                  5       nX4:X  a  U $ X$-
  X4-
  -  S-  nUR                  [        R                  5      nUR                  USSUR                  5      u  pgUR                  U5      nUc  U $ UR                  U5      u  ppX
X-   2XU-   24   nU$ )Nr   r   rk      )r   rA   shapecvtColorCOLOR_BGR2GRAYr   dtyper]   uint8astyper'   r&   	thresholdTHRESH_BINARY_INVfindNonZeroboundingRect)rZ   r   graymax_valmin_valr0  r   binarycoordsr~  r  rL   rM   croppeds                 r   crop_marginr    s   
399~||C!3!34xxzzzRXX{{288$hhjGhhjG
Nw01C7D;;rxx DdCc.C.CDIA__V$F~
!!&)JA!ae)QQY&'GNr   z#<\|TEXT_START\|>(.*?)<\|TEXT_END\|>z!<\|LOC_BEGIN\|>(.*?)<\|LOC_END\|>z<\|LOC_(\d+)\|>r=   c                 "   U R                   SS u  pUS:  aU  US:  aO  [        U 5      n US-  US-  pC [        R                  R                  nU R                  X44U5      n [        U 5      nU$ U nU$ ! [
         a    [        R                  n N@f = f)z?
Post-process the input image to extract location information.
Nr   i  )r  r[   r
   
ResamplingLANCZOSAttributeErrorresizer_   )r=   rM   rL   	process_w	process_hresample_filterinference_imgs          r   pre_process_for_spottingr  %  s     ;;r?DA4xAHU# 1ua!e9	,#..66O i3_E#E*    	,#mmO	,s   A1 1BB	input_strrL   rM   c           	      <   [        U [        5      (       d   e[        R                  U 5      n[        R                  U 5      n/ n/ n[        [        U5      [        U5      5      n[        U5       H  nX8   R                  5       n	[        R                  XH   5      n
[        U
5      S:  a  M=  [        [        [        U
SS 5      5      n[        SSS5       Vs/ s H  oU   XS-      4PM     nnU Vs/ s H  oS   S-  U-  US   S-  U-  4PM     nnUR                  U5        UR                  U	5        M     U(       a  U(       Gd$  [        [        R                  U 5      5      nSnSnUS-   [        U5      :  a  XUS-    nU Vs/ s H  n[        UR!                  S5      5      PM     nn[        SSS5       Vs/ s H  oU   XS-      4PM     nnU Vs/ s H  oS   S-  U-  US   S-  U-  4PM     nnU UUS   R#                  5        nUR                  5       n	UR                  U	5        UR                  U5        US   R%                  5       nUS-  nUS-   [        U5      :  a  M  S	R'                  U5      nXVS
.nUU4$ s  snf s  snf s  snf s  snf s  snf )zD
Post-process the input string to extract text and location blocks.
r  Nr   r   r6   g     @@   r   r  )	rec_polys	rec_texts)rW   rn   ANNOT_TEXT_REr\  LOC_BLOCK_REr&   rA   r@   r5  LOC_ITEM_REr|  mapr   r   LOC_TOKEN_REfinditerr  r   r   r  )r  rL   rM   r}  
loc_blocksr  r  r  rG   txt	loc_itemsvalsrN   ptspmatcheslast_endr  m	text_span
result_strspotting_ress                         r   post_process_for_spottingr  :  s    i%%%% !!),E%%i0JII 	CJJ(A1Xhnn''
6	y>ACYr]+,/4Q1~>~!Q!e%~>?BCs!!v!1Q4&=1#45sC  I|,,Y78!ec'l"A&E-23UC
OUD338Aq>B>aGTa%[)>CBCFG3aaD6MA%qtf}q'893CG!(U1X^^-=>I//#CS!S!Ry}}HFA !ec'l" Y'J!*CL|##5 ?C 4BGs   J2J
$J<JJ)r   )rl   auto)r  )r  ry   )r   r   r   i  )Er  rz  r   r  collectionsr   r   r   typingr   r   r   r   r	   numpyr]   PILr
   pydanticr   r   r   layout_parsing.utilsr   r   r   r   r   rn   r   r-   rU   r[   r_   rg   rv   r   r   r  r#  r%  rE  rV  rW  rX  rY  rZ  r[  NON_CAPTURING_TAG_GROUPcompileDOTALLr  r`  r  r  r  r  r  r  r  r  Sr  r  r  r   ndarrayr  r  r   r   r   <module>r     s      	   0 0   ? ?  #25c?##25c?##2 #2 	#2L@#d4j)@#>A@#	#tDz/@#F "&R]@;|94x!0H3	 3l.	 .d 					G JJv&=%>cB")) 
C *p)f"y "J72 72 72t&s &* sDy1A ( 23
+.
5c3%&: 666 6 	6
 6 	6r@ 

A244Hzz>Ejj+,zz,-BJJ 4T	? *2$2$2$"2$
3S$Y 2$r   