
    {-j                        d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZmZmZ ddlmZmZ d	 Z	 dMdeeeef                  deeeef                  dedefdZdej        deej        ej        f         fdZde
eee
         f         dede
eee
         f         fdZ d Z!d Z"d Z#dNdZ$dOdZ%d Z&d Z'd Z( G d d e          Z) G d! d"e          Z*d#Z+d$Z,d%Z-d&Z.d'Z/d(Z0d)Z1 ej2        e1 d*e1 d+ej3        ,          Z4d-efd.Z5d/ Z6d0e*fd1Z7d2edefd3Z8d4efd5Z9d-edeedf         fd6Z:	 dPd-ed9ed:edeeeeef         df         fd;Z;	 	 	 	 dQd>ed?ed@ed9edAedefdBZ<dC Z= ej2        dDej>                  Z? ej2        dEej>                  Z@ ej2        dF          ZA ej2        dF          ZBdGej        de
eef         fdHZCdIedJedKedeee
eef         f         fdLZDdS )R    N)Counter)deepcopy)AnyDictListTupleUnion)Image)	BaseModelcomputed_fieldmodel_validator   )calculate_overlap_ratio"calculate_projection_overlap_ratioc                 >    | j         s|                     d          } | S )Nr   )is_validbuffer)polys    n/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddlex/inference/pipelines/paddleocr_vl/uilts.py
make_validr   !   s     = {{1~~K    unionpolygon1polygon2modereturnc                    	 ddl m} n# t          $ r t          d          w xY w ||           } ||          }t          |          }t          |          }|                    |          j        }|                    |          j        }|dk    r||z  S |dk    rt          |j        |j                  }||z  S |dk    rt          |j        |j                  }	||	z  S t          d|           )a  
    Calculate the overlap ratio between two polygons.

    Args:
        polygon1 (List[Tuple[int, int]]): First polygon represented as a list of points.
        polygon2 (List[Tuple[int, int]]): Second polygon represented as a list of points.
        mode (str, optional): Overlap calculation mode. Defaults to "union".

    Returns:
        float: Overlap ratio value between 0 and 1.
    r   )PolygonzPlease install Shapely library.r   smalllargezUnknown mode: )
shapely.geometryr   ImportErrorr   intersectionarear   minmax
ValueError)
r   r   r   r   poly1poly2r#   r   
small_area
large_areas
             r   calculate_polygon_overlap_ratior,   '   s!    =,,,,,,, = = =;<<<=GHEGHEuEuE%%e,,1LKK#Ewe##	UZ00
j((	UZ00
j((0$00111s   	 #coordsc                 >   | dddf         }| dddf         }| dddf         }| dddf         }t          j        ||z
  ||z
  z            }t          j        |dddf         |dddf                   }t          j        |dddf         |dddf                   }t          j        |dddf         |dddf                   }t          j        |dddf         |dddf                   }	t          j        d||z
            }
t          j        d|	|z
            }|
|z  }t          j        |dddf         |dddf                   }t          j        dd          5  t          j        |dk    ||z  d          }ddd           n# 1 swxY w Y   ||fS )	z
    Compute pairwise overlap ratio (mode="small") and bbox areas.

    This only replaces the numeric overlap/area calculation. The caller keeps
    the original sequential dropping order because it affects inline formulas.
    Nr      r      ignore)divideinvalidg        )npabsmaximumminimumerrstatewhere)r-   x1y1x2y2areasinter_x1inter_y1inter_x2inter_y2inter_winter_h
inter_arear*   overlaps                  r   _compute_pairwise_overlap_smallrG   M   s    
1B	1B	1B	1BFBGR())Ez"QQQW+r$'{33Hz"QQQW+r$'{33Hz"QQQW+r$'{33Hz"QQQW+r$'{33HjHx/00GjHx/00G7"JE!!!T'NE$'N;;J	Hh	7	7	7 I I(:>:
+BCHHI I I I I I I I I I I I I I IE>s   &FFFlayout_det_reslayout_shape_modec                    t          |           }d |d         D             }|s||d<   |S t          j        d |D             t          j                  }|dddf         |dddf         z
  }|dddf         |ddd	f         z
  }t	          |          \  }}t                      t          t          |                    D ]}	||	         d
k     s||	         d
k     r                    |	           t          |	d	z   t          |                    D ]W}
|	v s|
v r||	|
f         }||	         d         dk    s||
         d         dk    rU|dk    rO||	         d         dk    r                    |	           ||
         d         dk    r                    |
           |dk    r|dk    r:d||	         v r0t          ||	         d         ||
         d         d          }|dk     r||	         d         ||
         d         h}|h dz  r!t          |          d	k    rd|vs|h dk    r||	         ||
         k    r                    |
           B                    |	           Yfdt          |          D             |d<   |S )a.  
    Remove overlapping boxes from layout detection results based on a given overlap ratio.

    Args:
        layout_det_res (Dict[str, List[Dict]]): Layout detection result dict containing a 'boxes' list.

    Returns:
        Dict[str, List[Dict]]: Filtered dict with overlapping boxes removed.
    c                 *    g | ]}|d          dk    |S )label	reference .0boxs     r   
<listcomp>z(filter_overlap_boxes.<locals>.<listcomp>y   s-       3w<;;V;V;V;V;Vr   boxesc                     g | ]
}|d          S )
coordinaterN   rO   s     r   rR   z(filter_overlap_boxes.<locals>.<listcomp>   s    :::Ss<(:::r   dtypeNr   r   r0   r/      rL   inline_formula      ?gffffff?rectpolygon_pointsr   >   sealchartimagetabler`   c                 "    g | ]\  }}|v	|S rN   rN   )rP   idxrQ   dropped_indexess      r   rR   z(filter_overlap_boxes.<locals>.<listcomp>   s.     ( ( (S?0J0J0J0J0Jr   )r   r4   arrayfloat64rG   setrangelenaddr,   	enumerate)rH   rI   layout_det_res_filteredrS   r-   widthsheightsoverlap_matrixr>   ijoverlap_ratiopoly_overlap_ratiolabelsrc   s                 @r   filter_overlap_boxesrt   l   sA    '~66 .w7  E  '+0(&&X::E:::"*MMMFAAAqD\F111a4L(FQQQTlVAAAqD\)G;FCCNEeeO3u:: $+ $+!9q==GAJNN"""q1uc%jj)) !	+ !	+AO##qO';';*1a40Ma!%5558G$(888 3&&Qx(,<<<'++A...Qx(,<<<'++A...s""$..3CuQx3O3O)Ha!12E!H=M4NPW* *& *C// (7+U1Xg->????? !CKKRSOOf,, ; ; ; 1 1 !8uQx''#''****#''***C!	+D( ( ( (%e,,( ( (G$ #"r   c                 b    t          | t          j                  r| S t          j        |           S )z
    Convert the input to a PIL Image.

    Args:
        img (PIL.Image or numpy.ndarray): Input image.

    Returns:
        PIL.Image: PIL Image object.
    )
isinstancer
   	fromarrayimgs    r   to_pil_imagerz      s-     #u{## 
?3r   c                 b    t          | t          j                  rt          j        |           S | S )z
    Convert the input to a numpy array.

    Args:
        img (PIL.Image or numpy.ndarray): Input image.

    Returns:
        numpy.ndarray: Numpy array image.
    )rv   r
   r4   rd   rx   s    r   to_np_arrayr|      s+     #u{## x}}Jr   c                 v    d | D             }d | D             }t          |          }t          |          }||fS )z
    Calculate width (max of all) and height (sum) for a vertical merge of images.

    Args:
        images (List[PIL.Image or np.ndarray]): List of images.

    Returns:
        Tuple[int, int]: (width, height) of merged image.
    c                 6    g | ]}t          |          j        S rN   )rz   widthrP   ry   s     r   rR   z"calc_merged_wh.<locals>.<listcomp>   s#    888#l3%888r   c                 6    g | ]}t          |          j        S rN   )rz   heightr   s     r   rR   z"calc_merged_wh.<locals>.<listcomp>   s#    :::C|C  ':::r   )r&   sum)imagesrl   rm   whs        r   calc_merged_whr      sI     98888F::6:::GFAGAa4Kr   centerautoc                    | sdS t          |           dk    rt          | d                   S t          |t                    r|gt          |           dz
  z  }t          |          t          |           dz
  k    rt	          d          d | D             }dgt          |          z  }|d         j        }t          dt          |                    D ]}||         j        }t          ||          }||dz
           }	|	dk    r||z
  dz  }
||z
  dz  }n|	dk    r||z
  }
||z
  }nd}
d}t          |          D ]}||xx         |
z  cc<   |||<   |}t          d	 |D                       }t          j
        d
||fd          }d}t          |          D ]-\  }}|                    |||         |f           ||j        z  }.t          |          S )a+  
    Merge images vertically with given alignment.

    Args:
        images (List[PIL.Image or np.ndarray]): List of images to merge.
        aligns (str or List[str]): Alignment(s) for each merge step ('center', 'right', 'left').

    Returns:
        np.ndarray: Merged image as numpy array.
    Nr/   r   z,The length of aligns must be len(images) - 1c                 ,    g | ]}t          |          S rN   )rz   r   s     r   rR   z merge_images.<locals>.<listcomp>   s     666,s##666r   r   r   rightc              3   $   K   | ]}|j         V  d S N)r   r   s     r   	<genexpr>zmerge_images.<locals>.<genexpr>  s$      33#*333333r   RGB   r   r   )rh   r|   rv   strr'   r   rg   r&   r   r
   newrj   paster   )r   alignsrI   
pil_images	x_offsetsmerged_wro   img2_wstep_walignr:   r<   ktotal_hcanvasy_offsetry   s                    r   merge_imagesr      s$     t
6{{a6!9%%%&# .S[[1_-
6{{c&kkAo%%GHHH 76v666J c*oo%I!}"H1c*oo&&  A$Xv&&q1uH8#)B6/a'BBg("B&BBBB q 	 	AaLLLBLLLL	! 33
33333GYux1?CCFHJ''  3S9Q<2333CJvr   c                 >
   )* g }i }t                     D ],\  }}|d         v r|||<   |                    ||f           -g }g }g }	g }
d ))fd}fd}t          |          D ]\  }\  }}|s	|g}|g}	g }
||dz
           \  }}|d         }|d         }|d         }|d         }t          ||d          }|dk    os|d	k    om||k    og|d         |d
         k    oU|d         |d         k     oC|d         |d
         z
  t          |d
         |d         z
  |d
         |d         z
            dz  k     }|dk    o|dv o||k    o|d         |d         k    ot	          |d         |d         z
            t          |d         |d         z
  |d         |d         z
            dz  k     o= )|d         |d                    )|d
         |d
                   z  o |||           }|rd}n|r |||          }nd}|s|rA|                    |           |	                    |           |
                    |           |                    |	|
f           |g}|g}	g }
|r|                    |	|
f           g }|D ]<\  }}t          |          t          |          }}|                    ||||f           =g }t                      *d}|t                     k     r:d}|D ]\  }}}}||k    rt          *fd|D                       rd} fd|D             }|r|ng } t          |          \  }!}"|!dk    r|"|!z  nt          d          }#|#dk    rpt          |          D ]_\  }$}% |%                                         } |%         d         |d<   d|d<   |                    |           *                    |%           `nt          || |          }&t          |          D ]n\  }$}% |%                                         }|$dk    r|&nd|d<   |$dk    r| nd|d<   |d         |d<   |                    |           *                    |%           og }'t          |dz   |          D ]}(|(|v r|'                    |(           |'D ]2}(|                    ||(                    *                    |(           3|dz   } n|r||v r4|*vr0|                    ||                    *                    |           |dz  }|t                     k     :|S )aK  
    Merge blocks based on alignment and overlap logic, except for those with labels in non_merge_labels.

    Args:
        blocks (List[Dict]): List of block dicts.
        non_merge_labels (List[str]): Block labels that should not be merged.

    Returns:
        List[Dict]: List of processed (and possibly merged) blocks.
    rL   c                 .    t          | |z
            dk    S )N   r5   )a1a2s     r   
is_alignedz merge_blocks.<locals>.is_aligned4  s    27||q  r   c                 p     | d         |d                   rdS  | d         |d                   rdS dS )Nr   leftr   r   r   rN   )
block_bbox	prev_bboxr   s     r   get_alignmentz#merge_blocks.<locals>.get_alignment7  sJ    :jmYq\22 	6Z
1y|44 	78r   c                    ||         d         }||          d         }t          |d         |d                   }t          |d         |d                   }t          |d         |d                   }t          |d         |d                   }||||g}	t          |          D ]5\  }
}|
| |fv s
|d         vr|d         }t          |	|          dk    r dS 6dS )	NrQ   r   r/   r   r0   rL   TF)r%   r&   rj   r   )	block_idxprev_idxblocksr   r   r:   r;   r<   r=   min_boxrb   other_block
other_bboxnon_merge_labelss                r   overlapwith_other_boxz+merge_blocks.<locals>.overlapwith_other_box?  s    8$U+	I&u-
1z!}--1z!}--1z!}--1z!}--r2r" )& 1 1 	 	C	8,,,w'/???$U+J&w
;;a??tt @ur   r/   rQ   
horizontalr   textr   r0   g333333?)r   rZ   r   NFc              3       K   | ]}|vV  	d S r   rN   )rP   ro   used_indicess     r   r   zmerge_blocks.<locals>.<genexpr>  s(      #Q#QaA\$9#Q#Q#Q#Q#Q#Qr   Tc                 ,    g | ]}|         d          S rx   rN   )rP   ro   r   s     r   rR   z merge_blocks.<locals>.<listcomp>  s"    @@@Qq	%(@@@r   infry   merge_alignsgroup_id)rj   appendr   r&   r5   r%   rf   rh   allr   floatcopyri   r   rg   )+r   r   rI   blocks_to_mergenon_merge_blocksrb   blockmerged_groupscurrent_groupcurrent_indicescurrent_alignsr   r   ro   r   
prev_blockr   
prev_labelr   block_labeliou_his_crossis_updown_align
align_modegroup_rangesgroup_indicesr   startendresult_blocksgroup_foundimgsr   r   r   aspect_ratiorp   r   
merged_imginsert_listn_idxr   r   s+   ``                                       @@r   merge_blocksr     s    O'' 1 1
U>---$)S!!""C<0000MMON! ! !        & %_55 3  3 <C 	"GM"eON.q1u5*u%	(
5\
Gn2:y,WWQJ Tv%Tz)T 1	!,T 1	!,	T
 1	!,)A,1-z!}z!}/LMMPSST 	 AI 
=x'
=z)
= 11-
= JqMIaL011)A,1-z!}z!}/LMMPSST	
= 
:a=)A,77*Z]IaL99:
= &%c8V<< 	  	!JJ 	&z9==JJJ 	  	   '''""3'''!!*----  />!BCCC"GM"eONN @o~>???L!. A Av'']););sUC?@@@@M55L
C
F

1= 	 	-E3ve||#Q#Q#Q#Q=#Q#Q#Q Q Q|"@@@@-@@@)/7vvR%d++1()Qq1uuE%LL1$$(1-(@(@ 4 49 &y 1 6 6 8 8'-i'8'?e04n-%,,U333$((33334 ".dLBS!T!TJ(1-(@(@ 4 49 &y 1 6 6 8 856!VVzze@AQDn-,9!,<j)%,,U333$((3333 "519c22 2 2E 000#**5111( , ,E!(()9%)@AAA $$U++++Ag 	"""s,'>'>  !1#!6777S!!!qO F

P r   c           
         ddl dfd	}d |D             \  }}}}||z
  }||z
  }	                    | ||f||fdd           j        }
d	} |||
t          ||	          d
          \  }}}t	          dt          j        ||z                      }|||z
  dz  z   }||	|z   dz  z   }                    | |||f|
|d|j                   | S )a  
    Fill a rectangular area in the image with a white background and write the given token string.
    Paints directly on the provided image (in-place).

    Args:
        image (np.ndarray): Image to paint on (modified in-place).
        box (tuple): (x1, y1, x2, y2) coordinates of rectangle.
        token_str (str): Token string to write.

    Returns:
        np.ndarray: The same image (modified in-place).
    r   N?c                     d\  }}|}||z
  dk    rI||z   dz  }                     | ||d          \  \  }}	}
|||z  k     r|	||z  k     r|}|}n|}||z
  dk    I|||	fS )N)g?
   g{Gz?r   r/   )	thickness)getTextSize)r   fontFacesquare_size
fill_ratior   r   optimal_scalemidr   r   _cv2s              r   get_optimal_font_scalez+paint_token.<locals>.get_optimal_font_scale  s     edlT!!%<1$ChqIIIFQA;+++K*4L0L0L # dlT!! a""r   c                 ,    g | ]}t          |          S rN   )int)rP   vs     r   rR   zpaint_token.<locals>.<listcomp>  s    ***c!ff***r   r   )colorr      )r   r/   r   )r   r   r   )lineType)r   )	r   	rectangleFONT_HERSHEY_SIMPLEXr%   r&   mathfloorputTextLINE_AA)r_   rQ   	token_strr   r:   r;   r<   r=   box_wbox_hfontthickness_scale_ratio
font_scaletext_wtext_hfont_thicknesstext_xtext_yr   s                     @r   paint_tokenr    sJ    JJJ# # # # # #  +*c***NBBGEGEMM%"bB8?bMQQQ #D!7!74UE**s" " "J DJz4I'IJJKKN 56>a''F56>a''FKK	  	 	 	 Lr   c                 z   d }ddl }|                    d           i }|\  }}}}	g  |t          |                    }
|                    |
           |                                 } t          |          D ]\  }}|d         \  }}}}||k    r||k    r||k    r||	k    ry                    |           t          ||z
  ||z
            dk     rZ||z
  ||z
  ||z
  ||z
  g}dt          |
|                   z   dz   }t          | ||          } |d	         ||<   fd
t          |          D             }| ||fS )a  
    Replace figures in a table area with tokens, return new image and token map.

    Args:
        table_block_img (np.ndarray): Table image.
        table_box (list): Table bounding box [x_min, y_min, x_max, y_max].
        figures (List[Dict]): List of figure dicts (must contain 'coordinate', 'path').

    Returns:
        Tuple[np.ndarray, Dict[str, str], List[str]]:
            - New table image,
            - Token-to-img HTML map,
            - List of figure paths dropped.
    c                     h d}g }d}t          |          | k     rLt          t          |                    |z  s|                    |           |dz  }t          |          | k     L|S )N>   019r   r/   )rh   rf   r   r   )numexclude_digitsseqro   s       r   gen_random_mapz0tokenize_figure_of_table.<locals>.gen_random_map
  sm    (#hhnnAKK.0 

1FA #hhnn 
r   r   Ni   rU      [F]pathc                 .    g | ]\  }}|v 	|d          S )r  rN   )rP   ro   f
drop_idxess      r   rR   z,tokenize_figure_of_table.<locals>.<listcomp>4  s&    PPP$!QZAfIr   )
randomseedrh   shuffler   rj   r   r%   r   r  )table_block_img	table_boxfiguresr  r  	token_maptable_x_mintable_y_mintable_x_maxtable_y_max
random_map	figure_idfigurefigure_x_minfigure_y_minfigure_x_maxfigure_y_maxdraw_boxr   drop_figuresr  s                       @r   tokenize_figure_of_tabler(    s       MMM
KKI9B6Kk;JG--J
NN:%**,,O&w// 2 2	6AGAU>lL,K''++++++i(((<,.|0KLLrQQ{*{*{*{*	H s:i#8999C?I)/8YOOO#)&>Ii PPPP)G*<*<PPPLI|33r   c                 B    fd}d}t          j        |||           S )z
    Replace tokens in a string with their HTML image equivalents.

    Args:
        table_res_str (str): Table string with tokens.
        figure_token_map (dict): Mapping from tokens to HTML img tags.

    Returns:
        str: Untokenized string.
    c                    |                      d          }d| d}                    ||                      d                    }	                    |d           }||                      d          S g }|                    d                    |                    dd                              dd	                               d                    |          }|j        dk    r|j        }|d
|z   d
z   z  }|S )Nr/   r  r  r   z<img src="{}" alt="Image"" />z-
 
 

)groupgetr   formatreplacejoincontent)
matchtoken_idtokenimg_path	img_blockimg_tags
image_infoocr_contentfigure_token_mapimage_path_to_obj_maps
           r   replz(untokenize_figure_of_table.<locals>.replD  s    ;;q>> X   #''u{{1~~>>)--h==	;;q>>!HOO/66$$UB//77cBB   
 8,,J B&&'/f{2V;;
r   z
\[F(\d+)\])resub)table_res_strr=  r>  r?  patterns    ``  r   untokenize_figure_of_tablerD  8  s=         ( G6'4///r   c                       e Zd ZU dZdZeed<   dZeed<   eed<   eed<   eed<   eed<   eed	<   d
Z	e
ed<   d
Ze
ed<   d
Ze
ed<    ed          ededefd                        ZdS )	TableCella  
    TableCell represents a single cell in a table.

    Attributes:
        row_span (int): Number of rows spanned.
        col_span (int): Number of columns spanned.
        start_row_offset_idx (int): Start row index.
        end_row_offset_idx (int): End row index (exclusive).
        start_col_offset_idx (int): Start column index.
        end_col_offset_idx (int): End column index (exclusive).
        text (str): Cell text content.
        column_header (bool): Whether this cell is a column header.
        row_header (bool): Whether this cell is a row header.
        row_section (bool): Whether this cell is a row section.
    r/   row_spancol_spanstart_row_offset_idxend_row_offset_idxstart_col_offset_idxend_col_offset_idxr   Fcolumn_header
row_headerrow_sectionbefore)r   datar   c                    t          |t                    rud|v r|S |d                             dd          }t          |          s?|                    dd          }|r|D ]}||d         dz   z  }|                                }||d<   |S )z
        Create TableCell from dict, extracting 'text' property correctly.

        Args:
            data (Any): Input data.

        Returns:
            Any: TableCell-compatible dict.
        r   bboxr7  r+  text_cell_bboxesNr-  )rv   r   r0  rh   popstrip)clsrQ  r   
text_cellsels        r   from_dict_formatzTableCell.from_dict_formatx  s     dD!! 
	 ~~<##GR00Dt99 $!XX&8$??
 2( 2 27c 11zz||DLr   N)__name__
__module____qualname____doc__rG  r   __annotations__rH  r   rM  boolrN  rO  r   classmethodr   rZ  rN   r   r   rF  rF  \  s            HcHc
IIIM4JK_(###C C    [ $#  r   rF  c                       e Zd ZU dZg Zee         ed<   dZe	ed<   dZ
e	ed<   eedeee                  fd                        ZdS )		TableDataz
    TableData holds a table's cells, row and column counts, and provides a grid property.

    Attributes:
        table_cells (List[TableCell]): List of table cells.
        num_rows (int): Number of rows.
        num_cols (int): Number of columns.
    table_cellsr   num_rowsnum_colsr   c           	      ~     fdt           j                  D             } j        D ]}t          t          |j         j                  t          |j         j                            D ]P}t          t          |j         j                  t          |j         j                            D ]}|||         |<   Q|S )z
        Returns a 2D grid of TableCell objects for the table.

        Returns:
            List[List[TableCell]]: Table as 2D grid.
        c                 R    g | ]"fd t          j                  D             #S )c           
      D    g | ]}t          d dz   ||dz             S )r+  r/   )r   rI  rJ  rK  rL  )rF  )rP   rp   ro   s     r   rR   z-TableData.grid.<locals>.<listcomp>.<listcomp>  sQ     	 	 	  )*'(1u)*'(1u  	 	 	r   )rg   rf  )rP   ro   selfs    @r   rR   z"TableData.grid.<locals>.<listcomp>  sY     
 
 
 	 	 	 	 t}--	 	 	
 
 
r   )	rg   re  rd  r%   rI  rJ  rK  rf  rL  )rj  
table_datacellro   rp   s   `    r   gridzTableData.grid  s    
 
 
 
 4=))
 
 

 $ 		, 		,DD-t}==D+T];;  , , 14=AA/??  , ,A (,JqM!$$	,	, r   N)r[  r\  r]  r^  rd  r   rF  r_  re  r   rf  r   propertyrm  rN   r   r   rc  rc    s           $&Ki%%%HcHcd4	?+    X ^  r   rc  z<nl>z<fcel>z<ecel>z<lcel>z<ucel>z<xcel>z+(?:<fcel>|<ecel>|<nl>|<lcel>|<ucel>|<xcel>)z.*?(?=z|$))flagssc           	          dd                     t          t          t          t          t
          t          g          z   dz   }t          j        ||           }t          j	        ||           }d |D             }||fS )z
    Extract OTSL tags and text parts from the input string.

    Args:
        s (str): OTSL string.

    Returns:
        Tuple[List[str], List[str]]: (tokens, text_parts)
    (|)c                 :    g | ]}|                                 |S rN   rV  )rP   r7  s     r   rR   z0otsl_extract_tokens_and_text.<locals>.<listcomp>  s%    AAAE5;;==A%AAAr   )
r3  OTSL_NL	OTSL_FCEL	OTSL_ECEL	OTSL_LCEL	OTSL_UCEL	OTSL_XCELr@  findallsplit)rp  rC  tokens
text_partss       r   otsl_extract_tokens_and_textr    sx     	
))WiIy)T
U
U	V
	 
 Z##F'1%%JAAZAAAJ:r   c                 &   t           d t          j        |fd          D             }g }d}d}|rKt          d |D                       }|D ]B}t	          |          |k     r-|                    t                     t	          |          |k     -Cg }d}	|D ]}|D ]}
|                    |
           |	t	          |           k     rr| |	         |
k    rf|	dz  }	|	t	          |           k     rN| |	         t           t          t          t          t          t          fvr |                    | |	                    |	dz  }	|                    t                      |	t	          |           k     r| |	         t           k    r|	dz  }	|} d }d }t          |           D ]y\  }}d}|t          t          fv r'd}d}d}|t          k    r| |dz            }d	}||z   t	          |           k     r| ||z            nd}d}|dz   t	          |          k     r-|t	          ||dz                      k     r||dz            |         }|t          t          fv r | |||dz   |t          t          g          z  }|t          t          fv r | ||||dz   t          t          g          z  }|                    t          |                                |||||z   |||z   
                     |t          t          t          t          t          fv r|dz  }|t           k    r|dz  }d}{||fS )a  
    Parse OTSL text and tags into TableCell objects and tag structure.

    Args:
        texts (List[str]): List of tokens and text.
        tokens (List[str]): List of OTSL tags.

    Returns:
        Tuple[List[TableCell], List[List[str]]]: (table_cells, split_row_tokens)
    c                 6    g | ]\  }}|t          |          S rN   )list)rP   xys      r   rR   z$otsl_parse_texts.<locals>.<listcomp>  s8       AqQ  r   c                     | k    S r   rN   )z
split_words    r   <lambda>z"otsl_parse_texts.<locals>.<lambda>  s    Z r   r   c              3   4   K   | ]}t          |          V  d S r   rh   rP   rows     r   r   z#otsl_parse_texts.<locals>.<genexpr>  s(      <<Cs3xx<<<<<<r   r/   c                     d}|}| |         |         |v r5|dz  }|dz  }|t          | |                   k    r|S | |         |         |v 5|S Nr   r/   r  )r  c_idxr_idxwhich_tokensspan
c_idx_iters         r   count_rightz%otsl_parse_texts.<locals>.count_right  sk    
UmJ'<77!OJAIDS////	 UmJ'<77
 r   c                     d}|}| |         |         |v r/|dz  }|dz  }|t          |           k    r|S | |         |         |v /|S r  r  )r  r  r  r  r  
r_idx_iters         r   
count_downz$otsl_parse_texts.<locals>.count_down!  sg    
Z '<77!OJAIDS[[((	 Z '<77
 r   r+  r   )r   rG  rH  rI  rJ  rK  rL  )rw  	itertoolsgroupbyr&   rh   r   ry  rx  rz  r{  r|  rj   rF  rV  )textsr  split_row_tokensrd  r  r  max_colsr  	new_textstext_idxr7  r  r  ro   r   	cell_textrG  rH  right_offsetnext_right_cellnext_bottom_cellr  s                        @r   otsl_parse_textsr    s    J %f.G.G.G.GHH  
 KEE  <<+;<<<<<# 	& 	&Cc((X%%

9%%% c((X%%	# 	 	C & &  '''c%jj((U8_-E-EMH#e**,,x!!!!!I 2 2 "((x999 AW%%%#e**$$xG)C)CA     U## * *4	Iy)))HHLy  !!a%L	  ,-|+;c%jj+H+Ha,&''b   "qy3/00003/	:;;;;'7	'B5'I$9i"888KK$eaiI8N    Iy#999JJ$eUQYI8N   "**%%).',x'7).',x'7  
 
 
 Iy)Y	JJJQJE7??QJEE(((r   rk  c           
      
   | j         }| j        }t          | j                  dk    rdS d}| j        }t          |          D ]}|dz  }t          |          D ]}||         |         }|j        |j        }	}|j        |j	        }}
|	|k    s||k    r9t          j        |j                                                  }|j        rdnd}| }|dk    r	|d| dz  }|
dk    r	|d	|
 dz  }|d
| d| d| dz  }|dz  }d| d}|S )z
    Export TableData to HTML table.

    Args:
        table_data (TableData): TableData object.

    Returns:
        str: HTML string.
    r   r+  z<tr>thtdr/   z
 rowspan=""z
 colspan="<>z</z</tr>z<table>z</table>)re  rf  rh   rd  rm  rg   rG  rI  rH  rK  htmlescaper   rV  rM  )rk  nrowsncolsbodyrm  ro   rp   rl  rowspanrowstartcolspancolstartr4  celltagopening_tags                  r   export_to_htmlr  Y  sd    EE
:!""a''rD?D5\\  u 	< 	<A"1gajD!%0IXG!%0IXG1}}Ak$)//"3"344G"0:dddG$,K{{6G6666{{6G6666;;;g;;;;;;DD#T###DKr   otsl_strc                    t          | t                    sJ |                                 } t          | vr
| t          z   S |                     t                    }g }|D ]}|st
                              |          }|s"t          |          }d}t          |          D ]$\  }}|	                    t                    r|dz   }%|                    |||d           |st          S |rt          d |D                       nd}	|rt          d |D                       nd}
|	}t          |	|
          }t          d          }|}t          ||dz             D ]'t          fd|D                       }||k     r|}}(g }|D ]e}|d         }t          |          }||k    r|d	|         }nt           g||z
  z  }||z   }|                    d
                    |                     ft                              |          t          z   S )z
    Pad OTSL string to a square (rectangular) format, ensuring each row has equal number of cells.

    Args:
        otsl_str (str): OTSL string.

    Returns:
        str: Padded OTSL string.
    r   r/   )	raw_cells	total_lenmin_lenc              3   &   K   | ]}|d          V  dS )r  NrN   r  s     r   r   z%otsl_pad_to_sqr_v2.<locals>.<genexpr>  s&      >>c3y>>>>>>>r   c              3   &   K   | ]}|d          V  dS r  NrN   r  s     r   r   z%otsl_pad_to_sqr_v2.<locals>.<genexpr>  s'      ==SK(======r   r   c              3   H   K   | ]}t          |d          z
            V  dS r  r   )rP   r  r   s     r   r   z%otsl_pad_to_sqr_v2.<locals>.<genexpr>  s5       S S3S%5%=!>!> S S S S S Sr   r  Nr+  )rv   r   rV  rw  r~  OTSL_FIND_PATTERNr}  rh   rj   
startswithrx  r   r&   r   rg   r   ry  r3  )r  linesrow_dataliner  r  r  ro   cell_strglobal_min_widthmax_total_lensearch_start
search_endmin_total_costoptimal_widthcurrent_total_costrepaired_linesr  cellscurrent_len	new_cellspaddingr   s                         @r   otsl_pad_to_sqr_v2r  ~  sq    h$$$$$~~Hh'!!NN7##EH 
 
 	%--d33	 		NN	$Y// 	  	 KAx""9--  a%#)PP	
 	
 	
 	
  BJQs>>X>>>>>>PQAIPC==H======qM#L%}55J5\\NM|Z!^44 " "  S S S S( S S SSS../N!MN 2 2K %jj&&n}n-II k][%@AGIbggi001111<<'''11r   otsl_contentc                     t          |           } t          |           \  }}t          ||          \  }}t          t	          |          |rt          d |D                       nd|          }t          |          S )z
    Convert OTSL-v1.0 string to HTML. Only 6 tags allowed: <fcel>, <ecel>, <nl>, <lcel>, <ucel>, <xcel>.

    Args:
        otsl_content (str): OTSL string.

    Returns:
        str: HTML table.
    c              3   4   K   | ]}t          |          V  d S r   r  r  s     r   r   z'convert_otsl_to_html.<locals>.<genexpr>  s(      ;;3c#hh;;;;;;r   r   )re  rf  rd  )r  r  r  rc  rh   r&   r  )r  r  mixed_textsrd  r  rk  s         r   convert_otsl_to_htmlr    s     &l33L6|DDFK$4[&$I$I!K!%&&?OV#;;*:;;;;;;UV  J
 *%%%r   c                     t          |           }t          d|dz  dz             D ]%}||z  dk    r| d|         }|||z  z  | k    r|c S &dS )z
    Find the shortest substring that repeats to form the entire string.

    Args:
        s (str): Input string.

    Returns:
        str or None: Shortest repeating substring, or None if not found.
    r/   r   r   N)rh   rg   )rp  nro   	substrings       r   !find_shortest_repeating_substringr    sp     	AA1a1fqj!! ! !q5A::"1"IAF#q((    4r      r   r  min_repeatsc                 ^   t          t          |           |z  |dz
  d          D ]}| | d         }|                     ||z            rbd}| }|                    |          r%|d|          }|dz  }|                    |          %t          |           ||z  z
  }| d|         ||fc S dS )a!  
    Detect if string ends with a repeating phrase.

    Args:
        s (str): Input string.
        min_len (int): Minimum length of unit.
        min_repeats (int): Minimum repeat count.

    Returns:
        Tuple[str, str, int] or None: (prefix, unit, count) if found, else None.
    r/   r   Nr   )rg   rh   endswith)rp  r  r  ro   unitcounttemp_sstart_indexs           r   find_repeating_suffixr    s     3q66k*GaK<< 	0 	0!v::d[()) 	0EF//$'' !
 //$''  a&&EAI.K\k\?D%////	0 4r   r     r4  line_thresholdchar_threshold	min_countc                    t          |           |k     r| S |                                 }|s| S d|vrUt          |          dk    rBt          |dd          }|r.|\  }}}	t          |          |	z  t          |          dz  k    r|S d|vrKt          |          |k    r8t          |          }|r't          |          t          |          z  }	|	|k    r|S d |                     d          D             }
|
s| S t          |
          }||k     r| S t          |
          }|                    d          d	         \  }}	|	|k    r|	|z  d
k    r|S | S )a  
    Detect and truncate character-level, phrase-level, or line-level repetition in content.

    Args:
        content (str): Input text.
        line_threshold (int): Min lines for line-level truncation.
        char_threshold (int): Min repeats for char-level truncation.
        min_len (int): Min length for char-level check.

    Returns:
        Union[str, str]: (truncated_content, info_string)
    r,  d   r  r   )r  r  rZ   c                 ^    g | ]*}|                                 |                                 +S rN   rv  )rP   r  s     r   rR   z/truncate_repetitive_content.<locals>.<listcomp>&  s-    JJJdTZZ\\JTZZ\\JJJr   r/   r   g?)rh   rV  r  r  r~  r   most_common)r4  r  r  r  r  stripped_contentsuffix_matchprefixrepeating_unitr  r  total_linesline_countsmost_common_lines                 r   truncate_repetitive_contentr    s   & 7||i}}  ###,<(=(=(C(C,-=qVWXXX 	,8)FNE>""U*S1A-B-BS-HHH ###,<(=(=(G(G:;KLL 	&())S-@-@@E&&%% KJgmmD&9&9JJJE e**K^##%..K)55a88;eEK$7C#?#?Nr   c                 <   dd l }t          | j                  dk    r|                    | |j                  }n|                                 }|j        t          j        k    r|	                    t          j                  }t          |                                          }t          |                                          }||k    r| S t          j        dt          j                  }t          ||dz             D ] }t          ||z
  ||z
  z  dz            ||<   !|                    ||          }|                    |dd|j                  \  }}	|                    |	          }
|
| S |                    |
          \  }}}}| |||z   |||z   f         }|S )Nr   r0      rV   r/   r      )r   rh   shapecvtColorCOLOR_BGR2GRAYr   rW   r4   uint8astyper   r&   r%   zerosrg   LUT	thresholdTHRESH_BINARY_INVfindNonZeroboundingRect)ry   r   graymax_valmin_vallutr   rQ  r   binaryr-   r  r  r   r   croppeds                   r   crop_marginr
  4  s   JJJ
39~~||C!344xxzzzRX{{28$$$((**ooG$((**ooG'
 (3bh
'
'
'C7GaK(( @ @a'kg&783>??A774DdCc.CDDIAv__V$$F~
!!&))JAq!Q!a!e)QQY&'GNr   z#<\|TEXT_START\|>(.*?)<\|TEXT_END\|>z!<\|LOC_BEGIN\|>(.*?)<\|LOC_END\|>z<\|LOC_(\d+)\|>r_   c                    | j         dd         \  }}|dk     rq|dk     rkt          |           }	 t          j        j        }n# t
          $ r t          j        }Y nw xY w|                    |dz  |dz  f|          }t          |          S | S )zG
    Post-process the input image to extract location information.
    Nr   i  )r  rz   r
   
ResamplingLANCZOSAttributeErrorresizer|   )r_   r   r   pil_imgresample_filters        r   pre_process_for_spottingr  ]  s     ;rr?DAq4xxAHHu%%	,#.6OO 	, 	, 	,#mOOO	,..!a%QAA7###Ls   A AA	input_strr   r   c           	      6   t          | t                    sJ t                              |           }t                              |           }g }g }t          t          |          t          |                    }t          |          D ]}||                                         }	t                              ||                   }
t          |
          dk     rPt          t          t          |
dd                             fdt          ddd          D             }fd|D             }|                    |           |                    |	           |r|st          t                              |                     }d}d}|dz   t          |          k     r|||dz            }d |D             fd	t          ddd          D             }fd
|D             }| ||d                                                  }|                                }	|                    |	           |                    |           |d                                         }|dz  }|dz   t          |          k     d                    |          }||d}||fS )zL
    Post-process the input string to extract text and location blocks.
    r  Nc                 6    g | ]}|         |d z            fS r/   rN   rP   rp   valss     r   rR   z-post_process_for_spotting.<locals>.<listcomp>  s*    >>>!Qa!e%>>>r   r   r   c                 H    g | ]}|d          dz  z  |d         dz  z  fS r   g     @@r/   rN   rP   pr   r   s     r   rR   z-post_process_for_spotting.<locals>.<listcomp>  s8    CCC!!v!1Q4&=1#45CCCr      c                 R    g | ]$}t          |                    d                     %S r  )r   r/  )rP   ms     r   rR   z-post_process_for_spotting.<locals>.<listcomp>  s(    333C

OO333r   c                 6    g | ]}|         |d z            fS r  rN   r  s     r   rR   z-post_process_for_spotting.<locals>.<listcomp>  s*    BBBaDGT!a%[)BBBr   c                 H    g | ]}|d          dz  z  |d         dz  z  fS r  rN   r  s     r   rR   z-post_process_for_spotting.<locals>.<listcomp>  s8    GGGaAaD6MA%qtf}q'89GGGr   r   r.  )	rec_polys	rec_texts)rv   r   ANNOT_TEXT_REr}  LOC_BLOCK_REr%   rh   rg   rV  LOC_ITEM_REr  mapr   r   LOC_TOKEN_REfinditerr   r   r3  )r  r   r   r  
loc_blocksr"  r#  r  ro   txt	loc_itemsptsmatcheslast_endr/  	text_span
result_strspotting_resr  s    ``               @r   post_process_for_spottingr3  n  s    i%%%%% !!),,E%%i00JII 	CJJJ((A1XX 
 
Ahnn''
166	y>>ACYrr]++,,>>>>uQ1~~>>>CCCCCsCCC  I |,,Y7788!ec'll""AAI&E33U333DBBBB5Aq>>BBBCGGGGG3GGGC!(U1X^^-=-="=>I//##CS!!!S!!!Ry}}HFA !ec'll"" Y''J!*CCL|##r   )r   )r   r   )r   )r  r   )r   r   r   r  )Er  r  r   r@  collectionsr   r   r   typingr   r   r   r   r	   numpyr4   PILr
   pydanticr   r   r   layout_parsing.utilsr   r   r   r   r   r   r,   ndarrayrG   rt   rz   r|   r   r   r   r  r(  rD  rF  rc  rw  rx  ry  rz  r{  r|  NON_CAPTURING_TAG_GROUPcompileDOTALLr  r  r  r  r  r  r  r  r  r
  Sr$  r%  r&  r(  r  r3  rN   r   r   <module>r?     sK         				             0 0 0 0 0 0 0 0 0 0 0 0 0 0           ? ? ? ? ? ? ? ? ? ?          #2 #25c?##25c?##2 #2 	#2 #2 #2 #2LJ
2:rz!"   >B#d4j)B#>AB#	#tDz/B# B# B# B#J         "9 9 9 9x] ] ] ]@; ; ;|;4 ;4 ;4|!0 !0 !0H3 3 3 3 3	 3 3 3l. . . . .	 . . .d 					G BJBB&=BBB")   
C    *p) p) p)f"y " " " "J72 72 72 72 72 72t&s & & & &* sDy1A    ( 23 
+.
5c3%&   : 6 666 6 	6
 6 	6 6 6 6r     F 
A24HHrz>EEbj+,,rz,--BJ 4T	?    "2$2$2$"2$
3S$Y 2$ 2$ 2$ 2$ 2$ 2$r   