
    i-              
           S r SSKrSSKrSSKJr  SSKJr  SSKJr  S rS r	S r
S	 rS
 rSS jrSSSSSSSSSS.	rSSS.r " S S\5      rg)zZ
This code is refer from: https://github.com/weizwx/html2docx/blob/master/htmldocx/h2d.py
    N)Document)BeautifulSoup)
HTMLParserc                 H    / SQnU R                  SR                  U5      SS9$ )Nz
table > trztable > thead > trztable > tbody > trztable > tfoot > trz, F	recursive)selectjoin)
table_souptable_row_selectorss     l/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddleocr/ppstructure/recovery/table_process.pyget_table_rowsr      s,     TYY':;uMM    c                 8    U (       a  U R                  SS/SS9$ / $ )NthtdFr   )find_all)rows    r   get_table_columnsr   $   s     :=3<<t<6E2Er   c                     [        U 5      nU(       a  [        US   5      O/ nSnU H-  nUR                  R                  SS5      nU[	        U5      -  nM/     X4$ )Nr   colspan   )r   r   attrsgetint)r   rowscols	col_countcolr   s         r   get_table_dimensionsr!   )   s\    *%D *.T!W%2DI))--	1-S\!	  ?r   c                 v    SR                  U R                   Vs/ s H  n[        U5      PM     sn5      $ s  snf )N )r   contentsstr)soupis     r   get_cell_htmlr(   9   s-     88T]]3]SV]3443s   6c                 v    U R                   nUR                  5       R                  U5        S =Ul        Ul         g N)_element	getparentremove_p)	paragraphps     r   delete_paragraphr1   @   s/    AKKMAD1:r   c                     U(       a  [         R                  " SSU 5      n U(       a  [         R                  " SSU 5      n [         R                  " SSU 5      n [         R                  " SSU 5      $ )a  Remove white space from a string.
Args:
    string(str): The string to remove white space from.
    leading(bool, optional): Remove leading new lines when True.
    trailing(bool, optional): Remove trailing new lines when False.
Returns:
    str: The input string with new line characters removed and white space squashed.
Examples:
    Single or multiple new line characters are replaced with space.
        >>> remove_whitespace("abc\ndef")
        'abc def'
        >>> remove_whitespace("abc\n\n\ndef")
        'abc def'
    New line characters surrounded by white space are replaced with a single space.
        >>> remove_whitespace("abc \n \n \n def")
        'abc def'
        >>> remove_whitespace("abc  \n  \n  \n  def")
        'abc def'
    Leading and trailing new lines are replaced with a single space.
        >>> remove_whitespace("\nabc")
        ' abc'
        >>> remove_whitespace("  \n  abc")
        ' abc'
        >>> remove_whitespace("abc\n")
        'abc '
        >>> remove_whitespace("abc  \n  ")
        'abc '
    Use ``leading=True`` to remove leading new line characters, including any surrounding
    white space:
        >>> remove_whitespace("\nabc", leading=True)
        'abc'
        >>> remove_whitespace("  \n  abc", leading=True)
        'abc'
    Use ``trailing=True`` to remove trailing new line characters, including any surrounding
    white space:
        >>> remove_whitespace("abc  \n  ", trailing=True)
        'abc'
z
^\s*\n+\s* z
\s*\n+\s*$z\s*\n\s*r#   z\s+)resub)stringleadingtrailings      r   remove_whitespacer9   G   sX    P r62 r62 VVKf-F66&#v&&r   bolditalic	underlinestrikesuperscript	subscript)	bstrongemr'   ussupr5   r   Courier)codeprec                   f   ^  \ rS rSrU 4S jrSS jrS rS rS rS r	S r
SS	 jrS
 rS rSrU =r$ )
HtmlToDocx   c                 h   > [         TU ]  5         SSSSS.U l        / SQU l        S U l        S U l        g )NT)fix-htmlimagestablesstylesr   )super__init__optionsr   table_styleparagraph_style)self	__class__s    r   rR   HtmlToDocx.__init__   s?    	
$
   #r   c                 2   / / S.U l         U(       a  Xl        O[        5       U l        U R                  S   U l        U R                  U l        SU l        U R                  S   U l        U R                  S   U l        S U l	        SU l
        S U l        SU l        g )N)spanlistrM   TrN   rP   Fr   )tagsdocr   rS   bsdocumentinclude_tablesinclude_imagesinclude_stylesr/   skipskip_taginstances_to_skip)rV   r_   s     r   set_initial_attrsHtmlToDocx.set_initial_attrs   s    
	 HzDH,,z*""ll84"ll84	!"r   c                 H    UR                   U l         UR                  U l        g)z1Copy settings from another instance of HtmlToDocxN)rT   rU   )rV   others     r   copy_settings_fromHtmlToDocx.copy_settings_from   s     ,,$44r   c                     / nSnU H<  nU(       a  US-  nM  UR                  U5        [        UR                  S5      5      nM>     U$ )z
Returns array containing only the highest level tables
Operates on the assumption that bs4 returns child elements immediately after
the parent element in `find_all`. If this changes in the future, this method will need to be updated
:return:
r   r   table)appendlenr   )rV   tables_soup
new_tablesnestrm   s        r   ignore_nested_tablesHtmlToDocx.ignore_nested_tables   sP     
 E	e$u~~g./D ! r   c                     [        U S5      (       d  SU l        g U R                  U R                  R	                  S5      5      U l        SU l        g )Nr&   Frm   r   )hasattrr`   rs   r&   r   rO   table_no)rV   s    r   
get_tablesHtmlToDocx.get_tables   sB    tV$$"'D//		0B0B70KLr   c                     U R                   (       a1  [        (       a&  [        US5      U l        [        U R                  5      nU R                  (       a  U R                  5         U R                  U5        g )Nhtml.parser)r^   r   r&   r%   r`   rx   feed)rV   htmls     r   run_processHtmlToDocx.run_process   sF    77}}%dM:DItyy>DOO		$r   c                    [        U[        R                  R                  5      (       d&  [	        S[        R                  R                  -  5      eUR
                  S   nUR                  S:X  a  [        U5        U R                  U5        U R                  U5        U R                  R
                  (       d  U R                  R                  S5        g g )Nz Second argument needs to be a %sr   r3   )
isinstancedocxrm   _Cell
ValueError
paragraphstextr1   rf   r~   r]   add_paragraph)rV   r}   cellunwanted_paragraphs       r   add_html_to_cellHtmlToDocx.add_html_to_cell   s    $

 0 011?$**BRBRRSS!__Q/""b(/0t$ xx""HH""2& #r   c                      U(       a  XR                   l        g U R                  (       a  U R                  U R                   l        g g ! [         a  n[	        SU R                   S35      UeS nAff = f)NzUnable to apply style .)r/   stylerU   KeyErrorr   )rV   r   es      r   apply_paragraph_style HtmlToDocx.apply_paragraph_style   sf    	V',$%%'+';';$ & 	V5d6J6J5K1MNTUU	Vs   A ,A 
A1A,,A1c                    [        US5      n[        U5      u  pEUR                  [        U5      U5      nUR                  S   Ul        [        UR                  5      n[        UR                  5      nSn	[        U5       GHP  u  p[        U5      nSnU GH1  n[        UR                  R                  SS5      5      n[        UR                  R                  SS5      5      n[        U5      nUR                  S:X  a  SU-  nX:  d  X:  a  Mz  UR                  X5      nUR                   S	:w  a(  US-  nUR                  X5      nUR                   S	:w  a  M(  UR                  U	U-   S-
  X-   S-
  5      nUU:w  a  UR#                  U5        [%        5       nUR'                  U 5        UR)                  U=(       d    S
U5        X-  nGM4     U	S-  n	GMS     g)z
To handle nested tables, we will parse tables manually as follows:
Get table soup
Create docx table
Iterate over soup and fill docx table with new instances of this parser
Tell HTMLParser to ignore any tags until the corresponding closing table tag
r{   z
Table Gridr   r   r   rowspanr   z	<b>%s</b>r3   r#   N)r   r!   	add_tablero   rP   r   r   columns	enumerater   r   r   r   r(   namer   r   mergerJ   rj   r   )rV   r}   r]   r   r   cols_lenrm   num_rowsnum_colscell_rowindexr   r   cell_colr    r   r   	cell_html	docx_cellcell_to_mergechild_parsers                        r   handle_tableHtmlToDocx.handle_table   s    #47
-j9c$i2jj.uzz?u}}%#D/JE$S)DHciimmIq9:ciimmIq9:)#.	88t# +i 7I'8+?!JJx:	nn*MH %

8 >I  nn* !&

w&*H,>,B! -OOM2)|//5--i.>3	J#7 8 MH? *r   c                    U R                   (       a  g SU R                  ;  a  [        USS5      nU R                  (       d/  U R                  R                  5       U l        U R                  5         U R                  R                  S5      nU(       a  U R                  US   U5        g U R                  R                  U5      U l
        U R                  S   nU H0  nSU;   d  M  U R                  US   5      nU R                  U5        M2     U R                   Hf  nU[        ;   a*  [        U   n[        U R                  R                  US5        U[         ;   d  MC  [         U   nXR                  R                  l        Mh     g )NrH   TahrefrZ   r   )rc   r\   r9   r/   r]   r   r   r   handle_linkadd_runrunparse_dict_stringadd_styles_to_runfont_stylessetattrfont
font_namesr   )	rV   datalinkspansrZ   r   tag
font_style	font_names	            r   handle_dataHtmlToDocx.handle_data!  s(   99 		!$T46D~~!XX335DN&&(
 yy}}S!T&\40 ~~--d3DHIIf%Ed? 224=AE**51  yy+%!,S!1JDHHMM:t<*$ *3I)2HHMM& !r   )r^   r]   r_   ra   rb   r`   re   rS   r/   rU   r   rc   rd   r&   rw   r   rT   rO   r\   r*   )__name__
__module____qualname____firstlineno__rR   rf   rj   rs   rx   r~   r   r   r   r   __static_attributes____classcell__)rW   s   @r   rJ   rJ      s<    $"#&5
"'V0d$3 $3r   rJ   )FF)__doc__r4   r   r   bs4r   html.parserr   r   r   r!   r(   r1   r9   r   r   rJ    r   r   <module>r      s}    
    "NF
 52'l 

			

 
w3 w3r   