
    TAiN                    X   S r SSKJr  SSKrSSKrSSKrSSKrSSKJr  SSK	J
r
  SSKJrJr  SSKJr  SSKJr  SS	KJrJrJr  SS
KJrJrJrJrJrJrJrJrJrJ r   SSK!J"r#  SSK!J$r$  \RJ                  " \&5      r'Sr(\RR                  r)\ " S S5      5       r* " S S\+5      r, " S S5      r-g)zhOCR transform implementation.    )annotationsN)	dataclass)pairwise)atanpi)Path)ElementTree)MatrixName	Rectangle)
BLACKBLUECYAN	DARKGREENGREENMAGENTAREDCanvasTextTextDirection)EncodableFont)GlyphlessFontg      R@c                  n    \ rS rSr% SrSrS\S'   SrS\S'   SrS\S'   Sr	S\S'   Sr
S\S	'   SrS\S
'   Srg)DebugRenderOptions-   z'A class for managing rendering options.Fboolrender_paragraph_bboxrender_baselinerender_trianglerender_line_bboxrender_word_bboxrender_space_bbox N)__name__
__module____qualname____firstlineno____doc__r   __annotations__r   r   r    r!   r"   __static_attributes__r#       X/var/www/html/land-ocr/venv/lib/python3.13/site-packages/ocrmypdf/hocrtransform/_hocr.pyr   r   -   sE    1"'4'!OT!!OT!"d""d"#t#r+   r   c                      \ rS rSrSrSrg)HocrTransformError9   z$Error while applying hOCR transform.r#   N)r$   r%   r&   r'   r(   r*   r#   r+   r,   r.   r.   9   s    .r+   r.   c                  |   \ rS rSrSr\R                  " S\R                  5      r\R                  " S\R                  5      r	\R                  " S\R                  5      r
S\" S5      \" 5       SS	.           S"S
 jjrS#S jr\S$S j5       r\S%S j5       r\S&S j5       rS'S(S jjr\S)S j5       rSSS.       S*S jjrS rS r\S 5       r            S+S jr                S,S jr\4S-S jjr\4S.S jjr\S4   S/S jjr \!S4   S/S jjr"\#S4   S/S jjr$\%S4   S.S  jjr&S!r'g)0HocrTransform=   zA class for converting documents from the hOCR format.

For details of the hOCR format, see:
http://kba.github.io/hocr-spec/1.2/.
z
        bbox \s+
        (\d+) \s+   # left: uint
        (\d+) \s+   # top: uint
        (\d+) \s+   # right: uint
        (\d+)       # bottom: uint
        z|
        baseline \s+
        ([\-\+]?\d*\.?\d*) \s+  # +/- decimal float
        ([\-\+]?\d+)            # +/- int
        zO
        textangle \s+
        ([\-\+]?\d*\.?\d*)  # +/- decimal float
        Fz/f-0-0N)debugfontnamefontdebug_render_optionsc          	     L   U(       a.  [         R                  S[        5        [        UUSUSSS9U l        OU=(       d
    [        5       U l        X l        [        R                  " [        R                  " U5      5      U l
        X@l        XPl        [        R                  " SU R                  R                  5       R                   5      nSU l        U(       a  UR%                  S5      U l        U R                  R'                  U R)                  SS5      5       H  nU R+                  U5      n	U	(       d  [-        S	5      eU	R.                  U	R0                  -
  U R
                  [2        -  -  U l        U	R6                  U	R8                  -
  U R
                  [2        -  -  U l          g
   g
)z$Initialize the HocrTransform object.z Use debug_render_options insteadF)r   r   r    r!   r   r"   z
({.*})html    divocr_pagez$hocr file is missing page dimensionsN)logwarningDeprecationWarningr   render_optionsdpir	   parseosfspathhocr	_fontname_fontrematchgetroottagxmlnsgroupfindall_child_xpathelement_coordinatesr.   urxllxINCHwidthuryllyheight)
selfhocr_filenamer@   r3   r4   r5   r6   matchesr:   coordss
             r,   __init__HocrTransform.__init__^   s=    KK:<NO"4 % %!&!&&+"'#D #7"N:L:ND%%bii&>?	!
 ((=$))*;*;*=*A*AB
 q)DJ99$$T%6%6uj%IJC--c2F()OPP **vzz1dhhoFDJ!::

2txx$GDK Kr+   c                    UR                   b  UR                   OSnU H  nX R                  U5      -  nM     X!R                  b  UR                  -  nU$ S-  nU$ )z;Return the textual content of the element and its children.r8   )text_get_element_texttail)rW   elementr^   childs       r,   r_   HocrTransform._get_element_text   s]    &||7w||RE**511D  8@ ?A@r+   c           
     X   U R                   R                  UR                  R                  SS5      5      nU(       d  g[	        [        UR                  S5      5      [        UR                  S5      5      [        UR                  S5      5      [        UR                  S5      5      5      $ )z6Get coordinates of the bounding box around an element.titler8   Nr9            )box_patternsearchattribgetr   floatrL   clsra   rY   s      r,   rO   !HocrTransform.element_coordinates   s     //((););GR)HI'--"#'--"#'--"#'--"#	
 	
r+   c                    U R                   R                  UR                  R                  SS5      5      nU(       d  g[	        UR                  S5      5      [        UR                  S5      5      4$ )z#Get baseline's slope and intercept.re   r8   )        rr   r9   rf   )baseline_patternrj   rk   rl   rm   rL   intrn   s      r,   baselineHocrTransform.baseline   sX     &&--gnn.@.@".MNW]]1%&GMM!,<(===r+   c                    U R                   R                  UR                  R                  SS5      5      nU(       d  g[	        UR                  S5      5      $ )zGet text angle of an element.re   r8   rr   r9   )textangle_patternrj   rk   rl   rm   rL   rn   s      r,   	textangleHocrTransform.textangle   sE     ''..w~~/A/A'2/NOW]]1%&&r+   c                H    SU R                    U 3nU(       a	  USU S3-  nU$ )Nz.//z	[@class='z'])rK   )rW   html_tag
html_classxpaths       r,   rN   HocrTransform._child_xpath   s1    djj\(,yB//Er+   c                0    [         R                  " SU5      $ )z;Normalize the given text using the NFKC normalization form.NFKC)unicodedata	normalize)ro   ss     r,   normalize_textHocrTransform.normalize_text   s     $$VQ//r+   T)image_filenameinvisible_textc                  [        U R                  U R                  4S9nUR                  U R                  U R
                  5        [        5       R                  SU R                  5      R                  SS5      R                  [        U R                  -  [        U R                  -  5      n[        R                  U5        UR                  R                  US9   U R                  U5        SnU R                   R#                  U R%                  SS5      5       Hk  nS	 UR#                  U R%                  S
5      5       5        H=  nSnU R'                  U5      n	U R)                  U5      n
U R+                  UUSUU	U
5        M?     Mm     U(       dR  U R                   R-                  U R%                  SS5      5      nU R'                  U5      n	U R+                  UUSUU	S5        SSS5        Ub1  UR                  R/                  USSU R                  U R                  S9  UR1                  5       R3                  U5        g! , (       d  f       Nb= f)ap  Creates a PDF file with an image superimposed on top of the text.

Text is positioned according to the bounding box of the lines in
the hOCR file.
The image need not be identical to the image used to create the hOCR
file.
It can have a lower resolution, different color mode, etc.

Arguments:
    out_filename: Path of PDF to write.
    image_filename: Image to use for this file. If omitted, the OCR text
        is shown.
    invisible_text: If True, text is rendered invisible so that is
        selectable but never drawn. If False, text is visible and may
        be seen if the image is skipped or deleted in Acrobat.
)	page_sizer   r9   cmFpocr_parc              3  t   #    U  H.  nS UR                   ;   d  M  UR                   S    S;   d  M*  Uv   M0     g7f)class>   ocr_line
ocr_headerocr_captionocr_textfloatN)rk   ).0ra   s     r,   	<genexpr>'HocrTransform.to_pdf.<locals>.<genexpr>   sA      #J'..0   w/QR G#Js   88	8spanT	ocrx_wordr:   r;   N)rS   rV   )r   rS   rV   add_fontrE   rF   r
   
translatedscaledrR   r@   r<   r3   do
save_state_debug_draw_paragraph_boxesrD   iterfindrN   _get_text_direction_get_inject_word_breaks_do_linefind
draw_imageto_pdfsave)rW   out_filenamer   r   canvaspage_matrixfound_linesparline	directioninject_word_breaksroots               r,   r   HocrTransform.to_pdf   s   2 4::t{{";<

3HZ4;;'VAr]VD488OTDHH_5	 	 			+YY!![!1,,V4Kyy))$*;*;C*KL#&<<0A0A&0I#JD #'K $ 8 8 =I)-)E)Ec)J&MM#&!* M( yy~~d&7&7z&JK 44T:	"7 2H %II  1DJJt{{ ! 
 	\*U 21s   DH==
Ic                    Uc  [         R                  $ UR                  R                  SS5      S:X  a  [         R                  $ [         R                  $ )zGet the text direction of the paragraph.

Arabic, Hebrew, Persian, are right-to-left languages.
When the paragraph element is None, defaults to left-to-right.
dirltrrtl)r   LTRrk   rl   RTL)rW   r   s     r,   r   !HocrTransform._get_text_direction  sN     ; $$$ zz~~eU+u4 	
 ""	
r+   c                t    UR                   R                  SS5      n[        R                  U5        US;   a  gg)a  Determine whether word breaks should be injected.

In Chinese, Japanese, and Korean, word breaks are not injected, because
words are usually one or two characters and separators are usually explicit.
In all other languages, we inject word breaks to help word segmentation.
langr8   >   jpnkorchi_simchi_traFT)rk   rl   r<   r3   )rW   r   r   s      r,   r   %HocrTransform._get_inject_word_breaks  s2     zz~~fb)		$77r+   c                    X!S   -  US   -   $ )z/Calculate the value of a polynomial at a point.r   r9   r#   )ro   polyxs      r,   polyvalHocrTransform.polyval$  s     7{T!W$$r+   c                   Uc  gU R                  U5      nU(       d  gUR                  UR                  ::  a'  [        R	                  SUU R                  U5      5        gU R                  X5        UR                  UR                  4n[        5       R                  " U6 R                  U R                  U5      * 5      n	U	R                  5       R                  U5      n
U R                  U5      u  p[        U5      S:  a  Sn[!        U5      nU	R                  SU
R"                  5      R                  SU5      R                  U[$        -  S-  5      nUR&                  R)                  US9   [+        US9nU
R"                  U-   nUR-                  U R.                  U5        UR1                  U(       a  S	OS5        U R3                  XR                  5       R                  U5      S5        UR&                  R5                  [6        5        UR9                  U R;                  S
U5      5      n[=        US/-   5       H  u  nnU R?                  UUUUUUUU5        M      UR&                  RA                  U5        SSS5        g! , (       d  f       g= f)zRender the text for a given line.

The canvas's coordinate system must be configured so that hOCR pixel
coordinates are mapped to PDF coordinates.
Nz:line box is invalid so we cannot render it: box=%s text=%sg{Gzt?rr   r      r   )r   rg   r   )!rO   rT   rU   r<   errorr_   _debug_draw_line_bboxrQ   r
   r   rotatedry   inverse	transformru   absr   rV   r   r   r   r   r5   rE   render_mode_debug_draw_baseline
fill_colorr   rM   rN   r   _do_line_word	draw_text)rW   r   r   	elemclassr   text_directionr   line_min_aabbtop_left_cornerline_size_aabb_matrixline_size_aabbslope	interceptslope_anglebaseline_matrixr^   fontsizeelementselem	next_elems                       r,   r   HocrTransform._do_line)  sF    < 006 1 11IIL&&t,
 ""69 ),,m.?.?@HZ(* WdnnT**+	 	 /668BB=Q==.u:E5k "
 Z>001Z9%W[2%+, 	 YY!!_!5.1D%,,y8HIIdnnh/.Qa8%%//1;;MJA II  '||D$5$5fi$HIH#+Hv,=#>i""#"&	 $? II%/ 655s   *D
I==
Jc	           
     <   Uc  gU R                  U R                  U5      R                  5       5      n	U	S:X  a  gU R                  U5      n
U
c  gUR	                  5       R                  U
5      nU R                  R                  X5      nU R                  X5        U R                  X5        U[        R                  :X  a  [        R                  SU	5        US:  a  U[        R                  :X  a*  UR                  [!        SSSSUR"                  S5      5        OJU[        R                  :X  a6  UR                  [!        SSSSUR"                  UR$                  -   S5      5        UR'                  SUR$                  -  U-  5        UR)                  U R                  R+                  U	5      5        Ub  U R                  U5      OSnUc  gU(       d  gUR	                  5       R                  U5      nU[        R                  :X  a7  [-        UR.                  UR0                  UR"                  UR2                  5      nOJU[        R                  :X  a6  [-        UR.                  UR0                  UR"                  UR2                  5      nU R5                  UW5        U R                  R                  SU5      nUS:  a  UR$                  S:  a  U[        R                  :X  a*  UR                  [!        SSSSUR"                  S5      5        OJU[        R                  :X  a6  UR                  [!        SSSSUR"                  UR$                  -   S5      5        UR'                  SUR$                  -  U-  5        UR)                  U R                  R+                  S5      5        ggg)	z"Render the text for a single word.Nr8   zRTL: %sr   r9   r   d    )r   r_   striprO   r   r   rF   
text_width_debug_draw_word_triangle_debug_draw_word_bboxr   r   r<   infor   text_transformr
   rQ   rS   horiz_scaleshowtext_encoder   rP   rU   rT   _debug_draw_space_bbox)rW   r   line_matrixr^   r   r   r   r   r   elemtxthocr_boxbox
font_widthhocr_next_boxnext_box	space_boxspace_widths                    r,   r   HocrTransform._do_line_word  s    <%%d&<&<T&B&H&H&JKb=++D1!!#--h7ZZ**7=
 	&&v3""6/ ]...HHY(>!2!22##F1aB$CD=#4#44##F2q!R3999La$PQS399_z9:IIdjj,,W56 4=3HD$$Y/d 	  
 "&&(22=A]...!#''377HLL(,,OI}000!(,,(,,OI##FI6jj++C:?y2!2!22##F1aB	q$IJ=#4#44##2q!R)H!L S9??2[@AIIdjj,,S12  3?r+   c           
     r   U R                   R                  (       d  gUR                  R                  5          UR                  R	                  U5      R                  S5        U R                  R                  U R                  SS5      5       H  nU R                  U5      R                  5       n[        U5      S:X  a  M3  U R                  U5      nUc  MI  UR                  R                  UR                  UR                  UR                   UR"                  SS9  M     SSS5        g! , (       d  f       g= f)z-Draw boxes around paragraphs in the document.N皙?r   r   r   Ffill)r?   r   r   r   stroke_color
line_widthrD   r   rN   r_   r   lenrO   rectrQ   rU   rS   rV   )rW   r   colorr   r   r   s         r,   r   )HocrTransform._debug_draw_paragraph_boxes  s    ""88YY!!#II""5)44S9		**4+<+<S)+LM006<<>w<1$2248?		KKgmmW^^RW   N $##s   C(D((
D6c           	     j   U R                   R                  (       d  gUR                  R                  5          UR                  R	                  U5      R                  S5      R                  UR                  UR                  UR                  UR                  SS9  SSS5        g! , (       d  f       g= f)z'Render the bounding box of a text line.Ng333333?Fr   )r?   r    r   r   r   r   r   rQ   rU   rS   rV   )rW   r   line_boxr  s       r,   r   #HocrTransform._debug_draw_line_bbox  sz    ""33YY!!#II""5)44T:??hllHNNHOORW @  $##   A$B$$
B2r   c                T   U R                   R                  (       d  gUR                  R                  5          UR                  R	                  U5      R                  U5      R                  UR                  UR                  UR                  UR                  5      R                  UR                  UR                  UR                  UR                  5      R                  UR                  UR                  UR                  UR                  5        SSS5        g! , (       d  f       g= f)zARender a triangle that conveys word height and drawing direction.N)r?   r   r   r   r   r   r   rQ   rU   rP   rT   rW   r   r   r  r   s        r,   r   'HocrTransform._debug_draw_word_triangle  s     ""22YY!!#II""5)44Z@EE#''377d377CGGSWWcgg6tt#''3778 $##s   CD
D'c           	     j   U R                   R                  (       d  gUR                  R                  5          UR                  R	                  U5      R                  U5      R                  UR                  UR                  UR                  UR                  SS9  SSS5        g! , (       d  f       g= f)z Render a box depicting the word.NFr   )r?   r!   r   r   r   r   r   rQ   rU   rS   rV   r  s        r,   r   #HocrTransform._debug_draw_word_bbox  s{     ""33YY!!#II""5)44Z@EE#))SZZe F  $##r  c           	     j   U R                   R                  (       d  gUR                  R                  5          UR                  R	                  U5      R                  U5      R                  UR                  UR                  UR                  UR                  SS9  SSS5        g! , (       d  f       g= f)z3Render a box depicting the space between two words.NTr   )r?   r"   r   r   r   r   r   rQ   rU   rS   rV   r  s        r,   r   $HocrTransform._debug_draw_space_bbox   s{     ""44YY!!#II  '22:>CC#))SZZd D  $##r  g      ?c                D   U R                   R                  (       d  gUR                  R                  5          UR                  R	                  U5      R                  U5      R                  UR                  UUR                  U5        SSS5        g! , (       d  f       g= f)zRender the text baseline.N)	r?   r   r   r   r   r   r   rQ   rP   )rW   r   r  baseline_llyr  r   s         r,   r   "HocrTransform._debug_draw_baseline  sn     ""22YY!!#II""5)44Z@EE	 $##s   AB
B)rF   rE   r@   rV   rD   r?   rS   rK   )rX   z
str | Pathr@   rm   r3   r   r4   r   r5   Fontr6   zDebugRenderOptions | None)ra   Elementreturnstr)ra   r  r  zRectangle | None)ra   r  r  ztuple[float, float])ra   r  r  rm   )N)r|   r  r}   z
str | Noner  r  )r   r  r  r  )r   r   r   zPath | Noner   r   r  None)r   r   r   Element | Noner   r  r   r   r   r   r   r   )r   r   r   r
   r^   r   r   rm   r   r  r   r  r   r   r   r   )r   r   )r   r   r  r   )r   r   r   r   )(r$   r%   r&   r'   r(   rG   compileVERBOSEri   rs   rx   r   r   r[   r_   classmethodrO   ru   ry   rN   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r*   r#   r+   r,   r1   r1   =   s    **	 	

	K zz	
 	

 

	 	

 h"_:>* "* 	*
 * * * 8*X 

 

 > > ' ' 0 0 '+#L+ L+ $	L+
 L+ 
L+\
 % %^&^& ^& 	^&
 ^& &^& !^&@B3B3 B3 	B3
 B3 B3 "B3 &B3 !B3H AE $ PT  58C#, 5:c		#,	 5>#		#,	    r+   r1   ).r(   
__future__r   loggingrB   rG   r   dataclassesr   	itertoolsr   mathr   r   pathlibr   	xml.etreer	   pikepdfr
   r   r   pikepdf.canvasr   r   r   r   r   r   r   r   r   r   ocrmypdf.hocrtransform._fontr   r  r   	getLoggerr$   r<   rR   r  r   	Exceptionr.   r1   r#   r+   r,   <module>r&     s    % "  	 	  !    ! + +   ? 6!


 $ $ $/ /_ _r+   