
    TAiO1                    &   % S r SSKJr  SSKrSSKrSSKJr  SSKJr  SSK	J
r
  SSKJr  SSKJrJrJrJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJrJr  \R:                  " \5      rSSSSS.r S\!S'    " S S\RD                  5      r#Sr$ " S S\5      r%S%S jr&S&S jr'S'S jr(S(S jr)S)S jr*        S*S jr+S r,          S+S jr-S,S jr.S-S  jr/S.S! jr0                    S/S" jr1S0S# jr2                    S1S$ jr3g)2z"Interface to Tesseract executable.    )annotationsN)suppress)pi)fspath)Path)PIPESTDOUTCalledProcessErrorTimeoutExpired)Version)MissingDependencyErrorSubprocessOutputErrorTesseractConfigError)OrientationConfidence)get_versionrun      )autootsuzadaptive-otsusauvolazdict[str, int]TESSERACT_THRESHOLDING_METHODSc                      \ rS rSrSrS rSrg)TesseractLoggerAdapter%   z7Prepend [tesseract] to messages emitted from tesseract.c                .    U R                   US'   SU 3U4$ )Nextraz[tesseract] r   )selfmsgkwargss      T/var/www/html/land-ocr/venv/lib/python3.13/site-packages/ocrmypdf/_exec/tesseract.pyprocessTesseractLoggerAdapter.process(   s!    **wcU#V++     N)__name__
__module____qualname____firstlineno____doc__r#   __static_attributes__r&   r%   r"   r   r   %   s
    A,r%   r   a  
    v?
    (?:
        (?:(?P<epoch>[0-9]+)!)?                           # epoch
        (?P<release>[0-9]+(?:\.[0-9]+)*)                  # release segment
        (?P<pre>                                          # pre-release
            [-_\.]?
            (?P<pre_l>(a|b|c|rc|alpha|beta|pre|preview))
            [-_\.]?
            (?P<pre_n>[0-9]+)?
        )?
        (?P<post>                                         # post release
            (?:-(?P<post_n1>[0-9]+))
            |
            (?:
                [-_\.]?
                (?P<post_l>post|rev|r)
                [-_\.]?
                (?P<post_n2>[0-9]+)?
            )
        )?
        (?P<dev>                                          # dev release
            [-_\.]?
            (?P<dev_l>dev)
            [-_\.]?
            (?P<dev_n>[0-9]+)?
        )?
        (?P<date>
            [-_\.]
            (?:20[0-9][0-9] [0-1][0-9] [0-3][0-9])       # yyyy mm dd
        )?
        (?P<gitcount>
            [-_\.]?
            [0-9]+
        )?
        (?P<gitcommit>
            [-_\.]?
            g[0-9a-f]{2,10}
        )?
    )
    (?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
c                  x    \ rS rSrSr\R                  " S\-   S-   \R                  \R                  -  5      r
Srg)TesseractVersionY   zLModify standard packaging.Version regex to support Tesseract idiosyncrasies.z^\s*z\s*$r&   N)r'   r(   r)   r*   r+   recompileTESSERACT_VERSION_PATTERNVERBOSE
IGNORECASE_regexr,   r&   r%   r"   r.   r.   Y   s0    VZZ++g5rzzBMM7QFr%   r.   c                 (    [        [        SSS95      $ )N	tesseractztesseract\s(.+))regex)r.   r   r&   r%   r"   versionr9   a   s    K;MNOOr%   c                 .    [        5       [        S5      :  $ )z6Does Tesseract have -c thresholding method capability?z5.0)r9   r   r&   r%   r"   has_thresholdingr;   e   s    9&&r%   c            	        S n SS/n [        US[        [        SSS9nUR                  nUR                  5        H*  nUR                  S5      (       d  M  [        U " U5      5      e   UR                  5       tpgU Vs1 s H  oR                  5       iM     sn$ ! [         a!  n[        U " UR                  5      5      UeS nAff = fs  snf )Nc                    SnX-  nU$ )NzSTesseract failed to report available languages.
Output from Tesseract:
-----------
r&   )outputr    s     r"   
lang_error!get_languages.<locals>.lang_errork   s     	
 	
r%   r7   z--list-langsT)textstdoutstderrlogs_errors_to_stdoutcheckError)
r   r   r	   rB   r
   r   r>   
splitlines
startswithstrip)	r?   	args_tessprocr>   eline_headerrestlangs	            r"   get_languagesrQ   j   s     n-IB"&
  !!#??7##(F);<< $ &&(NG%)*TTJJLT**  B$Z%9:AB +s   "B >C
C$C  Cc                    S/nU (       a"  UR                  SSR                  U 5      /5        Ub  UR                  S[        U5      /5        U$ )Nr7   z-l+z--oem)extendjoinstr)langsengine_modeargss      r"   tess_base_argsrZ      sF    =DT388E?+,Wc+./0Kr%   c                0   ^  U 4S jn[        U" 5       5      $ )Nc               3  
  >#    TR                  5       R                  5        H\  n U R                  5       n U R                  SSS9n[	        U5      S:X  d  M4  US   R                  5       US   R                  5       4v   M^     g 7f)N:r   )maxsplitr   r   )decoderG   rI   splitlen)rM   partsbinary_outputs     r"   gen$_parse_tesseract_output.<locals>.gen   sk     !((*557D::<DJJsQJ/E5zQAhnn&a(888	 8s   AB-B)dict)rc   rd   s   ` r"   _parse_tesseract_outputrg      s    9 ;r%   c           	     ,   [        S/U5      SS[        U 5      S/-   n [        U[        [        USS9n[        UR                  5      n[        UR                  SS5      5      n[        U[!        UR                  SS5      5      S	9nU$ ! [
         a    [        SSS	9s $ [         ak  n[        UR                  5        [        UR                  5        S
UR                  ;   d  SUR                  ;   a  [        SS5      s S nA$ [        5       UeS nAff = f)Nosd--psm0rB   TrB   rC   timeoutrE   r           )angle
confidences&   Too few characters. Skipping this page   Image too largezOrientation in degreeszOrientation confidence)rZ   r   r   r   r	   r   r   r
   tesseract_log_outputrB   rC   r>   r   rg   intgetfloat)	
input_filerX   rm   args_tesseractprL   ri   ro   orient_confs	            r"   get_orientationrz      s    $UG[9z	= N-tFGSWX "!((
+C0!45E'cgg.F&J KK #  >$1== -QXX&QXX&5A!QXX-(A..#%1,-s*   B	 	D	D(AD=DDDc                    SU R                   ;   a  gU R                  S:H  =(       a1    U R                   S:H  =(       d    U R                   R                  S5      $ )N   Empty page!!Tr   r%   s2   Error in boxClipToRectangle: box outside rectangle)r>   
returncoderH   )excs    r"   _is_empty_page_errorr      sL    #**$>>Q 

c 	X::  !VW	r%   c                   [        X5      SS[        U 5      S/-   n [        U[        [        USS9n[        UR                  5      n[        UR                  SS	5      5      nS
[        -  U-  n	[         R#                  SU	S 35        U	$ ! [
         a     g[         aO  n[        UR                  5        [        UR                  5        [        U5      (       a   SnAg[        5       UeSnAff = f)z+Gets angle to deskew this page, in degrees.rj   2rB   Trl   rn   NzDeskew angler      zDeskew angle: z.3f)rZ   r   r   r   r	   r   r
   rr   rB   rC   r   r   rg   ru   rt   r   logdebug)
rv   	languagesrX   rm   rw   rx   rL   parseddeskew_radiansdeskew_degreess
             r"   
get_deskewr      s     $I;z	? N
-tFGSWX %QXX.F6::na89N2X.NII~c234   -QXX&QXX&""#%1,-s#   B	 	
C-	C-:C(C((C-c                j   [        [        [        [        S5      (       a  [        R                  OS S9nU (       d  g  U R	                  5       nUR                  5       nU GH  nUR                  S5      (       a  M  UR                  S5      (       a  M4  SU;   a  UR                  S5        MM  UR                  S	5      (       a  UR                  S
5        Mv  SU;   a  M~  SU;   a  M  SUR                  5       ;   a>  UR                  UR                  5       5        UR                  S5      S   n[        U5      eSUR                  5       ;   d  SUR                  5       ;   a"  UR                  UR                  5       5        GM"  SUR                  5       ;   a"  UR                  UR                  5       5        GMX  SUR                  5       ;   a"  UR                  UR                  5       5        GM  UR                  UR                  5       5        GM     g ! [
         a    U R	                  SS5      n GNf = f)Nr   r   utf-8ignorezTesseract Open SourcezWarning in pixReadMem
diacriticsz&lots of diacritics - possibly poor OCRzOSD: Weak marginzunsure about page orientationzError in pixScanForForegroundzError in boxClipToRectanglezparameter not found: zfound: r   error	exceptionwarningread_params_file)r   r   hasattrr   r_   UnicodeDecodeErrorrG   rH   r   lowerr   rI   r`   r   info)streamtlogrA   linesrM   problems         r"   rr   rr      s   !"300ciidD
 0}} OOE??233__455T!LLAB__/00LL89,4*d2$

4JJtzz|$jj+A.G&w//

$tzz|(CJJtzz|$$**,&LL&4::</JJtzz|$IIdjjl#1 	  0}}Wh/0s   H H21H2c                <    U S:X  a  g [         R                  S5        g )Nr   z+[tesseract] took too long to OCR - skipping)r   r   )rm   s    r"   page_timedoutr     s    !|KK=>r%   c                D    U R                  SSS9  UR                  SSS9  g)zPProduce an empty .hocr file.

Ensures page is the same size as the input image.
 r   encoding[skipped page]N)
write_text)output_hocroutput_textimages      r"   _generate_null_hocrr     s+    
 20+g>r%   c                   UR                  S5      n[        X45      nUb  UR                  S[        U5      /5        US:w  a%  [	        5       (       a  UR                  SSU 3/5        U	(       a  UR                  SU	/5        U
(       a  UR                  SU
/5        UR                  [        U 5      [        U5      S	S
/5        UR                  U5         [        U[        [        USS9nUR                  n[        U5        [        [        5         UR                  S5      R                  U5        SSS5        g! , (       d  f       g= f! [         a    [        U5        [!        XU 5         g["         aV  n[        UR$                  5        SUR$                  ;   d  SUR$                  ;   a  [!        XU 5         SnAg['        5       UeSnAff = f)z5Generate a hOCR file, which must be converted to PDF.r   Nrj   r   -cthresholding_method=--user-words--user-patternshocrtxtTrl   .txtrq   r|   )with_suffixrZ   rT   rV   r;   r   r   r   r	   rB   rr   r   FileNotFoundErrorreplacer   r   r   r
   r>   r   )rv   r   r   r   rX   
tessconfigrm   pagesegmodethresholding
user_wordsuser_patternsprefixrw   rx   rB   rL   s                   r"   generate_hocrr     s    $$R(F#I;NwK(89:q-//t';L>%JKL~z:;0-@A 6*-vf~vuMN*%<tFGSWX 	V$ '(v&..{; )((#  B 	gKjA -QXX&)_-H*E#%1,-s1   !E
 !D99
E
!G-	G6AG<GGc                F    UR                  SSS9  U R                  S5        g )Nr   r   r   r%   )r   write_bytes)
output_pdfr   s     r"   use_skip_pager   Z  s&    +g> 3r%   c                   [        X45      nUb  UR                  S[        U5      /5        UR                  SS/5        US:w  a%  [        5       (       a  UR                  SSU 3/5        U	(       a  UR                  SU	/5        U
(       a  UR                  SU
/5        UR                  [        UR                  5      -  nUR                  [        U 5      [        U5      S	S
/5        UR                  U5         [        U[        [        USS9nUR                  n[        [        5         UR                  S5      R                  U5        SSS5        [!        U5        g! , (       d  f       N= f! ["         a    [%        U5        ['        X5         g[(         aU  n[!        UR*                  5        SUR*                  ;   d  SUR*                  ;   a  ['        X5         SnAg[-        5       UeSnAff = f)zGenerate a PDF using Tesseract's internal PDF generator.

We specifically a text-only PDF which is more suitable for combining with
the input page.
Nrj   r   ztextonly_pdf=1r   r   r   r   pdfr   Trl   r   rq   r|   )rZ   rT   rV   r;   parentr   stemr   r   r   r	   rB   r   r   r   r   rr   r   r   r   r
   r>   r   )rv   r   r   r   rX   r   rm   r   r   r   r   rw   r   rx   rB   rL   s                   r"   generate_pdfr   a  s   & $I;NwK(89:4!123q-//t';L>%JKL~z:;0-@Ajoo!66F
 6*-vf~ueLM*%%tFGSWX'(v&..{; ) 	V$ )( /gj. -QXX&)_-H*2#%1,-s=   80E. (!E	E. 
E+'E. . G.	G.A G)G))G.)returnr   )r   bool)r   zset[str])rW   	list[str]rX   
int | Noner   r   )rc   bytesr   zdict[str, str])rv   r   rX   r   rm   ru   r   r   )
rv   r   r   r   rX   r   rm   ru   r   ru   )r   r   r   None)rm   ru   r   r   )r   r   r   r   r   r   r   r   )rv   r   r   r   r   r   r   r   rX   rs   r   r   rm   ru   r   rs   r   rs   r   r   )r   r   r   r   r   r   )rv   r   r   r   r   r   r   r   rX   rs   r   r   rm   ru   r   rs   r   rs   r   r   )4r+   
__future__r   loggingr0   
contextlibr   mathr   osr   pathlibr   
subprocessr   r	   r
   r   packaging.versionr   ocrmypdf.exceptionsr   r   r   ocrmypdf.pluginspecr   ocrmypdf.subprocessr   r   	getLoggerr'   r   r   __annotations__LoggerAdapterr   r2   r.   r9   r;   rQ   rZ   rg   rz   r   r   rr   r   r   r   r   r   r&   r%   r"   <module>r      s   ) "  	     G G % 
 6 0! 	2  ,W22 ,) Xw P'
+>#-8=@	!*9CNS
<&$R??8<8< 8< 	8<
 8< 8< 8< 8< 8< 8< 
8<v 9%9% 9% 	9%
 9% 9% 9% 9% 9% 9% 
9%r%   