
    TAi+                    >   S r SSKJr  SSKrSSKrSSKrSSKJr  SSKJ	r	  SSK
Jr  SSKJr  SSKJrJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  \R:                  " \5      r\	S 5       r \	S 5       r!\	S 5       r"\	SS j5       r# " S S\5      r$\	S 5       r%g)z1Built-in plugin to implement OCR using Tesseract.    )annotationsN)Image)hookimpl)	tesseract)PageContext)numeric
str_to_int)BadArgsErrorMissingDependencyError)clamp)calculate_downsampledownsample_image)	OcrEngine)check_external_programc           
     x   U R                  SS5      nUR                  SSS/ SS9  UR                  SS	[        S
[        SS5      SS9  UR                  SS	[        S[        SS5      SS9  UR                  SS	[	        [
        R                  5      SSSS9  UR                  SS[        [        S5      SSS9  UR                  SS[        [        S5      SSS9  UR                  S[        R                  S S!S"9  UR                  S#S	[        [        S$S%5      S%S&S'9  UR                  S(S)S*S+9  UR                  S,S)S-S+9  g ).N	Tesseractz!Advanced control of Tesseract OCRz--tesseract-configappendCFGz>Additional Tesseract configuration files -- see documentation.)actionmetavardefaulthelpz--tesseract-pagesegmodestorePSMr      z<Set Tesseract page segmentation mode (see tesseract --help).)r   typer   choicesr   z--tesseract-oemMODE   z|Set Tesseract 4+ OCR engine mode: 0 - original Tesseract only; 1 - neural nets LSTM only; 2 - Tesseract + LSTM; 3 - default.z--tesseract-thresholdingautoMETHODa@  Set Tesseract 5.0+ input image thresholding mode. This may improve OCR results on low quality images or those that contain high contrast color. legacy-otsu is the Tesseract default; adaptive-otsu is an improved Otsu algorithm with improved sort for background color changes; sauvola is based on local standard deviation.)r   r   r   r   r   z--tesseract-timeoutg     f@SECONDSa  Give up on OCR after the timeout, but copy the preprocessed page into the final output. This timeout is only used when using Tesseract for OCR. When Tesseract is used for other operations such as deskewing and orientation, the timeout is controlled by --tesseract-non-ocr-timeout.)r   r   r   r   z--tesseract-non-ocr-timeoutzGive up on non-OCR operations such as deskewing and orientation after timeout. This is a separate timeout from --tesseract-timeout because these operations are not as expensive as OCR.z#--tesseract-downsample-large-imagesTa  Downsample large images before OCR. Tesseract has an upper limit on the size images it will support. If this argument is given, OCRmyPDF will downsample large images to fit Tesseract. This may reduce OCR quality, on large images the most desirable text is usually larger. If this parameter is not supplied, Tesseract will error out and produce no OCR on the page in question. This argument should be used with a high value of --tesseract-timeout to ensure Tesseract has enough to time.)r   r   r   z--tesseract-downsample-aboved     ag  Downsample images larger than this size pixel size in either dimension before OCR. --tesseract-downsample-large-images downsamples only when an image exceeds Tesseract's internal limits. This argument causes downsampling to occur when an image exceeds the given size. This may reduce OCR quality, but on large images the most desirable text is usually larger.)r   r   r   r   z--user-wordsFILEa  Specify the location of the Tesseract user words file. This is a list of words Tesseract should consider while performing OCR in addition to its standard language dictionaries. This can improve OCR quality especially for specialized and technical documents.)r   r   z--user-patternsz9Specify the location of the Tesseract user patterns file.)add_argument_groupadd_argumentintranger	   r   TESSERACT_THRESHOLDING_METHODSr   floatargparseBooleanOptionalAction)parsertesss     b/var/www/html/land-ocr/venv/lib/python3.13/site-packages/ocrmypdf/builtin_plugins/tesseract_ocr.pyadd_optionsr1      s   $$[2UVDM   	!aK   	a   	"	@@A1   	UA+   	%UAD  
 	---M   	&S#u%   	J   	H      c           	        [        SSS0[        R                  S[        R                  S9  [        R                  " 5       nU[        R                  " S5      :X  a  [	        S5      eU R
                  S:X  aC  1 S	k[        U R                  5      -  (       a  [        R                  S
5        SU l        OSU l        [        R                  " 5       (       d%  U R                  S:w  a  [        R                  S5        U R                  S;   a  [        R                  S5        SS1nU[        U R                  5      -  (       a4  [        SSR                  U[        U R                  5      -  5       S35      eg )Nr   linuxztesseract-ocrz4.1.1)programpackageversion_checkerneed_versionversion_parserz5.4.0zzTesseract 5.4.0 is not supported due to regressions in this version. Please upgrade to a newer or supported older version.r    >   arafashebperz6Using sandwich renderer since there is an RTL languagesandwichhocrr   zThe installed version of Tesseract does not support changes to its thresholding method. The --tesseract-threshold argument will be ignored.)r      zdThe --tesseract-pagesegmode argument you select will disable OCR. This may cause processing to fail.equosdzZThe following languages for Tesseract's internal use and should not be issued explicitly: z, z-
Remove them from the -l/--language argument.)r   r   versionTesseractVersionr   pdf_rendererset	languagesloginfohas_thresholdingtesseract_thresholdingwarningtesseract_pagesegmoder
   join)optionstess_versionDENIED_LANGUAGESs      r0   check_optionsrR      sO   /*!)) 11 $$&Ly11'::$D
 	
 v%'#g.?.?*@@HHMN#-G #)G %%''G,J,Ja,O	

 $$.1	
 u~#g//00%yy)C0A0A,BBCD E;;
 	
 1r2   c                   [         R                  R                  SS5      R                  5       (       d@  [	        UR
                  [        U 5      -  SS5      n[        U5      [         R                  S'   O[        [         R                  S   5      n[        R                  SU5        UR                  S:w  a(  UR                  (       d  [        R                  S5        g g g )NOMP_THREAD_LIMIT       z&Using Tesseract OpenMP thread limit %dr$   zwThe --tesseract-downsample-above argument will have no effect unless --tesseract-downsample-large-images is also given.)osenvironget	isnumericr   jobslenstrr(   rH   debugtesseract_downsample_above!tesseract_downsample_large_imagesrL   )pdfinforO   tess_threadss      r0   validaterd      s     ::>>,b1;;==W\\S\91a@),\):

%&2::&89:II6E 	**e399A	
 : 	4r2   c                    [        U R                  R                  S5      nU R                  nUR                  (       a  [	        XU4SS9n[        X5      nU$ )zFilter the image before OCR.

Tesseract cannot handle images with more than 32767 pixels in either axis,
or more than 2**31 bytes. This function resizes the image to fit within
those limits.
r$   i)max_size	max_bytes)minrO   r`   ra   r   r   )pageimage	thresholdrO   sizes        r0   filter_ocr_imagerm      sQ     DLL;;UCIllG00#	2k
 !-Lr2   c                      \ rS rSrSr\S 5       r\S 5       rS r\S 5       r	\S 5       r
\SS j5       r\S	 5       r\S
 5       rSrg)TesseractOcrEngine   zImplements OCR with Tesseract.c                 >    [        [        R                  " 5       5      $ N)r^   r   rC    r2   r0   rC   TesseractOcrEngine.version   s    9$$&''r2   c                ^    U R                   S:X  a  SOSnSU S[        R                  5        3$ )Nr>   z-PDFz-hOCRzTesseract OCR )rE   ro   rC   )rO   tags     r0   creator_tagTesseractOcrEngine.creator_tag   s4    ,,
:fse1%7%?%?%A$BCCr2   c                0    S[         R                  5        3$ )NzTesseract OCR )ro   rC   )selfs    r0   __str__TesseractOcrEngine.__str__   s     2 : : <=>>r2   c                ,    [         R                  " 5       $ rr   )r   get_languages)rO   s    r0   rG   TesseractOcrEngine.languages   s    &&((r2   c                V    [         R                  " U UR                  UR                  S9$ )N)engine_modetimeout)r   get_orientationtesseract_oemtesseract_non_ocr_timeout
input_filerO   s     r0   r   "TesseractOcrEngine.get_orientation   s*    ((--55
 	
r2   c                l    [         R                  " U UR                  UR                  UR                  S9$ )N)rG   r   r   )r   
get_deskewrG   r   r   r   s     r0   r   TesseractOcrEngine.get_deskew  s3    ##''--55	
 	
r2   c                    [         R                  " U UUUR                  UR                  UR                  UR
                  UR                  UR                  UR                  UR                  S9  g )N)r   output_hocroutput_textrG   r   
tessconfigr   pagesegmodethresholding
user_wordsuser_patterns)
r   generate_hocrrG   r   tesseract_configtesseract_timeoutrM   rK   r   r   )r   r   r   rO   s       r0   r    TesseractOcrEngine.generate_hocr
  sa    !##''--//--55 77))!//	
r2   c                    [         R                  " U UUUR                  UR                  UR                  UR
                  UR                  UR                  UR                  UR                  S9  g )N)r   
output_pdfr   rG   r   r   r   r   r   r   r   )
r   generate_pdfrG   r   r   r   rM   rK   r   r   )r   r   r   rO   s       r0   r   TesseractOcrEngine.generate_pdf  sa    !!#''--//--55 77))!//	
r2   rs   N)returnr+   )__name__
__module____qualname____firstlineno____doc__staticmethodrC   rx   r|   rG   r   r   r   r   __static_attributes__rs   r2   r0   ro   ro      s    (( ( D D? ) ) 
 
 
 
 
 
 
 
r2   ro   c                     [        5       $ rr   )ro   rs   r2   r0   get_ocr_enginer   +  s    r2   )ri   r   rj   Image.Imager   r   )&r   
__future__r   r,   loggingrX   PILr   ocrmypdfr   ocrmypdf._execr   ocrmypdf._jobcontextr   ocrmypdf.clir   r	   ocrmypdf.exceptionsr
   r   ocrmypdf.helpersr   ocrmypdf.imageopsr   r   ocrmypdf.pluginspecr   ocrmypdf.subprocessr   	getLoggerr   rH   r1   rR   rd   rm   ro   r   rs   r2   r0   <module>r      s    8 "   	   $ , , D " D ) 6! 
m 
m` 
)
 
)
X 

 

4 
 
$B
 B
J 
  
 r2   