
    TAiiU              4      @   S r SSKJr  SSKrSSKrSSKrSSKrSSKJr  SSK	J
r
Jr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKrSSKJr  SSKJr  SSKJrJr  SSKJr  SSK J!r!  SSK"J#r#  SSK$J%r%J&r&  SSK'J(r(  \\)-  \*-  r+\\+-  r,\RZ                  " 5       r. " S S\5      r/SSSS.       S"S jjr0    S#S jr1        S$S jr2SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.3                                                                                                       S%S jjr3SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.&                                                                             S&S jjr4SSSSSSSSSSSSSS.                           S'S  jjr5/ S!Qr6g)(a"  Python API for OCRmyPDF.

This module provides the main Python API for OCRmyPDF, allowing you to perform
OCR operations programmatically without using the command line interface.

Main Functions:
    ocr(): The primary function for OCR processing. Takes an input PDF or image
        file and produces an OCR'd PDF with searchable text.

    configure_logging(): Set up logging to match the command line interface
        behavior, with support for progress bars and colored output.

Experimental Functions:
    _pdf_to_hocr(): Extract text from PDF pages and save as hOCR files for
        manual editing before final PDF generation.

    _hocr_to_ocr_pdf(): Convert hOCR files back to a searchable PDF after
        manual text corrections.

The API maintains thread safety through internal locking since OCRmyPDF uses
global state for plugins. Only one OCR operation can run per Python process
at a time. For parallel processing, use multiple Python processes.

Example:
    import ocrmypdf

    # Configure logging (optional)
    ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)

    # Perform OCR
    ocrmypdf.ocr('input.pdf', 'output.pdf', language='eng')

For detailed parameter documentation, see the ocr() function docstring and
the equivalent command line parameters in the OCRmyPDF documentation.
    )annotationsN)	Namespace)IterableSequence)IntEnum)IOBase)Path)BinaryIO)warn)PageNumberFilter)run_hocr_to_ocr_pdf_pipeline)run_pipelinerun_pipeline_cli)run_hocr_pipeline)get_plugin_manager)check_options)ArgumentParser
get_parser)is_iterable_notstrc                  (    \ rS rSrSrSrSrSrSrSr	g)		VerbosityJ   z&Verbosity level for configure_logging.r          N)
__name__
__module____qualname____firstlineno____doc__quietdefaultdebug	debug_all__static_attributes__r       H/var/www/html/land-ocr/venv/lib/python3.13/site-packages/ocrmypdf/api.pyr   r   J   s    0 EGEIr'   r   TF)progress_bar_friendlymanage_root_loggerplugin_managerc                  U(       a  SOSn[         R                  " U5      nUR                  [         R                  5        SnU(       a!  U(       a  UR                  R                  5       nU(       d"  [         R                  " [        R                  S9nU S:  a   UR                  [         R                  5        OEU S:  a   UR                  [         R                  5        OUR                  [         R                  5        UR                  [        5       5        U S:  a  SnOS	nSnU(       d  [         R                  " US
9nUR                  U5        UR                  U5        U S::  aj  [         R                  " S5      n	U	R                  [         R                  5        [         R                  " S5      n
U
R                  [         R                  5        U(       a  [         R                   " S5        U$ )a  Set up logging.

Before calling :func:`ocrmypdf.ocr()`, you can use this function to
configure logging if you want ocrmypdf's output to look like the ocrmypdf
command line interface. It will register log handlers, log filters, and
formatters, configure color logging to standard error, and adjust the log
levels of third party libraries. Details of this are fine-tuned and subject
to change. The ``verbosity`` argument is equivalent to the argument
``--verbose`` and applies those settings. If you have a wrapper
script for ocrmypdf and you want it to be very similar to ocrmypdf, use this
function; if you are using ocrmypdf as part of an application that manages
its own logging, you probably do not want this function.

If this function is not called, ocrmypdf will not configure logging, and it
is up to the caller of ``ocrmypdf.ocr()`` to set up logging as it wishes using
the Python standard library's logging module. If this function is called,
the caller may of course make further adjustments to logging.

Regardless of whether this function is called, ocrmypdf will perform all of
its logging under the ``"ocrmypdf"`` logging namespace. In addition,
ocrmypdf imports pdfminer, which logs under ``"pdfminer"``. A library user
may wish to configure both; note that pdfminer is extremely chatty at the
log level ``logging.INFO``.

This function does not set up the ``debug.log`` log file that the command
line interface does at certain verbosity levels. Applications should configure
their own debug logging.

Args:
    verbosity: Verbosity level.
    progress_bar_friendly: If True (the default), install a custom log handler
        that is compatible with progress bars and colored output.
    manage_root_logger: Configure the process's root logger.
    plugin_manager: The plugin manager, used for obtaining the custom log handler.

Returns:
    The toplevel logger for ocrmypdf (or the root logger, if we are managing it).
 ocrmypdfN)streamr   r   r   z/%(levelname)7s %(name)s -%(pageno)s %(message)sz%(pageno)s%(message)s)fmtpdfminerPILT)logging	getLoggersetLevelDEBUGhookget_logging_consoleStreamHandlersysstderrERRORINFO	addFilterr   	FormattersetFormatter
addHandlercaptureWarnings)	verbosityr)   r*   r+   prefixlogconsoler0   	formatterpdfminer_logpil_logs              r(   configure_loggingrJ   T   s^   Z &R:F


F
#CLLG/ %%99;''szz:1}'	a'&&()A~?%I%%#.	#NN7A~((4gmm,##E*&%Jr'   c           
        / n0 nUR                  5        GH[  u  pEUc  M  X@;   a  XSU'   M  UR                  SS5      n[        U[        5      (       a  U(       a  UR	                  SU 35        MZ  [        U5      (       a0  U H(  nUR	                  SU 35        UR	                  U5        M*     M  UR	                  SU 35        [        U[        [        -  5      (       a  UR	                  [        U5      5        M  [        U[        5      (       a  UR	                  U5        GM  [        U[        5      (       a  UR	                  [        U5      5        GMA  [        U SU S[        U5       S35      e   X#4$ )z)Convert kwargs to command line arguments._-z--z: z ())itemsreplace
isinstanceboolappendr   intfloatstrr	   	TypeErrortype)defer_kwargskwargscmdlinedeferredargvalcmd_style_argelems           r(   _kwargs_to_cmdlinera      sF    "$GHLLN; SMC- c4  M?34c""M?34t$   	M?+,c3;''NN3s8$S!!NN3T""NN3s8$se2cU"T#YKq9::A #B r'   c                   [        SS1 Sk0UD6u  pE[        U [        [        -  5      (       a  UR	                  S5        O%UR	                  [
        R                  " U 5      5        [        U[        [        -  5      (       a  UR	                  S5        O%UR	                  [
        R                  " U5      5        SU;   aA  [        US   [        [        -  5      (       a"  UR	                  S5        UR	                  S5        UR                  5         UR                  U5      nUR                  5        H  u  px[        XgU5        M     UR                  S:X  a  Xl        UR                  S:X  a  Xl        UR                  S:X  a
  US   Ul        U$ )	a  Construct an options object from the input/output files and keyword arguments.

Args:
    input_file: Input file path or file object.
    output_file: Output file path or file object.
    parser: ArgumentParser object.
    **kwargs: Keyword arguments.

Returns:
    argparse.Namespace: A Namespace object containing the parsed arguments.

Raises:
    TypeError: If the type of a keyword argument is not supported.
rY   >   parserplugins
input_fileoutput_fileprogress_barzstream://input_filezstream://output_filesidecarz	--sidecarzstream://sidecarr   )ra   rQ   r
   r   rS   osfspathenable_api_mode
parse_argsrO   setattrre   rf   rh   )	re   rf   rc   rZ   r[   r\   optionskeywordr^   s	            r(   create_optionsrp      sE   " + W
G *h/00,-ryy,-+x&011-.ryy-.Fz&*;X=NOO{#)*
(G (#& ) 22'44),, +Nr'   )3language	image_dpioutput_typerh   jobsuse_threadstitleauthorsubjectkeywordsrotate_pagesremove_backgrounddeskewcleanclean_finalunpaper_args
oversampleremove_vectors	force_ocr	skip_textredo_ocrskip_bigoptimizejpg_qualitypng_qualityjbig2_lossyjbig2_page_group_sizejbig2_thresholdpagesmax_image_mpixelstesseract_configtesseract_pagesegmodetesseract_oemtesseract_thresholdingpdf_renderertesseract_timeouttesseract_non_ocr_timeouttesseract_downsample_above!tesseract_downsample_large_imagesrotate_pages_thresholdpdfa_image_compressioncolor_conversion_strategy
user_wordsuser_patternsfast_web_viewcontinue_on_soft_render_errorinvalidate_digital_signaturesrd   r+   keep_temporary_filesrg   c       3        P   U1(       a  U2(       a  [        S5      eU1(       d  / n1O+[        U1[        [        -  5      (       a  U1/n1O[	        U15      n1[        5       R                  5        V6V7s0 s H  u  n6n7U6S;  d  M  U6U7_M     n8n6n7U8R                  U55        [        5       n9[           U2(       d  [        U15      n2U2R                  R                  U9S9  SU5;   a  [        S5        [        S	U UU9S.U8D6n:[        U:U25        [!        U:U2S9sSSS5        $ s  sn7n6f ! , (       d  f       g= f)
aB  Run OCRmyPDF on one PDF or image.

For most arguments, see documentation for the equivalent command line parameter.

This API takes a threading lock, because OCRmyPDF uses global state in particular
for the plugin system. The jobs parameter will be used to create a pool of
worker threads or processes at different times, subject to change. A Python
process can only run one OCRmyPDF task at a time.

To run parallelize instances OCRmyPDF, use separate Python processes to scale
horizontally. Generally speaking you should set jobs=sqrt(cpu_count) and run
sqrt(cpu_count) processes as a starting point. If you have files with a high page
count, run fewer processes and more jobs per process. If you have a lot of short
files, run more processes and fewer jobs per process.

A few specific arguments are discussed here:

Args:
    use_threads: Use worker threads instead of processes. This reduces
        performance but may make debugging easier since it is easier to set
        breakpoints.
    input_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
        interpreted as file system path to the input file. If the object
        appears to be a readable stream (with methods such as ``.read()``
        and ``.seek()``), the object will be read in its entirety and saved to
        a temporary file. If ``input_file`` is  ``"-"``, standard input will be
        read.
    output_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
        interpreted as file system path to the output file. If the object
        appears to be a writable stream (with methods such as ``.write()`` and
        ``.seek()``), the output will be written to this stream. If
        ``output_file`` is ``"-"``, the output will be written to ``sys.stdout``
        (provided that standard output does not seem to be a terminal device).
        When a stream is used as output, whether via a writable object or
        ``"-"``, some final validation steps are not performed (we do not read
        back the stream after it is written).

Raises:
    ocrmypdf.MissingDependencyError: If a required dependency program is missing or
        was not found on PATH.
    ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
        could not be read, or some other file type that is not a PDF.
    ocrmypdf.DpiError: If the input file is an image, but the resolution of the
        image is not credible (allowing it to proceed would cause poor OCR).
    ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
        file failed.
    ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
        text already, and settings did not tell us to proceed.
    ocrmypdf.InputFileError: Any other problem with the input file.
    ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
    ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
        OCRmyPDF does not remove passwords.
    ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
        valid.

Returns:
    :class:`ocrmypdf.ExitCode`
z2plugins= and plugin_manager are mutually exclusive>   rZ   re   rf   r+   rc   verbosezDocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging().)re   rf   rc   rn   r+   Nr   )
ValueErrorrQ   rV   r	   listlocalsrO   updater   	_api_lockr   r7   add_optionsr   rp   r   r   );re   rf   rq   rr   rs   rh   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rd   r+   r   rg   rZ   kvcreate_options_kwargsrc   rn   s;                                                              r(   ocrr   	  s   f >MNN	GS4Z	(	()w-
 HNN$$DAqMM 	1$  
   (\F	/8N''v'6WX  
!#
 $	
 	g~.GNK 
 
s   *D;D&A!D
D%)&rq   rr   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r+   rd   r   c       &           [        5       R                  5        V)V*s0 s H  u  n)n*U)S;  d  M  U)U*_M     n+n)n*U+R                  U(5        [        5       n,[           U%(       d  [        U&5      n%U%R                  R                  U,S9  [        S	S1 Sk0U+D6u  n-n.U-R                  [        U 5      5        U-R                  [        U5      5        U,R                  5         U,R                  U-5      n/U.R                  5        H  u  n0n1[        U/U0U15        M     [        U/S5        [        U/SU5        [        U/U%S9sSSS5        $ s  sn*n)f ! , (       d  f       g= f)
a  Partially run OCRmyPDF and produces an output folder containing hOCR files.

Given a PDF file, this function will run OCRmyPDF up to the point where
the PDF is rasterized to images, OCRed, and the hOCR files are produced,
all of which are saved to the output folder. This is useful for applications
that want to provide an interface for users to edit the text before
rendering the final PDF.

Use :func:`hocr_to_ocr_pdf` to produce the final PDF.

For arguments not explicitly documented here, see documentation for the
equivalent command line parameter.

This API is **experimental** and subject to change.

Args:
    input_pdf: Input PDF file path.
    output_folder: Output folder path.
    **kwargs: Keyword arguments.
>   rZ   	input_pdfoutput_folderr   rY   >   rd   r   r   rf   r   r   Nr   )r   rO   r   r   r   r   r7   r   ra   rS   rV   rk   rl   rm   delattrr   )2r   r   rq   rr   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r+   rd   r   rZ   r   r   r   rc   r[   r\   rn   ro   r^   s2                                                     r(   _pdf_to_hocrr     s2   F HNN$$DAq<< 	1$  
   (\F	/8N''v'6. 
B
#
 	s9~&s=)* ##G,$NN,LGSGWc* -'-8 P% 
 
   D>D>CE
E)rt   ru   r   r   r   r   r   r   r   r   r   r+   rd   c                  [        5       R                  5        VVs0 s H  u  nnUS;  d  M  UU_M     nnnUR                  U5        [        5       n[           U(       d  [        U5      nUR                  R                  US9  [        S	S1 Sk0UD6u  nnUR                  [        U 5      5        UR                  [        U5      5        UR                  5         UR                  U5      nUR                  5        H  u  nn[        UUU5        M     [        US5        [        USU 5        [        UUS9sSSS5        $ s  snnf ! , (       d  f       g= f)
aM  Run OCRmyPDF on a work folder and produce an output PDF.

After running :func:`pdf_to_hocr`, this function will run OCRmyPDF on the work
folder to produce an output PDF. This function consolidates any changes made
to the hOCR files in the work folder and produces a final PDF.

For arguments not explicitly documented here, see documentation for the
equivalent command line parameter.

This API is **experimental** and subject to change.

Args:
    work_folder: Work folder path, as generated by :func:`pdf_to_hocr`.
    output_file: Output PDF file path.
    **kwargs: Keyword arguments.
>   rZ   
output_pdfwork_folderr   rY   >   rd   rf   r   re   r   r   Nr   )r   rO   r   r   r   r   r7   r   ra   rS   rV   rk   rl   rm   r   r   )r   rf   rt   ru   r   r   r   r   r   r   r   r   r   r+   rd   rZ   r   r   r   rc   r[   r\   rn   ro   r^   s                            r(   _hocr_to_ocr_pdfr      s6   L HNN$$DAq;; 	1$  
   (\F	/8N''v'6. 
B
#
 	s;'(s;'( ##G,$NN,LGSGWc* -&4+N
% 
 
r   )
r   r   r   rJ   rp   r   r   r   r   r   )rC   r   r)   rR   r*   rR   r+   zpluggy.PluginManager | None)rY   zset[str]returnz0tuple[list[str | bytes], dict[str, str | bytes]])re   PathOrIOrf   r   rc   r   r   r   )hre   r   rf   r   rq   Iterable[str] | Nonerr   
int | Noners   
str | Nonerh   zPathOrIO | Nonert   r   ru   bool | Nonerv   r   rw   r   rx   r   ry   r   rz   r   r{   r   r|   r   r}   r   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   float | Noner   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   os.PathLike | Noner   r   r   r   r   r   r   r   rd   zIterable[Path | str] | Noner   r   rg   r   )Nr   r	   r   r	   rq   r   rr   r   rt   r   ru   r   rv   r   rw   r   rx   r   ry   r   rz   r   r{   r   r|   r   r}   r   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rd   Sequence[Path | str] | Noner   r   )r   r	   rf   r	   rt   r   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rd   r   )7r!   
__future__r   r3   ri   r:   	threadingargparser   collections.abcr   r   enumr   ior   pathlibr	   typingr
   warningsr   pluggyocrmypdf._loggingr   #ocrmypdf._pipelines.hocr_to_ocr_pdfr   ocrmypdf._pipelines.ocrr   r   ocrmypdf._pipelines.pdf_to_hocrr   ocrmypdf._plugin_managerr   ocrmypdf._validationr   ocrmypdf.clir   r   ocrmypdf.helpersr   rV   bytesStrPathr   Lockr   r   rJ   ra   rp   r   r   r   __all__r   r'   r(   <module>r      s^  "H #  	 
   .       . L B = 7 . 3 /
*u
g
 NN	  #'$26XX  X 	X
 0Xv''5'T--*2-<J--h &* "## $%)##!"&!! !""#(,$(&*-1(, $)-#&*.2-159+/)-,0%)(,"&1515+/(, $mULULUL #	UL
 UL UL UL UL UL UL UL UL UL UL #UL  !UL" #UL$ %UL& 'UL( )UL*  +UL, -UL. /UL0 1UL2 3UL4 5UL6 7UL8 9UL: ;UL< &=UL> "?UL@ AULB $CULD +EULF &GULH IULJ 'KULL MULN $OULP  ,QULR !+SULT (3UULV )WULX 'YULZ  *[UL\ #]UL^ &_UL`  aULb $/cULd $/eULf )gULj &kULl mULx &* # $%)##!"&!! !&*-1(, $)-&*.2-159+/%)(,1515+/(,S\Q\Q\Q #	\Q
 \Q \Q \Q \Q \Q \Q \Q \Q #\Q \Q \Q  !\Q" #\Q$ %\Q&  '\Q( )\Q* +\Q, -\Q. /\Q0 1\Q2 $3\Q4 +5\Q6 &7\Q8 9\Q: ';\Q< $=\Q>  ,?\Q@ !+A\QB (3C\QD )E\QF #G\QH &I\QJ $/K\QL $/M\QP )Q\QR &S\QF #""#(,$()-,0"&+/!A
A
A
 	A

 A
 A
 A
 A
 A
 &A
 "A
 'A
  *A
  A
  )!A
Hr'   