
    TAiC                       S r SSKJr  SSKrSSKJrJr  SSKJrJ	r	  SSK
Jr  SSK
Jr  SSKJr  \	" S	\\5      rSSS
 jjrSS jr " S S\R,                  5      r " S S\R.                  5      rS r\" \SSSS9r\R7                  SSS/ SS9  g)z4Command line interface customization and validation.    )annotationsN)CallableMapping)AnyTypeVar)DEFAULT_ROTATE_PAGES_THRESHOLD)PROGRAM_NAME)__version__Tc                v   ^ ^^ Tb  T " T5      OSmTb  T " T5      OSmSU UU4S jjnT R                   Ul         U$ )zValidator for numeric command line parameters.

Stipulates that the value must be of type basetype (typically int or float), and
optionally, within the range [min_, max_].
Nc                z   > T" U 5      nTb  UT:  d	  Tb%  UT:  a  [         R                  " U < STT4< 35      eU$ )Nz not in valid range )argparseArgumentTypeError)svaluebasetypemax_min_s     H/var/www/html/land-ocr/venv/lib/python3.13/site-packages/ocrmypdf/cli.py_numericnumeric.<locals>._numeric   sR    43CPT,,%+T4L+;<      )r   strreturnr   )__name__)r   r   r   r   s   ``` r   numericr      sD     "-8D>4D!-8D>4D  !))HOr   c                   ^  SU 4S jjnU$ )z3Accept text on command line and convert to integer.c           	        >  TU    $ ! [          a;    [        R                  " U < SSR                  TR	                  5       5       35      S ef = f)Nz must be one of: z, )KeyErrorr   r   joinkeys)r   mappings    r   _str_to_intstr_to_int.<locals>._str_to_int+   sV    	1: 	,,%(7<<>)B(CD	s
    AA)r   r   r   int )r"   r#   s   ` r   
str_to_intr'   (   s     r   c                  >   ^  \ rS rSrSrU 4S jrS rU 4S jrSrU =r	$ )ArgumentParser6   a  Override parser's default behavior of calling sys.exit().

https://stackoverflow.com/questions/5943249/python-argparse-and-controlling-overriding-the-exit-status-code

OCRmyPDF began as a CLI but eventually acquired an API. The API works inside out,
by synthesizing a command line argument. So we subclass the standard parser with
one that doesn't call sys.exit(). Obviously this is not the ideal way to do things
but it works for us.
c                4   > [         TU ]  " U0 UD6  SU l        g)zInitialize the parser.FN)super__init__	_api_mode)selfargskwargs	__class__s      r   r-   ArgumentParser.__init__A   s    $)&)r   c                    SU l         g)zEnable API mode.

When set, the parser will not call sys.exit() on error. OCRmyPDF was originally
a command line program, but now it has an API. The API works by synthesizing
command line arguments.
TNr.   )r/   s    r   enable_api_modeArgumentParser.enable_api_modeF   s     r   c                \   > U R                   (       d  [        TU ]	  U5        g[        U5      e)z-Override the default argparse error behavior.N)r.   r,   error
ValueError)r/   messager2   s     r   r9   ArgumentParser.errorO   s$    ~~GM'"!!r   r5   )
r   
__module____qualname____firstlineno____doc__r-   r6   r9   __static_attributes____classcell__r2   s   @r   r)   r)   6   s    
" "r   r)   c                  :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )LanguageSetActionW   zManages a list of languages.c                F   > Uc
  [        5       n[        TU ]  " X4SU0UD6  g)zInitialize the action.Ndefault)listr,   r-   )r/   option_stringsdestrH   r1   r2   s        r   r-   LanguageSetAction.__init__Z   s&    ?fGIwI&Ir   c                    [        X R                  5      nSU;   a0  UR                  S5       Vs/ s H  oeR                  U5      PM       ngUR                  U5        gs  snf )zAdd a language to the set.+N)getattrrK   splitappend)r/   parser	namespacevaluesoption_stringrK   langs          r   __call__LanguageSetAction.__call__`   sL    y)),&=+1<<+<=+<4[[+<=KK >s   Ar&   )N)	r   r=   r>   r?   r@   r-   rW   rA   rB   rC   s   @r   rE   rE   W   s    &J   r   rE   c            
        [        [        SS[        R                  SSS9n U R	                  SSSS	9  U R	                  S
SSS	9  U R	                  SSS[
        SS9  U R	                  SS[        SS9  U R	                  S/ SQSSS9  U R	                  SSSSSS S!9  U R	                  S"S#[        S$S%9  U R                  S&5      nUR	                  S'S(S)[        [        S*S+5      S,S9  UR	                  S-S.S/S0S19  UR	                  S2S3[        [        S*S45      S*S5SS6S79  UR	                  S8S9S:[        R                  S;9  UR	                  S<S/S[        R                  S=9  UR	                  S>S9S?[        R                  S;9  U R                  S@SA5      nUR	                  SB[        SCSD9  UR	                  SE[        SFSD9  UR	                  SG[        SHSD9  UR	                  SI[        SJSD9  U R                  SKSL5      nUR	                  SMSNS/SOS19  UR	                  SPS/SQS19  UR	                  SRSSS/STS19  UR	                  SUSVS/SWS19  UR	                  SXSYS/SZS19  UR	                  S[[        SS\S]9  UR	                  S^S[        [        S*S_5      S*S`Sa9  UR	                  SbS/ScS19  U R                  SdSe5      nUR	                  SfSgS/ShS19  UR	                  SiSjS/SkS19  UR	                  SlS/SmS19  UR	                  Sn[        [        S*S_5      SoSpSq9  UR	                  SrS/SsS19  U R                  StSu5      nUR	                  Sv[        SwSD9  UR	                  SxSy[        [        S*5      SoSzS{S|9  UR	                  S}/ S~QSSS9  UR	                  S[        [        [        S*S5      SSS9  UR	                  S[        [        S*5      SSSS9  UR	                  SS/SS19  UR	                  SSS/ SS9  U R                  SS5      nUR	                  SSS/SS19  U $ )zGet the main CLI parser.T@a  Generates a searchable PDF or PDF/A from a regular PDF.

OCRmyPDF rasterizes each page of the input PDF, optionally corrects page
rotation and performs image processing, runs the Tesseract OCR engine on the
image, and then creates a PDF from the OCR information.
a?  OCRmyPDF attempts to keep the output file at about the same size.  If a file
contains losslessly compressed images, and images in the output file will be
losslessly compressed as well.

PDF is a page description file that attempts to preserve a layout exactly.
A PDF can contain vector objects (such as text or lines) and raster objects
(images).  A page might have multiple images.  OCRmyPDF is prepared to deal
with the wide variety of PDFs that exist in the wild.

When a PDF page contains text, OCRmyPDF assumes that the page has already
been OCRed or is a "born digital" page that should not be OCRed.  The default
behavior is to exit in this case without producing a file.  You can use the
option --skip-text to ignore pages with text, or --force-ocr to rasterize
all objects on the page and produce an image-only PDF as output.

    ocrmypdf --skip-text file_with_some_text_pages.pdf output.pdf

    ocrmypdf --force-ocr word_document.pdf output.pdf

If you are concerned about long-term archiving of PDFs, use the default option
--output-type pdfa which converts the PDF to a standardized PDF/A-2b.  This
removes some features from the PDF such as Javascript or forms. If you want to
minimize the number of changes made to your PDF, use --output-type pdf.

If OCRmyPDF is given an image file as input, it will attempt to convert the
image to a PDF before processing.  For more control over the conversion of
images to PDF, use the Python package img2pdf or other image to PDF software.

For example, this command uses img2pdf to convert all .png files beginning
with the 'page' prefix to a PDF, fitting each image on A4-sized paper, and
sending the result to OCRmyPDF through a pipe.

    img2pdf --pagesize A4 page*.png | ocrmypdf - myfile.pdf

Online documentation is located at:
    https://ocrmypdf.readthedocs.io/en/latest/introduction.html

)progallow_abbrevfromfile_prefix_charsformatter_classdescriptionepilog
input_fileinput_pdf_or_imagezOPDF file containing the images to be OCRed (or '-' to read from standard input))metavarhelpoutput_file
output_pdfzOutput searchable PDF file (or '-' to write to standard output). Existing files will be overwritten. If same as input file, the input file will be updated only if processing is successful.z-lz
--language	languageszLanguage(s) of the file to be OCRed (see tesseract --list-langs for all language packs installed in your system). Use -l eng+deu for multiple languages.)rK   actionrd   z--image-dpiDPIzWhen the input file is an image, not a PDF, use this DPI instead of the DPI claimed by the input file. If the input does not claim a sensible DPI, this option will be required.)rc   typerd   z--output-type)pdfapdfzpdfa-1zpdfa-2zpdfa-3nonerk   a  Choose output type. 'pdfa' creates a PDF/A-2b compliant file for long term archiving (default, recommended) but may not suitable for users who want their file altered as little as possible. 'pdfa' also has problems with full Unicode text. 'pdf' minimizes changes to the input file. 'pdf-a1' creates a PDF/A-1b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a PDF/A-3b file. 'none' will produce no output, which may be helpful if only the --sidecar is desired.)choicesrH   rd   z	--sidecar? NFILEa  Generate sidecar text files that contain the same text recognized by Tesseract. This may be useful for building a OCR text database. If FILE is omitted, the sidecar file be named {output_file}.txt; the next argument must NOT be the name of the input PDF. If FILE is set to '-', the sidecar is written to stdout (a convenient way to preview OCR quality). The output file and sidecar may not both use stdout at the same time.)nargsconstrH   rc   rd   z	--versionversionzPrint program version and exit)rh   rt   rd   zJob control optionsz-jz--jobsNr      z8Use up to N CPU cores simultaneously (default: use all).z-qz--quiet
store_truezSuppress INFO messages)rh   rd   z-vz	--verbose      zPrint more verbose messages for each additional verbose level. Use `-v 1` typically for much more detailed logging. Higher numbers are probably only useful in debugging.)rj   rH   rs   rr   rd   z--no-progress-barstore_falseprogress_bar)rh   rK   rd   z--use-threads)rh   rH   rd   z--no-use-threadsuse_threadszMetadata optionszCSet output PDF/A metadata (default: copy input document's metadata)z--titlez3Set document title (place multiple words in quotes))rj   rd   z--authorzSet document authorz	--subjectz Set document subject descriptionz
--keywordszSet document keywordszImage preprocessing optionsz7Options to improve the quality of the final PDF and OCRz-rz--rotate-pagesz=Automatically rotate pages based on detected text orientationz--remove-backgroundzKAttempt to remove background from gray or color pages, setting it to white z-dz--deskewz&Deskew each page before performing OCRz-cz--cleanzClean pages from scanning artifacts before performing OCR, and send the cleaned page to OCR, but do not include the cleaned page in the outputz-iz--clean-finalzgClean page as above, and incorporate the cleaned image in the final PDF.  Might remove desired content.z--unpaper-argszmA quoted string of arguments to pass to unpaper. Requires --clean. Example: --unpaper-args '--layout double'.)rj   rH   rd   z--oversamplei  zPOversample images to at least the specified DPI, to improve OCR results slightly)rc   rj   rH   rd   z--remove-vectorszEXPERIMENTAL. Mask out any vector objects in the PDF so that they will not be included in OCR. This can eliminate false characters.zOCR optionszControl how OCR is appliedz-fz--force-ocrzrRasterize any text or vector objects on each page, apply OCR, and save the rastered output (this rewrites the PDF)z-sz--skip-textzSkip OCR on any pages that already contain text, but include the page in final output; useful for PDFs that contain a mix of images, text pages, and/or previously OCRed pagesz
--redo-ocra  Attempt to detect and remove the hidden OCR layer from files that were previously OCRed with OCRmyPDF or another program. Apply OCR to text found in raster images. Existing visible text objects will not be changed. If there is no existing OCR, OCR will be added.z
--skip-bigMPixelszkSkip OCR on pages larger than the specified amount of megapixels, but include skipped pages in final output)rj   rc   rd   z--invalidate-digital-signatureszNormally, OCRmyPDF will refuse to OCR a PDF that has a digital signature. This option allows OCR to proceed, but the digital signature will be invalidated.Advancedz$Advanced options to control OCRmyPDFz--pageszMLimit OCR to the specified pages (ranges or comma separated), skipping othersz--max-image-mpixelsstorez[Set maximum number of megapixels to unpack before treating an image as a decompression bombg     @o@)rh   rj   rc   rd   rH   z--pdf-renderer)autohocrsandwich	hocrdebugr   zjChoose OCR PDF renderer - the default option is to let OCRmyPDF choose.  See documentation for discussion.z--rotate-pages-thresholdi  
CONFIDENCEz]Only rotate pages when confidence is above this value (arbitrary units reported by tesseract))rH   rj   rc   rd   z--fast-web-viewg      ?	MEGABYTESa~  If the size of file is more than this threshold (in MB), then linearize the PDF for fast web viewing. This allows the PDF to be displayed before it is fully downloaded in web browsers, but increases the space required slightly. By default we skip this for small files which do not benefit. If the threshold is 0 it will be apply to all files. Set the threshold very high to disable.)rj   rH   rc   rd   z--continue-on-soft-render-errora  Continue processing pages after a recoverable PDF rendering error. A recoverable error is one that does not prevent the page from being rendered, but may result in visual differences compared to the input file. Missing fonts are a typical source of these errors.--pluginpluginsrQ   aw  Name of plugin to import. Argument may be issued multiple times to import multiple plugins. Plugins may be specified as module names in Python syntax, provided they are installed in the same Python (virtual) environment as ocrmypdf; or you may give the path to the Python file that contains the plugin. Plugins must conform to the specification in the OCRmyPDF documentation.rK   rh   rH   rd   	Debuggingz4Arguments to help with troubleshooting and debuggingz-kz--keep-temporary-filesz,Keep temporary files (helpful for debugging))r)   _PROGRAM_NAMEr   RawDescriptionHelpFormatteradd_argumentrE   r%   _VERSIONadd_argument_groupr   SUPPRESSr   floatr   )rR   
jobcontrolmetadatapreprocessingocrsettingsadvanced	debuggings          r   
get_parserr   i   s	   ! <<&3Fj $   G       6	   E)	  " 4   -	   **+@AJS!S!G   i3K   S!Q1  
 	   dARAR   	   ((MH "W   *35JK#$F   ,S7NO--%AM L	      5	   	   .	   5	   S!T"   L   ++M;WXK;	   <	   J   UAt$4	   )   ((:H 	   UA   95	   ".UAt$'   UA2   )D   "   ))KI  ;	   Mr   rZ   F)r[   r]   add_helpr\   r   r   rQ   zName of plugin to import.r   )NN)r   zCallable[[Any], T]r   T | Noner   r   )r"   zMapping[str, int])r@   
__future__r   r   collections.abcr   r   typingr   r   ocrmypdf._defaultsr   r	   r   ocrmypdf._versionr
   r   r%   r   r   r   r'   r)   ActionrE   r   plugins_only_parserr   r&   r   r   <module>r      s    ; "  -  = < 5Ce*"X,, "B   $^B %	cEPU      		$ ! r   