
    TAi                    :   S r SSKJr  SSKrSSKrSSKrSSKrSSKJrJ	r	J
r
  SSKJr  SSKJr  SSKJr  SSKJr  SS	KJrJrJrJr  SSKrSSKrSS
KJrJrJr  SSKJr  SSK J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*J+r+J,r,J-r-J.r.  SSK/J0r0J1r1J2r2  SSK3J4r4J5r5  SSK6J7r7  SSK8J9r9  SSK:J;r;J<r<J=r=J>r>  SSK?J@r@   SSKAJBrB  \" S5      rD\R                  " \F5      rGSrH\B" 5         SAS jrISBSCS jjrJ        SDS jrKSSSSSS.           SES  jjrLSFS! jrMSGS" jrN SH     SIS# jjrO SH     SIS$ jjrPSJS% jrQSKS& jrR        SLS' jrSSMS( jrTSNS) jrUSOS* jrV   SP           SQS+ jjrWSKS, jrXSKS- jrYSKS. jrZSRS/ jr[SSS0 jr\STS1 jr]SRS2 jr^        SUS3 jr_SVS4 jr`      SWS5 jraSXS6 jrb SY         SZS7 jjrcS[S8 jrdS\S9 jreS]S: jrfS^S; jrg      S_S< jrh        S`S= jri    SaS> jrjSbS? jrk        ScS@ jrlg! \C a    S rB GN]f = f)dz,OCRmyPDF page processing pipeline functions.    )annotationsN)IterableIteratorSequence)suppress)BytesIO)Path)copyfileobj)AnyBinaryIOTypeVarcast)Image
ImageColor	ImageDraw)Executor)unpaper)PageContext
PdfContext)repair_docinfo_nuls)DigitalSignatureErrorDpiErrorEncryptedPdfErrorInputFileErrorPriorOcrFoundErrorTaggedPDFErrorUnsupportedImageFormatError)IMG2PDF_KWARGS
Resolutionsafe_symlink)DebugRenderOptionsHocrTransform)Courier)generate_pdfa_ps)
ColorspaceEncodingPageInfoPdfInfo)OrientationConfidence)register_heif_openerc                     g N r-       N/var/www/html/land-ocr/venv/lib/python3.13/site-packages/ocrmypdf/_pipeline.pyr*   r*   0   s    r.   Ti  c           	     V   [         R                  S5         [        R                  " U 5      nU   [         R                  S5        S	UR                  ;   au  UR                  S	   S
::  aa  UR                  (       dP  [         R                  " S/UR                   Q76   [         R                  " S/UR                  S	   Q76   [#        S5      eO=UR                  (       d,  [         R                  " S/UR                   Q76   [#        S5      eUR$                  S;   a  [        S5      eSUR                  ;  aA  UR$                  S:X  a  [         R                  S5        OUR$                  S:X  a  [        S5      eSSS5         [         R                  S5        [&        R(                  nUR                  (       a4  [&        R*                  " [-        UR                  UR                  5      5      n[        US5       n[&        R.                  " [0        R2                  " U 5      4UUS.[4        D6  SSS5        [         R                  S5        g! [         Ga  n[         R                  [        U5      R                  [        U 5      [        UR                  5      5      5        U R                  5       (       d  [         R                  SU 5        U R                  5       (       a  [         R                  SU 5        U R                  5       (       a  [         R                  SU 5        U R                  5       R                  S:X  a  [         R                  SU 5        [        5       UeSnAff = f! , (       d  f       GN= f! , (       d  f       GNT= f! [&        R6                   a  n[        5       UeSnAff = f)a  Triage the input image file.

If the input file is an image, check its resolution and convert it to PDF.

Args:
    input_file: The path to the input file.
    output_file: The path to the output file.
    options: An object containing the options passed to the OCRmyPDF command.

Raises:
    UnsupportedImageFormatError: If the input file is not a supported image format.
    DpiError: If the input image has no resolution (DPI) in its metadata or if the
        resolution is not credible.
z6Input file is not a PDF, checking if it is an image...zInput file does not exist: %szInput file is a directory: %szInput file is a file: %sr   zInput file is empty: %sNzInput file is an imagedpi)`   r3   zImage size: (%d, %d)zImage resolution: (%d, %d)zInput file is an image, but the resolution (DPI) is not credible.  Estimate the resolution at which the image was scanned and specify it using --image-dpi.zInput file is an image, but has no resolution (DPI) in its metadata.  Estimate the resolution at which image was scanned and specify it using --image-dpi.)RGBALAzEThe input image has an alpha channel. Remove the alpha channel first.
iccprofileRGBz-Input image has no ICC profile, assuming sRGBCMYKz/Input CMYK image has no ICC profile, not usablez+Image seems valid. Try converting to PDF...wb)
layout_funoutputstreamz,Successfully converted to PDF, processing...)loginfor   openOSErrorerrorstrreplace
input_fileexistsis_diris_filestatst_sizer   	image_dpisizer   modeimg2pdfdefault_layout_funget_fixed_dpi_layout_funr   convertosfspathr   ImageOpenError)rC   output_fileoptionsimer:   outfs          r/   triage_image_filerX   =   s    HHEF3ZZ
# 
)*BGGwwu~)'2C2C/:"'':5GGJ 
 ""HH+6bgg6F  77n$-! 
 rww&ww%HIF"1E ; 
B3>?//
 997,,g.?.?@J +t$OO		*%%! !	 % 	?@y  3		#a&..Z#g6H6H2IJK  ""II5zBII5zBII0*=??$$)II/<)+23 
P %$ !! 3)+23sU   I EM <A6N 23M2%N MD
MM 
M/2
N<N N(N##N(c                    [        U S5       nUR                  U5      nSSS5        [        R                  " SW5      nU(       a   UR	                  S5      R                  S5      $ g! , (       d  f       NM= f)zTry to find version signature at start of file.

Not robust enough to deal with appended files.

Returns empty string if not found, indicating file is probably not PDF.
rbNs   %PDF-(\d\.\d)   ascii )r>   readresearchgroupdecode)rC   search_windowf	signaturems        r/   _pdf_guess_versionrg      s\     
j$	1FF=)	 
 
		#Y/Awwqz  )) 
 	s   A&&
A4c                ^    [        U5      (       aY  UR                  (       a  [        R                  S5         [        R
                  " U5       nUR                  U5        SSS5        U$  [        XU5        U$ ! , (       d  f       U$ = f! [        R                   a  n[        5       UeSnAf[        R                   a  n[        5       UeSnAff = f! [         aM  n[        R                  SU 35        [        U5      R                  [        U5      U 5      n[        U5      UeSnAff = f)z5Triage the input file. We can handle PDFs and images.zTArgument --image-dpi is being ignored because the input file is a PDF, not an image.NzTemporary file was at: )rg   rI   r<   warningpikepdfr>   savePdfErrorr   PasswordErrorr   r?   debugrA   rB   rX   )original_filenamerC   rS   rT   pdfrV   msgs          r/   triagerr      s   )j))  91\\*-HH[) .  *& jw7 .- 	 ## .$&A-(( 1')q01  )		+J<89!fnnS_.?@S!q()sk   6C B A:!B )C :
B	B C 	B C B++CCCC 
D,AD''D,FT)detailed_analysisprogbarmax_workersuse_threadscheck_pagesc          
          [        U UUUUUUS9$ ! [        R                   a  n[        5       UeSnAf[        R                   a  n[        5       UeSnAff = f)zGet the PDF info.)rs   rt   ru   rv   rw   executorN)r(   rj   rm   r   rl   r   )rC   ry   rs   rt   ru   rv   rw   rV   s           r/   get_pdfinforz      sf    &/###
 	
    )!q( &A%&s    A0AAAc                   U R                   nU R                  nUR                  (       a  [        S5      eUR                  (       a1  UR
                  (       a  [        R                  S5        O
[        5       eUR                  (       aW  UR                  (       a  [        S5      e[        R                  S5        UR                  (       d  [        R                  S5        UR                  (       aS  UR                  (       d"  UR                  (       d  UR                  (       a  [        R                  S5        O
[        5       eU R                   R"                  R%                  XS9  g)	zValidate the PDF info options.z~This PDF contains dynamic XFA forms created by Adobe LiveCycle Designer and can only be read by Adobe Acrobat or Adobe Reader.z*All digital signatures will be invalidatedzVThis PDF has a user fillable form. --redo-ocr is not currently possible on such files.z_This PDF has a fillable form. Chances are it is a pure digital document that does not need OCR.zUse the option --force-ocr to produce an image of the form and all filled form fields. The output PDF will be 'flattened' and will no longer be fillable.zThis PDF is marked as a Tagged PDF. This often indicates that the PDF was generated from an office document and does not need OCR. PDF pages processed by OCRmyPDF may not be tagged correctly.)pdfinforT   N)r|   rT   needs_renderingr   has_signatureinvalidate_digital_signaturesr<   ri   r   has_acroformredo_ocr	force_ocrr=   	is_tagged	skip_textr   plugin_managerhookvalidate)contextr|   rT   s      r/   validate_pdfinfo_optionsr      s   ooGooGN
 	
 00KKDE')) 4 
 KK3
 $$B
  1 1W5E5EKK$ !""(((Jr.   c                V    U R                   (       d  U R                  (       a  [        $ S$ )zBGet a DPI to use for vector pages, if the page has vector content.r   )
has_vectorhas_textVECTOR_PAGE_DPIpageinfos    r/   _vector_page_dpir     s    &11X5F5F?MAMr.   c           	        U R                   nU R                  nU(       d  UR                  nUR                  =(       d    SnUR                  =(       d    Sn[        UR                  5      =(       d    Sn[        [        XF-  =(       d    [        XV-  =(       d    [        [        U5      UR                  =(       d    S5      5      n[        Xw5      $ )ziGet the DPI when we require xres == yres, scaled to physical units.

Page DPI includes UserUnit scaling.
        g      ?)r   rT   r2   xyfloatuserunitmaxr   r   
oversampler   )page_contextrI   r   rT   xresyresr   unitss           r/   get_page_square_dpir   	  s     $$H""GLL	;;#D;;#DX&&'.3H_0_0X&%#		
E e##r.   c           	     0   U R                   nU R                  nU(       d  UR                  n[        [	        UR
                  =(       d    [        UR                  =(       d    [        [        U5      UR                  =(       d    S5      5      n[        XD5      $ )zGet the DPI when we require xres == yres, in Postscript units.

Canvas DPI is independent of PDF UserUnit scaling, which is
used to describe situations where the PDF user space is not 1:1 with
the physical units of the page.
r   )r   rT   r2   r   r   r   r   r   r   r   r   )r   rI   r   rT   r   s        r/   get_canvas_square_dpir   "  ss     $$H""GLL	KK*?KK*?X&%#		
E e##r.   c                8   U R                   nU R                  nSnUR                  (       aM  UR                  UR                  ;  a3  [        R                  SUR                   SUR                   35        SnGOUR                  (       a  UR                  (       d-  UR                  (       d  UR                  (       d  [        S5      eUR                  (       a  [        R                  S5        SnGO'UR                  (       a?  UR                  (       a  [        R                  S5        O[        R                  S5        SnOUR                  (       a  [        R                  S	5        SnOUR                  (       d  UR                  (       d  UR                  (       a5  UR                   (       a$  [        R                  S
UR                    S35        OFUR                  (       a  [        R                  S["         S35        O[        R                  S5        SnU(       az  UR$                  (       ai  UR                  (       aX  UR&                  UR(                  -  nXBR$                  S-  :  a-  Sn[        R                  SUS-  S SUR$                  S S35        U$ )z$Check if the page needs to be OCR'd.Tzskipped z as requested by --pages Fz|page already has text! - aborting (use --force-ocr to force OCR;  see also help for the arguments --skip-text and --redo-ocrz@page already has text! - rasterizing text and running OCR anywayzYsome text on this page cannot be mapped to characters: consider using --force-ocr insteadzredoing OCRz$skipping all processing on this pagez$page has no images - rasterizing at z3 DPI because --force-ocr --oversample was specifiedz>page has no images - all vector content will be rasterized at za DPI, losing some resolution and likely increasing file size. Use --oversample to adjust the DPI.zpage has no images - skipping all processing on this page to avoid losing detail. Use --force-ocr if you wish to perform OCR on pages that have vector content.@B zpage too big, skipping OCR (z.1fz MPixels > z MPixels --skip-big))r   rT   pagespagenor<   rn   r   r   r   r   r   r=   has_corrupt_textri   imageslossless_reconstructionr   r   skip_bigwidth_pixelsheight_pixels)r   r   rT   ocr_requiredpixel_counts        r/   is_ocr_requiredr   :  s	   $$H""GL}}=		HX__--Fw}}oVW			  '*;*;w?O?O$N  HHWXL((9
 'LHH;< L__W%D%D !3!3HH"")"4"4!5 699
 KK!!0 1 2 HH' !L((X__++h.D.DD**Y67 LKK 9,c2+##C((<>
 r.   c                r   UR                  S5      n[        SS5      R                  [        U5      /5      n[        SS5      R                  [	        U5      /5      nUR
                  R                  R                  U USUUR                  R                  S-   USSUR                  R                  (       + S9	  U$ )z'Generate a lower quality preview image.zrasterize_preview.jpgg     r@jpeggrayr[   r   F)	rC   rS   raster_device
raster_dpir   page_dpirotationfilter_vectorstop_on_soft_error)get_pathr   take_minr   r   r   r   rasterize_pdf_pager   r   rT   continue_on_soft_render_error)rC   r   rS   
canvas_dpir   s        r/   rasterize_previewr     s    ''(?@KE5)22	|	,-J %'002El2S1TUH$$77 $$++a/+33QQQ 8 
 r.   c                x   SSSSS.nSSSS	S.nU R                   R                  nS
nUR                  U R                  R                  :  a  US:w  a  SXB   -   nOSnOUS:w  a  SnOSnS
nUS:w  a  SUR                  US5       S3nUSUR                  UR                  S5       3-  nU SUR                  S SU 3$ )zDDescribe the page rotation we are going to perform (or not perform).u   ⇧u   ⇨u   ⇩u   ⇦)r   Z      i   u   ⬏u   ↻u   ⬑r]   r   zwill rotate zrotation appears correctzconfidence too low to rotatez	no changezwith existing rotation ?z, zpage is facing z, confidence .2fz - )r   r   
confidencerT   rotate_pages_thresholdgetangle)r   orient_conf
correction	directionturnsexisting_rotationactionfacings           r/   describe_rotationr     s     u5u=IU7E$--66F!5!5!L!LL?#e&77F/F?3F FFA*9==9JC+P*QQST
	k.?.? EFGGFX];#9#9#">c&JJr.   c                2   UR                   R                  R                  5       R                  XR                  5      nUR
                  S-  n[        R                  [        XU5      5        UR                  UR                  R                  :  a  US:w  a  U$ g)a  Work out orientation correction for each page.

We ask Ghostscript to draw a preview page, which will rasterize with the
current /Rotate applied, and then ask OCR which way the page is
oriented. If the value of /Rotate is correct (e.g., a user already
manually fixed rotation), then OCR will say the page is pointing
up and the correction is zero. Otherwise, the orientation found by
OCR represents the clockwise rotation, or the counterclockwise
correction to rotation.

When we draw the real page for OCR, we rotate it by the CCW correction,
which points it (hopefully) upright. _graft.py takes care of the orienting
the image and text layers.
h  r   )r   r   get_ocr_engineget_orientationrT   r   r<   r=   r   r   r   )previewr   r   r   s       r/   get_orientation_correctionr     s     --22AACSS%%K ""S(JHH|*EF,"6"6"M"MM!Or.   c                    U R                   nUR                  5       nU(       a2  UR                  S:  a"  [        UR                  UR                  5      nU$ UR
                  nU$ )z%Calculate the DPI for the page image.皙?)r   page_dpi_profileaverage_to_max_dpi_ratior   weighted_dpir2   )r   r   dpi_profilerI   s       r/   calculate_image_dpir     s\    $$H++-K{;;cA{779Q9QR	  LL	r.   c                    [        U 5      nU R                  R                  5       n[        X5      n[	        X5      nU(       aJ  UR
                  S:  a:  [        R                  SUR                  UR                  UR                  5       5        X44$ )z$Calculate the DPI for rasterization.r   zWeighted average image DPI is %0.1f, max DPI is %0.1f. The discrepancy may indicate a high detail region on this page, but could also indicate a problem with the input PDF file. Page image will be rendered at %0.1f DPI.)r   r   r   r   r   r   r<   ri   r   max_dpi	to_scalar)r   rI   r   r   r   s        r/   calculate_raster_dpir     s     $L1I''88:K&|?J"<;H{;;cA8 $$  "	
 r.   c                  ^^ / SQmSmUc  UR                   R                  nUR                  SU S35      nUR                  nUU4S jnUR                   H  nUR
                  S:w  a  M  UR                  S:  d  M'  UR                  [        R                  :X  a
  U" S5      mMO  UR                  [        R                  :X  a
  U" S	5      mMw  U" S
5      mM     UR                  (       a  [        R                  S5        U" S
5      mTT   n	[        R                  SU	 SU 35        [        U5      u  pUR                  R                   R#                  U UU	U
UUR$                  S-   UUUR                   R&                  (       + S9	  U$ )a  Rasterize a PDF page to a PNG image.

Args:
    input_file: The input PDF file path.
    page_context: The page context object.
    correction: The orientation correction angle. Defaults to 0.
    output_tag: The output tag. Defaults to ''.
    remove_vectors: Whether to remove vectors. Defaults to None, which means
        the value from the page context options will be used. If the value
        is True or False, it will override the page context options.

Returns:
    Path: The output PNG file path.
)pngmonopnggraypng256png16mr   	rasterizez.pngc                :   > [        TTR                  U 5      5      $ r,   )r   index)
colorspacecolorspaces
device_idxs    r/   at_leastrasterize.<locals>.at_least  s    :{00<==r.   imager[   r   r   r   z%Page has vector content, using png16mzRasterize with z, rotation )	rC   rS   r   r   r   r   r   r   r   )rT   remove_vectorsr   r   r   type_bpccolorr%   r   grayr   r<   rn   r   r   r   r   r   r   )rC   r   r   
output_tagr   rS   r   r   r   devicer   r   r   r   s               @@r/   r   r     se   * =KJ%--<<'')J<t(DEK$$H> ;;'!99q={{j...%h/

/%i0
%h/
 ! 		9:h'
$FIIx{:,?@/=J$$77"$+33QQQ 8 
 r.   c                    [        S UR                  R                   5       5      (       a  [        S5      e[        R                  S5        U $ )zBRemove the background from the input image (temporarily disabled).c              3  >   #    U  H  oR                   S :  v   M     g7f)r[   N)r   ).0r   s     r/   	<genexpr>/preprocess_remove_background.<locals>.<genexpr><  s     
C&BU99q=&B   z2--remove-background is temporarily not implementedz'background removal skipped on mono page)anyr   r   NotImplementedErrorr<   r=   )rC   r   s     r/   preprocess_remove_backgroundr   :  s@    

Cl&;&;&B&B
CCC!"VWW HH67r.   c           
        UR                  S5      n[        U[        U5      5      nUR                  R                  R                  5       nUR                  XR                  5      n[        R                  " U 5       nUR                  U[        R                  R                  [        R                  " SUR                  S9S9nUR!                  X#S9  SSS5        U$ ! , (       d  f       U$ = f)zDeskews the input image using the OCR engine and saves the output to a file.

Args:
    input_file: The input image file to deskew.
    page_context: The context of the page being processed.

Returns:
    Path: The path to the deskewed image file.
zpp_deskew.pngwhite)rK   )resample	fillcolorr2   N)r   r   r   r   r   r   
get_deskewrT   r   r>   rotate
ResamplingBICUBICr   getcolorrK   rk   )rC   r   rS   r2   
ocr_enginedeskew_angle_degreesrU   deskeweds           r/   preprocess_deskewr	  E  s     ''8K
l,?,M
NC,,11@@BJ%00=Q=QR	J	2 99 %%-- ))'@  

 	k+ 
   
 	 s   <AC
C+c                    UR                  S5      n[        U[        U5      5      n[        R                  " U UUR                  5       UR                  R                  S9$ )z$Clean the input image using unpaper.zpp_clean.png)r2   unpaper_args)r   r   r   r   cleanr   rT   r  )rC   r   rS   r2   s       r/   preprocess_cleanr  b  sS    ''7K
l,?,M
NC==MMO!))66	 r.   c           	        UR                  S5      nUR                  n[        R                  " U 5       n[        R                  SUR                  S   5        UR                  (       d  SnUR                  (       a  Sn[        R                  " U5      nUR                  R                  USS9 H  nU Vs/ s H  n[        U5      PM     n	n[        S UR                  S    5       5      n
U	S   U
S   -  UR                  U	S	   U
S
   -  -
  U	S   U
S   -  UR                  U	S
   U
S
   -  -
  4n[        R                  SU5        UR                  USS9  M     UR                   R"                  R%                  XS9nUb  Un[        S UR                  S    5       5      nUR'                  X-S9  SSS5        U$ s  snf ! , (       d  f       U$ = f)zCreate the image we send for OCR.

Might not be the same as the display image depending on preprocessing.
This image will never be shown to the user.
zocr.pngzresolution %rr2   NT)visiblecorruptc              3  >   #    U  H  n[        U5      S -  v   M     g7f)      R@N)r   r   coords     r/   r   #create_ocr_image.<locals>.<genexpr>  s     Pet 3r   r      r[      zblanking %rr   )fill)pager   c              3  8   #    U  H  n[        U5      v   M     g 7fr,   )roundr  s     r/   r   r    s     =nUE%LLns   r   )r   rT   r   r>   r<   rn   r=   r   r   r   r   get_textareasr   tupleheight	rectangler   r   filter_ocr_imagerk   )r   r   rS   rT   rU   maskdrawtextareavbboxxyscale	pixcoords	filter_imr2   s                 r/   create_ocr_imager)  n  s    ''	2K""G	E	b		/2775>2   D&&r*D(11??d @  +33(Qa(3PPPGgaj(IIQ'!* 44Ggaj(IIQ'!* 44		 		-3yw7$ !//44EE F 
	  B =bggen==
%K 
L + 4# 
	L s   BF>5F9
C%F>9F>>
Gc                    UR                  S5      nUR                  S5      nUR                  nUR                  R                  R	                  5       nUR                  U UUUS9  X#4$ )z,Run the OCR engine and generate hOCR output.zocr_hocr.hocrzocr_hocr.txt)rC   output_hocroutput_textrT   )r   rT   r   r   r   generate_hocr)rC   r   hocr_outhocr_text_outrT   r  s         r/   ocr_engine_hocrr0    sq    $$_5H )).9M""G,,11@@BJ!	   ""r.   c                r    [        U R                  5      =(       a    [        S U R                   5       5      $ )a1  Determines whether the visible page image should be saved as a JPEG.

If all images were JPEGs originally, permit a JPEG as output.

Args:
    pageinfo: The PageInfo object containing information about the page.

Returns:
    A boolean indicating whether the visible page image should be saved as a JPEG.
c              3  Z   #    U  H!  oR                   [        R                  :H  v   M#     g 7fr,   )encr&   jpeg)r   rU   s     r/   r   4should_visible_page_image_use_jpg.<locals>.<genexpr>  s      )*9B(--/s   )+)boolr   allr   s    r/   !should_visible_page_image_use_jpgr8    s2       S )*2//) & r.   c                >   UR                  S5      n[        R                  " U 5       nSUR                  ;   a  [	        UR                  S   6 nO[        U[        U5      5      nUR                  USUR                  5       S9  SSS5        U$ ! , (       d  f       U$ = f)z|Create a visible page image in JPEG format.

This is intended to be used when all images on the page were originally JPEGs.
zvisible.jpgr2   JPEG)formatr2   N)	r   r   r>   r=   r   r   r   rk   to_int)r   r   rS   rU   r2   s        r/   create_visible_page_jpgr=    s    
 ''6K	E	b BGGbggen-C &l4G4UVC 	F

= 
  
	 s   AB
Bc           	        UR                  S5      nUR                  nS[        UR                  5      -  S[        UR                  5      -  4nUR
                  U-
  S-  nUS-  S:H  nU(       a
  US   US   4n[        5       n[        U S5       n	[        R                  S	5        [        R                  " U5      n
[        R                  " U	U
U[        R                  R                  [        R                  R                   S
9  [        R                  S5        SSS5        UR#                  S5        [%        XXS9  UR&                  R(                  R+                  XUS9nU$ ! , (       d  f       NO= f)z$Create a PDF page from a page image.zvisible.pdfr  r   r   r   r[   r   rZ   rO   )r:   r;   enginer   zconvert doneN)	swap_axis)r  image_filename
output_pdf)r   r   r   width_inchesheight_inchesr   r   r>   r<   rn   rL   get_layout_funrO   Enginerj   Rotationifvalidseekfix_pagepdf_boxesr   r   filter_pdf_page)r   r   orientation_correctionrS   r   pagesizeeffective_rotationr@  bioimfiler:   s              r/   create_pdf_page_from_imagerQ    sH    ''6K$$HeH1122D5AWAW;X4XXH"++.DDK"S(B.IA;+ )C	eT	f		)++H5
!>>))%%--	
 			.! 
 HHQKcJ--22BBK C K ) 
	s   B	E!!
E/c                   UR                   nUR                  S5      nU R                  5       R                  S:X  a  UR	                  5         U$ [        U[        U5      5      n0 nUR                  S:X  a  [        [        SSSSSSS9[        5       S9n[        SU UR                  5       S.UD6R                  US	U(       d  SOSS
9  U$ )zRender the hOCR page to a PDF.zocr_hocr.pdfr   	hocrdebugTF)render_baselinerender_trianglerender_line_bboxrender_word_bboxrender_paragraph_bboxrender_space_bbox)debug_render_optionsfont)hocr_filenamer2   N)out_filenamerA  invisible_textr-   )rT   r   rG   rH   touchr   r   pdf_rendererdictr!   r#   r"   r   to_pdf)hocrr   rT   rS   r2   debug_kwargss         r/   render_hocr_pagere    s    ""G''7Kyy{a
l,?,M
NCL{*!3 $ $!&!%&+"'" 

  MMO  f #/tU  
 r.   c                    UR                  S5      nUR                  S5      nUR                  nUR                  R                  R	                  5       nUR                  U UUUS9  X#4$ )zBRun the OCR engine and generate a text-only PDF (will look blank).zocr_tess.pdfzocr_tess.txt)rC   rB  r,  rT   )r   rT   r   r   r   generate_pdf)input_imager   rB  r,  rT   r  s         r/   ocr_engine_textonly_pdfri  %  ss     &&~6J''7K""G,,11@@BJ	   ""r.   c                V    U S   US   -   U S   US   -   U S   US   -   U S   US   -   4$ )z%Offset a rectangle by a given amount.r   r[   r  r  r-   )rectoffsets     r/   _offset_rectrm  7  sN     	Q&)Q&)Q&)Q&)	 r.   c                   [         R                  " U 5       nUR                   H  nUR                  R                  nUS   US   4n[        UR                  R                  U5      n[        UR                  R                  U5      n	U(       a$  US   US   US   US   4nU	S   U	S   U	S   U	S   4n	Xl        Xl	        M     UR                  U5        SSS5        U$ ! , (       d  f       U$ = f)ay  Fix the bounding boxes in a single page PDF.

The single page PDF is created with a normal MediaBox with its lower left corner
at (0, 0). infile is the single page PDF. page_context.mediabox has the original
file's mediabox, which may have a different origin. We needto adjust the other
boxes in the single page PDF to match the effect they had on the original page.

When correcting page rotation, we create a single page PDF that is correctly
rotated instead of an incorrectly rotated and then setting page.Rotate on it.
If rotation is either 90 or 270 degrees, then this function can be called
with swap_axis to swap the X and Y coordinates of all the boxes.

We are not concerned with solving degenerate cases where the boxes overlap or
or express invalid rectangles. We merely pass the boxes, producing a
transformation equivalent to the change made by constructing a new page image.
r   r[   r  r  N)rj   r>   r   r   mediaboxrm  cropboxtrimboxCropBoxTrimBoxrk   )
infileout_filer   r@  rp   r  ro  rl  rp  rq  s
             r/   rJ  rJ  A  s    , 
f	IID $,,55Ha[(1+-F"<#8#8#@#@&IG"<#8#8#@#@&IG!!*gaj'!*gajH!!*gaj'!*gajH"L"L  	 
 O 
	 Os   B<C
C,c                >    U R                  S5      n[        U5        U$ )zGenerates a PostScript file stub for the given PDF context.

Args:
    context: The PDF context to generate the PostScript file stub for.

Returns:
    Path: The path to the generated PostScript file stub.
zpdfa.ps)r   r$   )r   rS   s     r/   generate_postscript_stubrw  i  s"     ""9-K[!r.   c                @   UR                   nUR                  nUR                  S5      nUR                  S5      n[        R                  " U 5       n[        U5      (       a  UR                  U5        O[        X5        SSS5        UR                  R                  R                  UR                  U/UUUUR                  S   UR                  (       a$  UR                  R                  R                  5       OSUR                  (       + S9  U$ ! , (       d  f       N= f)zConverts the given PDF to PDF/A.

Args:
    input_pdf: The input PDF file path (presumably not PDF/A).
    input_ps_stub: The input PostScript file path, containing instructions
        for the PDF/A generator to use.
    context: The PDF context.
zfix_docinfo.pdfzpdfa.pdfN)pdf_version	pdf_pagespdfmarkrS   r   	pdfa_partprogressbar_classr   )rT   r|   r   rj   r>   r   rk   r    r   r   generate_pdfamin_versionoutput_typeprogress_barget_progressbar_classr   )	input_pdfinput_ps_stubr   rT   input_pdfinfofix_docinfo_filerS   pdf_files           r/   convert_to_pdfar  w  s     ooGOOM''(9:"":.K 
i	 Hx((MM*+5	 
! --!--#$%%b) ## ""''==?&DDD .  + 
!	 s   .D
Dc                ~    [         R                  " U 5      R                  nX!R                  R                  S-  :  a  gg)zkDetermine whether the PDF should be linearized.

For smaller files, linearization is not worth the effort.
r   TF)rP   rG   rH   rT   fast_web_view)working_filer   filesizes      r/   should_linearizer    s3    
 ww|$,,H??009<=r.   c                    U S:X  a<  [        SS[        R                  R                  [        R                  R
                  S9$ [        SS[        R                  R                  S9$ )zGet pikepdf.Pdf.save settings for the given output type.

Essentially, don't use features that are incompatible with a given
PDF/A specification.
zpdfa-1T)preserve_pdfacompress_streamsstream_decode_levelobject_stream_mode)r  r  r  )ra  rj   StreamDecodeLevelgeneralizedObjectStreamModedisablegenerate)r  s    r/   get_pdf_save_settingsr    sc     h ! ' 9 9 E E&77??	
 	
 ! ' 8 8 A A
 	
r.   c                    U R                  5       R                  nUR                  5       R                  nUS:X  a  gX#-  nSX2-  -
  nXE4$ )ar  Calculate ratio of input to output file sizes and percentage savings.

Args:
    input_file (Path): The path to the input file.
    output_file (Path): The path to the output file.

Returns:
    tuple[float | None, float | None]: A tuple containing the file size
    ratio and the percentage savings achieved by the output file size
    compared to the input file size.
r   NNr[   )rG   rH   )rC   rS   
input_sizeoutput_sizeratiosavingss         r/   _file_size_ratior    sR     "**J""$,,Ka$E+**G>r.   c           
     h   UR                  S5      nUR                  R                  R                  U UUU[	        X5      S9u  pE[        X5      u  pgU(       a  [        R                  SUS SUS 35        [        UR                  U5      u  pgU(       a  [        R                  SUS SUS 35        XE4$ )zOptimize the given PDF file.zoptimize.pdf)r  rB  r   ry   	linearizezImage optimization ratio: r   z
 savings: z.1%zTotal file size ratio: )	r   r   r   optimize_pdfr  r  r<   r=   origin)rC   r   ry   rS   rB  messagesr  r  s           r/   r  r    s     "">2K"1166CC":7 D J &j>NE-eC[
GS/RS%gnnkBNE*5+Z#OPr.   c              #     #    Su  p[        U 5       H0  u  p#US-  nU(       a  Ub  XS-
  4S4v   SnX"4U4v   M)  Ub  M.  UnM2     Ub  X4S4v   gg7f)aV  Enumerate the ranges of non-empty elements in an iterable.

Compresses consecutive ranges of length 1 into single elements.

Args:
    iterable: An iterable of elements to enumerate.

Yields:
    A tuple containing a range of indices and the corresponding element.
    If the element is None, the range represents a skipped range of indices.
r  r[   N)	enumerate)iterableskipped_fromr   txt_files       r/   enumerate_compress_rangesr    s|      %L$X.
'#QY/55#.(**#$ / #T))  s   :A Ac                   UR                  S5      n[        USSS9 n[        U 5       H|  u  u  pEnUS:w  a  UR                  S5        U(       a1  UR	                  SS9nUR                  UR                  S5      5        MW  XE:w  a  U SU 3nOU nUR                  SU S	35        M~     S
S
S
5        U$ ! , (       d  f       U$ = f)zMerge the page sidecar files into a single file.

Sidecar files are created by the OCR engine and contain the text for each
page in the PDF. This function merges the sidecar files into a single file
and returns the path to the merged file.
zsidecar.txtwzutf-8)encodingr[   -z[OCR skipped on page(s) ]N)r   r>   r  write	read_textremovesuffix)		txt_filesr   rS   streamfrom_to_r  txtr   s	            r/   merge_sidecarsr    s     ""=1K	k3	1V&?	&J"LU(zT"(('(: S--d34<$gQse,E$gE7wa@A 'K 
2  
2	1 s   BB33
Cc                F   [         R                  SX5        U R                  S5       nUS:X  aC  [        U[        R
                  R                  5        [        R
                  R                  5         Ou[        US5      (       aD  [        [        U5      n[        X45        [        [        5         UR                  5         SSS5        O [        US5       n[        X45        SSS5        SSS5        g! , (       d  f       N= f! , (       d  f       N(= f! , (       d  f       g= f)a  Copy the final temporary file to the output destination.

Args:
    input_file (Path): The intermediate input file to copy.
    output_file (str | Path | BinaryIO): The output file to copy to.
    original_file: The original file to copy attributes from.

Returns:
    None
z%s -> %srZ   r  writableNzw+b)r<   rn   r>   r
   sysstdoutbufferflushhasattrr   r   r   AttributeError)rC   rS   original_fileinput_streamoutput_streams        r/   
copy_finalr  (  s     IIj*2		,#cjj&7&78JJ[*-- ;7M4.)##% *) k5)]L8 * 
	 *) *) 
	s<   BD-C0>DDD0
C>	:D
D	D
D )rC   r	   rS   r	   returnNone)i   )rC   r	   r  rA   )ro   rA   rC   r	   rS   r	   r  r	   )ry   r   rs   r6  rt   r6  ru   z
int | Nonerv   r6  r  r(   )r   r   r  r  )r   r'   r  intr,   )r   r   rI   zResolution | Noner  r   )r   r   r  r6  )rC   r	   r   r   r  r	   )r   r   r   r)   r   r  r  rA   )r   r	   r   r   r  r  )r   r   r  r   )r   r   )r   r]   N)rC   r	   r   r   r   r  r   rA   r   zbool | Noner  r	   )r   r	   r   r   r  r	   )rC   r	   r   r   r  tuple[Path, Path])r   r'   r  r6  )r   r	   r   r   rL  r  r  r	   )rc  r	   r   r   r  r	   )rh  r	   r   r   r  r  )rk  z!tuple[float, float, float, float]rl  ztuple[float, float])F)
rt  zPath | BinaryIOru  r	   r   r   r@  r6  r  r	   )r   r   r  r	   )r  r	   r  r	   r   r   r  r	   )r  r	   r   r   r  r6  )r  rA   r  zdict[str, Any])rC   r	   rS   r	   r  z!tuple[float | None, float | None])rC   r	   r   r   ry   r   r  ztuple[Path, Sequence[str]])r  zIterable[T]r  z*Iterator[tuple[tuple[int, int], T | None]])r  zIterable[Path | None]r   r   r  r	   )rC   r	   rS   zstr | Path | BinaryIOr  zPath | Noner  r  )m__doc__
__future__r   loggingrP   r_   r  collections.abcr   r   r   
contextlibr   ior   pathlibr	   shutilr
   typingr   r   r   r   rL   rj   PILr   r   r   ocrmypdf._concurrentr   ocrmypdf._execr   ocrmypdf._jobcontextr   r   ocrmypdf._metadatar   ocrmypdf.exceptionsr   r   r   r   r   r   r   ocrmypdf.helpersr   r   r    ocrmypdf.hocrtransformr!   r"   ocrmypdf.hocrtransform._fontr#   ocrmypdf.pdfar$   ocrmypdf.pdfinfor%   r&   r'   r(   ocrmypdf.pluginspecr)   pi_heifr*   ImportErrorr0   	getLogger__name__r<   r   rX   rg   rr   rz   r   r   r   r   r   r   r   r   r   r   r   r   r	  r  r)  r0  r8  r=  rQ  re  ri  rm  rJ  rw  r  r  r  r  r  r  r  r  r-   r.   r/   <module>r     sg  
 3 "  	 	 
 8 8     / /   , , ) " 8 2   F E D 0 * D D 5, CL!  P3f(,;?	@ $"& & 	&
 & & & &6+K\N ?C$$*;$$4 ?C$$*;$$0IX*KK,AKORKK:< 0 "&AAA A 	A
  A 
AH:	.b#  .))*)DG)	)X F##%0##$ 	%%% % 	%
 
%P(V
.#'&.  ) 5=  ,**/*:499#89IT9	9u  s   6H 	HH