
    TAix                       % S r SSKJr  SSKrSSKrSSKrSSKrSSKJr  SSK	J
r
JrJrJrJrJr  SSKJrJr  SSKJr  SSKJrJr  SS	KJr  SS
KJrJrJr  SSKJr  SSK J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'J(r(  SSK)J*r*J+r+J,r,J-r-J.r.J/r/J0r0J1r1J2r2J3r3J4r4  SSK5J6r6J7r7  SSK8J9r9  SSK:J;r;J<r<  SSK=J>r>J?r?J@r@  SSKAJBrBJCrCJDrDJErE  \R                  " 5       rG " S S\5      rH " S S\5      rI\J\K\K\K\K4   rL\HR                  \HR                  \HR                  \HR                  \HR                  \HR                  \HR                  \HR                  \HR                  \HR                  \HR                  \HR                  \HR                  \HR                  \HR                  S.rVS\WS'   \IR                  \IR                  \IR                  \IR                  \IR                  \IR                  \IR                  \IR                  \IR                  \IR                  \IR                  S.raS\WS'   \HR                  S \HR                  S!\HR                  S"\HR                  S!\HR                  S 0rbS#\WS$'   S%rcS& rd " S' S(\#5      re " S) S*\#5      rf " S+ S,\#5      rg " S- S.\#5      rh " S/ S05      ri " S1 S25      rjS3 rk\c4SLS4 jjrlSMS5 jrm " S6 S75      rnSNS8 jroSOS9 jrp      SPS: jrqSQS; jrrSS<.     SRS= jjrsSSS> jrt      STS? jruSqvSUS@ jrw\SVSA j5       rx              SWSC jry  SX           SYSD jjrz " SE SF\#5      r{ " SG SB5      r|\7" 5       r} " SH SI5      r~SJ r\SK:X  a  \" 5         gg)Zz/Extract information about the content of a PDF.    )annotationsN)defaultdict)Callable	ContainerIterableIteratorMappingSequence)contextmanagernullcontext)Decimal)Enumauto)partial)hypotinfisclose)PathLike)Path)
NamedTuple)warn)LTPage	LTTextBox)
DictionaryMatrixNameObjectPagePdfPdfImagePdfInlineImageStreamUnsupportedImageTypeErrorparse_content_stream)ExecutorSerialExecutor)ProgressBar)EncryptedPdfErrorInputFileError)
Resolutionavailable_cpu_countpikepdf_enable_mmap)LTStateAwareCharPdfMinerStateget_page_analysisget_text_boxesc                      \ rS rSrSr\" 5       r\" 5       r\" 5       r\" 5       r	\" 5       r
\" 5       r\" 5       r\" 5       r\" 5       r\" 5       rSrg)
Colorspace6   z1Description of common image colorspaces in a PDF. N)__name__
__module____qualname____firstlineno____doc__r   grayrgbcmyklabiccindexsepdevnpatternjpeg2000__static_attributes__r4       Q/var/www/html/land-ocr/venv/lib/python3.13/site-packages/ocrmypdf/pdfinfo/info.pyr2   r2   6   sM    ; 6D
&C6D
&C
&CFE
&C6DfGvHrE   r2   c                      \ rS rSrSr\" 5       r\" 5       r\" 5       r\" 5       r	\" 5       r
\" 5       r\" 5       r\" 5       r\" 5       rSrg)EncodingF   z/Description of common image encodings in a PDF.r4   N)r5   r6   r7   r8   r9   r   ccittjpegrC   jbig2asciihexascii85lzwflate	runlengthrD   r4   rE   rF   rH   rH   F   sF    9 FE6DvHFEvHfG
&CFEIrE   rH   )z/DeviceGrayz/CalGrayz
/DeviceRGBz/CalRGBz/DeviceCMYKz/Labz	/ICCBasedz/Indexedz/Separationz/DeviceNz/Patternz/Gz/RGBz/CMYKz/Izdict[str, Colorspace]FRIENDLY_COLORSPACE)z/CCITTFaxDecodez
/DCTDecodez
/JPXDecodez/JBIG2Decodez/CCFz/DCTz/AHxz/A85z/LZWz/Flz/RLzdict[str, Encoding]FRIENDLY_ENCODING         zdict[Colorspace, int]FRIENDLY_COMP)      ?        rY   rX   rY   rY   c                f    [        [        U 5      n[        U[        5      n[	        S U 5       5      $ )Nc              3  :   #    U  H  u  p[        XS S9v   M     g7f)gMbP?)rel_tolN)r   ).0abs      rF   	<genexpr>"_is_unit_square.<locals>.<genexpr>   s     @xtqwqT*xs   )mapfloatzipUNIT_SQUAREall)	shorthandvaluespairwises      rF   _is_unit_squarerj      s+    	"F6;'H@x@@@rE   c                  8    \ rS rSr% SrS\S'   S\S'   S\S'   S	rg
)XobjectSettings   z%Info about an XObject found in a PDF.strname/tuple[float, float, float, float, float, float]rg   intstack_depthr4   Nr5   r6   r7   r8   r9   __annotations__rD   r4   rE   rF   rl   rl      s    /
I>>rE   rl   c                  8    \ rS rSr% SrS\S'   S\S'   S\S'   S	rg
)InlineSettings   z*Info about an inline image found in a PDF.r!   iimagerp   rg   rq   rr   r4   Nrs   r4   rE   rF   rv   rv      s    4>>rE   rv   c                  L    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S
\S'   Srg)ContentsInfo   z*Info about various objects found in a PDF.zlist[XobjectSettings]xobject_settingszlist[InlineSettings]inline_imagesboolfound_vector
found_textz#Mapping[str, list[XobjectSettings]]
name_indexr4   Nrs   r4   rE   rF   rz   rz      s#    4++''33rE   rz   c                  8    \ rS rSr% SrS\S'   S\S'   S\S'   Srg	)
TextboxInfo   z%Info about a text box found in a PDF.z!tuple[float, float, float, float]bboxr~   
is_visible
is_corruptr4   Nrs   r4   rE   rF   r   r      s    /
++rE   r   c                      \ rS rSrSrSrg)VectorMarker   zCSentinel indicating vector drawing operations were found on a page.r4   Nr5   r6   r7   r8   r9   rD   r4   rE   rF   r   r      s    MrE   r   c                      \ rS rSrSrSrg)
TextMarker   zASentinel indicating text drawing operations were found on a page.r4   Nr   r4   rE   rF   r   r      s    KrE   r   c              #     #    U  HB  u  p[        U5      n[        R                  " SU5      (       a  U H	  n/ U4v   M     M=  X4v   MD     g7f)z8Convert runs of qQ's in the stack into single graphobjs.zQ*q+$N)rn   rematch)	graphobjsoperandsoperatorchars       rF   _normalize_stackr      sK     'x=88Hh'' 4j  ! && (s   A
Ac                B   / n[        U5      n/ n/ n[        S 5      nSnSn[        SR                  5       5      n	[        SR                  5       5      n
[        SR                  5       5      nSR	                  X-  U-  5      n[        [        [        X5      5      5       GH9  u  pUu  nnUS:X  aL  UR                  U5        [        U5      S:  a*  [        U5      S	:  a  [        S
U 35      e[        S5        M[  M]  US:X  a   UR                  5       nMv  US:X  a   [        U5      U-  nM  US:X  aS  US   n[!        UUR"                  [        U5      S9nUR                  U5        U[%        U5         R                  U5        M  US:X  a7  US   n['        UUR"                  [        U5      S9nUR                  U5        GM#  UU	;   a  SnGM.  UU
;   d  GM7  SnGM<     [)        UUUUUS9$ ! [         a    [        S5         GMd  f = f! [         a    [        S5      ef = f)a  Interpret the PDF content stream.

The stack represents the state of the PDF graphics stack.  We are only
interested in the current transformation matrix (CTM) so we only track
this object; a full implementation would need to track many other items.

The CTM is initialized to the mapping from user space to device space.
PDF units are 1/72".  In a PDF viewer or printer this matrix is initialized
to the transformation to device space.  For example if set to
(1/72, 0, 0, 1/72, 0, 0) then all units would be calculated in inches.

Images are always considered to be (0, 0) -> (1, 1).  Before drawing an
image there should be a 'cm' that sets up an image coordinate system
where drawing from (0, 0) -> (1, 1) will draw on the desired area of the
page.

PDF units suit our needs so we initialize ctm to the identity matrix.

According to the PDF specification, the maximum stack depth is 32. Other
viewers tolerate some amount beyond this.  We issue a warning if the
stack depth exceeds the spec limit and set a hard limit beyond this to
bound our memory requirements.  If the stack underflows behavior is
undefined in the spec, but we just pretend nothing happened and leave the
CTM unchanged.
c                     / $ Nr4   r4   rE   rF   <lambda>%_interpret_contents.<locals>.<lambda>   s    RrE   FzS s f F f* B B* b b*z	TJ Tj " 'zBI ID EI q Q Do cm q       z5PDF graphics stack overflowed hard limit at operator z(PDF graphics stack overflowed spec limitQz5PDF graphics stack underflowed - PDF may be malformedcmzwPDF content stream is corrupt - this PDF is malformed. Use a PDF editor that is capable of visually inspecting the PDF.Dor   )ro   rg   rr   zINLINE IMAGE)rx   rg   rr   T)r|   r}   r   r   r   )r   r   setsplitjoin	enumerater   r$   appendlenRuntimeErrorr   pop
IndexError
ValueErrorr)   rl   rg   rn   rv   rz   )contentstreaminitial_shorthandstackctmr|   r}   r   r   r   
vector_opstext_showing_ops	image_opsoperator_whitelistngraphobjr   r   
image_namesettingsrx   inlines                        rF   _interpret_contentsr      sG   4 E
"
#C.0*,MZ(JLJ+1134J?0023(..01I*"?)"KL -mPQ &(s?LL5zBu:#&OPQsS  ?@  _Niik
 X&, !!J&3==c%jH ##H-s:'..x8'a[F#CJF   (#L))JWZ )#! =  N LMMN  $W s   G+H+HHHc                   ^^	^
 U u  p#pE  n[        X#5      [        XE5      4m
SS jm	U	U
U4S j[        S5       5       u  px[        Xx5      $ )a  Given the transformation matrix and image size, find the image DPI.

PDFs do not include image resolution information within image data.
Instead, the PDF page content stream describes the location where the
image will be rasterized, and the effective resolution is the ratio of the
pixel size to raster target size.

Normally a scanned PDF has the paper size set appropriately but this is
not guaranteed. The most common case is a cropped image will change the
page size (/CropBox) without altering the page content stream. That means
it is not sufficient to assume that the image fills the page, even though
that is the most common case.

A PDF image may be scaled (always), cropped, translated, rotated in place
to an arbitrary angle (rarely) and skewed. Only equal area mappings can
be expressed, that is, it is not necessary to consider distortions where
the effective DPI varies with position.

To determine the image scale, transform an offset axis vector v0 (0, 0),
width-axis vector v0 (1, 0), height-axis vector vh (0, 1) with the matrix,
which gives the dimensions of the image in PDF units. From there we can
compare to actual image dimensions. PDF uses
row vector * matrix_transposed unlike the traditional
matrix * column vector.

The offset, width and height vectors can be combined in a matrix and
multiplied by the transform matrix. Then we want to calculated
    magnitude(width_vector - offset_vector)
and
    magnitude(height_vector - offset_vector)

When the above is worked out algebraically, the effect of translation
cancels out, and the vector magnitudes become functions of the nonzero
transformation matrix indices. The results of the derivation are used
in this code.

pdfimages -list does calculate the DPI in some way that is not completely
naive, but it does not get the DPI of rotated images right, so cannot be
used anymore to validate this. Photoshop works, or using Acrobat to
rotate the image back to normal.

It does not matter if the image is partially cropped, or even out of the
/MediaBox.

c                .    U S:w  a  X-  O[         nX2-  nU$ )Nr   )r   )drawnpixelsinches_per_ptscaledpis        rF   calc_get_dpi.<locals>.calcM  s    "'1*##
rE   c              3  B   >#    U  H  nT" TU   TU   5      v   M     g 7fr   r4   )r]   r   r   image_drawn
image_sizes     rF   r`   _get_dpi.<locals>.<genexpr>S  s#     JADQA77s      )      R@)r   ranger*   )ctm_shorthandr   r^   r_   cd_dpi_wdpi_hr   r   s    `       @@rF   _get_dpir     sK    \ %A!1 +uQ{*K KqJLEe##rE   c                  0   \ rS rSr% Sr\" S5      rS\S'   S\S'   SS	S	S	S
.   SS jjrSS jr	\
S 5       r\
S 5       r\
SS j5       r\
SS j5       r\
S 5       r\
S 5       r\
S 5       r\
S 5       r\
SS j5       r\
SS j5       r\
SS j5       rS rSrg	) 	ImageInfoiW  zInformation about an image found in a PDF.

This gathers information from pikepdf and pdfminer.six, and is pickle-able
so that it can be passed to a worker process, unlike objects from those
libraries.
z1.000
int | None_comprn   _name N)ro   pdfimager   rg   c                  [        U5      U l        X@l        Ub
  SU l        UnO6Ub(  [	        U[
        5      (       a  SU l        [        U5      nO[        S5      eUR                  U l	        UR                  U l        UR                  R                  [        R                  S5      =nb  [	        U[
        [         -  5      (       ar  [#        UR                  [        R$                  S5      U R                  5      U l	        [#        UR                  [        R&                  S5      U R                  5      U l        UR                  R                  [        R(                  S5      =nb  [	        U[
        [         -  5      (       ar  [#        UR                  [        R$                  S5      U R                  5      U l	        [#        UR                  [        R&                  S5      U R                  5      U l        UR*                  (       a  SU l        OSU l        [/        UR0                  5      U l         [4        R                  UR6                  S   5      U l         [<        R                  UR>                  =(       d    S5      U l         U R8                  [D        RF                  :X  a  [H        RF                  U l         SU l%        U R@                  [H        RL                  :X  a,  [	        U[        5      (       a  U RO                  U5      U l%        g[	        U R@                  [H        5      (       a$  [P        R                  U R@                  5      U l%        U RJ                  cL  U R8                  [D        RR                  [D        RT                  4;   a  [P        [H        RV                     U l%        ggg! [:         a    SU l         GNcf = f! [B         a    SU l          GNMf = f)	zInitialize an ImageInfo.Nr   xobjectz%Either pdfimage or inline must be setr   stencilimager   ),rn   r   
_shorthand_origin
isinstancer"   r    r   width_widthheight_heightobjgetr   SMaskr   maxWidthHeightMask
image_mask_typerq   bits_per_component_bpcrS   filters_encr   rR   
colorspace_colorNotImplementedErrorrH   rC   r2   r   r>   	_init_iccrW   rJ   rL   r:   )selfro   r   r   rg   pimsmaskmasks           rF   __init__ImageInfo.__init__d  s    Y
# #DLC!j6&B&B$DL8$CDEEiizzWW[[T22E? %*!455!%))DJJ":DKKH"599T[[!#<dllKGGKK		400D=
 $ 344!$((4::q"94;;G"488DKK#;T\\J
 >>"DJ DJ../		)--ckk!n=DI	-11#..2FBGDK 99)))$--DK
;;*..(ZX-F-F,DJ$++z22*..t{{;
 zz!diiHNNHNN3S&S*:??;
 'T!%  	DI	
 # 	DK	s$   -'N# -N; #N87N8;OOc                j    UR                   nUb  [	        US5      (       d  [        R                  SU  35        g  UR
                  R                  S:X  a  gUR
                  R                  S:X  a  gg	! [         a%  n[        R                  SU SU  35         S nAg S nAff = f! [         a     g f = f)
NznAn image with a corrupt or unreadable ICC profile was found. Output PDF may not match the input PDF visually: z. profilezuAn image with an ICC profile but no ICC profile data was found. The output PDF may not match the input PDF visually. GRAYrT   CMYKrV   rU   )r>   r#   loggerwarninghasattrr   xcolor_spaceAttributeError)r   r   r>   es       rF   r   ImageInfo._init_icc  s    	''C ;gc955NNHHLvO 	{{''61))V3' ) 	NNDDE3bP 	(  		s.   A3 B% B% 3
B"=BB"%
B21B2c                    U R                   $ )z+Name of the image as it appears in the PDF.)r   r   s    rF   ro   ImageInfo.name       zzrE   c                    U R                   $ )z+Type of image, either 'image' or 'stencil'.)r   r  s    rF   type_ImageInfo.type_  r	  rE   c                    U R                   $ )zWidth of the image in pixels.)r   r  s    rF   r   ImageInfo.width       {{rE   c                    U R                   $ )zHeight of the image in pixels.)r   r  s    rF   r   ImageInfo.height       ||rE   c                    U R                   $ )zBits per component.)r   r  s    rF   bpcImageInfo.bpc  s     yyrE   c                8    U R                   b  U R                   $ S$ )zColorspace of the image.?)r   r  s    rF   colorImageInfo.color  s     #kk5t{{>3>rE   c                8    U R                   b  U R                   $ S$ )z+Number of components/channels in the image.r  )r   r  s    rF   compImageInfo.comp  s     "ZZ3tzz<<rE   c                8    U R                   b  U R                   $ S$ )zEncoding of the image.r   )r   r  s    rF   encImageInfo.enc  s     !II1tyy>w>rE   c                    U R                   R                  =(       a;    U R                  S:  =(       a%    U R                  S:  =(       a    U R                  S:g  $ )zWhether the image is renderable.

Some PDFs in the wild have invalid images that are not renderable,
due to unusual dimensions.

Stencil masks are not also not renderable, since they are not
drawn, but rather they control how rendering happens.
r   r   )r   	is_finiter   r   r  r  s    rF   
renderableImageInfo.renderable  sJ     HH (

a(q ( 

i'		
rE   c                Z    [        U R                  U R                  U R                  45      $ )z^Dots per inch of the image.

Calculated based on where and how the image is drawn in the PDF.
)r   r   r   r   r  s    rF   r   ImageInfo.dpi  s"     $++t||)DEErE   c                    U R                   (       d  g[        U R                  U R                  R                  -  U R
                  U R                  R                  -  -  5      $ )z,Physical area of the image in square inches.rY   )r"  rc   r   r   xr   yr  s    rF   printed_areaImageInfo.printed_area
  s@     djj488::-$++

2JKLLrE   c                    SU R                    SU R                   SU R                   SU R                   SU R                   SU R
                   SU R                   SU R                   SU R                   S3$ )z,Return a string representation of the image.z<ImageInfo 'z' r      ×>)	ro   r  r   r   r  r  r  r  r   r  s    rF   __repr__ImageInfo.__repr__  sm     499+R

|1TZZL4;;-qzzl!DII;az488*AdhhZqJ	
rE   )
r   r   r   r   r   r   r   r   r   r   )r   zObject | Noner   zPdfInlineImage | None)r   r    returnrq   r1  r~   r1  r*   )r1  rc   )r5   r6   r7   r8   r9   r   DPI_PRECrt   r   r   propertyro   r  r   r   r  r  r  r  r"  r   r)  r.  rD   r4   rE   rF   r   r   W  s3    wHJ
 "&(,I<  	I<
 &I<V4           ? ? = = ? ? 
 
  F F M M
rE   r   c              #     #    [        U R                  5       H*  u  p[        SUS 3UR                  UR                  S9v   M,     g7f)z(Find inline images in the contentstream.zinline-02d)ro   rg   r   N)r   r}   r   rg   rx   )contentsinfor   r   s      rF   _find_inline_imagesr9    sC     |99:	1S'"f.>.>v}}
 	
 ;s   AAc              #  h  #    [         R                  U ;  a  gU [         R                     n[         R                  U;  a  gU[         R                     R                  5        HL  u  p#Ub  [         R                  U;  a  M  U[         R                     [         R
                  :X  d  ME  UnXB4v   MN     g7f)a  Search for all XObject-based images in the container.

Usually the container is a page, but it could also be a Form XObject
that contains images. Filter out the Form XObjects which are dealt with
elsewhere.

Generate a sequence of tuples (image, xobj container), where container,
where xobj is the name of the object and image is the object itself,
since the object does not know its own name.

N)r   	ResourcesXObjectitemsSubtypeImage)	container	resourceskey	candidater   s        rF   _image_xobjectsrD  !  s      ~~Y&$..)I||9$#DLL1779I =T\\"djj0 H/! :s   B B2&B2c              #    #    [        U 5       Hx  u  p#X1R                  ;  a  M  UR                  U    HO  nUR                  S:X  a  [        UR                  5      (       a  M/  [        UR                  X$R                  S9v   MQ     Mz     g7f)zFind images stored in the container's /Resources /XObject.

Usually the container is a page, but it could also be a Form XObject
that contains images.

Generates images with their DPI at time of drawing.
r   )ro   r   rg   N)rD  r   rr   rj   rg   r   ro   )r@  r8  r   xobjdraws        rF   _find_regular_imagesrH  :  sr      *)4... ++D1D1$)H)H XXX 2 5s   B	Bc              #    #    [         R                  U;  a  gU[         R                     n[         R                  U;  a  gU[         R                     R                  5       nU H  nXE   nUb1  UR	                  [         R
                  5      [         R                  :w  a  M=  UnUR                   H3  nUR                  U:w  a  M  UR                  n	[        XU	S9 Sh  vN   M5     M     g N7f)zuFind any images that are in Form XObjects in the container.

The container may be a page, or a parent Form XObject.

Npdfr@  rg   )r   r;  r<  as_dictr   r>  Formr|   ro   rg   _process_content_streams)
rK  r@  r8  rA  xobjsrF  rC  form_xobjectr   r   s
             rF   _find_form_xobject_imagesrQ  S  s      ~~Y&$..)I||9$dll#++-EK		dll ;tyy H $55H}}$ %..M/=   6 s   CC,C*C,)rg   c              #  .  #    UR                  [        R                  5      [        R                  :X  a$  [        R                  U;   a  U=(       d    [
        nOUR                  [        R                  5      [        R                  :X  a  U[        R                     [        R                  :X  a`  U(       a  [        U5      O	[        5       nUR                  [        R                  [        5       5      n[        U5      nXd-  nUR                  nOg[        X5      nUR                  (       a  [        5       v   UR                  (       a  [        5       v   [!        U5       Sh  vN   [#        X5       Sh  vN   [%        XU5       Sh  vN   g N, N N	7f)a  Find all individual instances of images drawn in the container.

Usually the container is a page, but it may also be a Form XObject.

On a typical page images are stored inline or as regular images
in an XObject.

Form XObjects may include inline images, XObject images,
and recursively, other Form XObjects; and also vector graphic objects.

Every instance of an image being drawn somewhere is flattened and
treated as a unique image, since if the same image is drawn multiple times
on one page it may be drawn at differing resolutions, and our objective
is to find the resolution at which the page can be rastered without
downsampling.

N)r   r   Typer   Contentsre   r<  r>  rM  r   rg   r   r   r   r   r   r9  rH  rQ  )rK  r@  rg   r   r   form_shorthandform_matrixr8  s           rF   rN  rN  s  s    ( }}TYY499,)1K%4dii DLL0dll#tyy0 $-fY&( #t{{FH=^, MM&yDL  nl"<000#I<<<(FFF 1<Fs6   E F"F#F5F6F	F
FFFc                    [        U5      [        U5      pCSnXS-  SU-
  U-  SU-
  U-  XT-  4nSS jnSnU  H  n	U" X5      (       d  M  Sn  U$    U$ )z4Smarter text detection that ignores text in margins.g      ?rT   c                    U S   US   :  =(       a/    U S   US   :  =(       a    U S   US   :  =(       a    U S   US   :  $ )zCheck if two 4-tuple rects intersect.

Where (a,b) are 4-tuple rects (left-0, top-1, right-2, bottom-3)
https://stackoverflow.com/questions/306316/determine-if-two-rectangles-overlap-each-other
Formula assumes all boxes are in first quadrant.
r   r   rT   rU   r4   )r^   r_   s     rF   rects_intersect'_page_has_text.<locals>.rects_intersect  sI     tad{Jqtad{Jqtad{Jqtad{JrE   FT)r^   	FloatRectr_   r[  r1  r~   )rc   )
text_blocks
page_widthpage_heightpwphmargin_ratiointerior_bboxrY  has_textr   s
             rF   _page_has_textrd    s{    :k 2L	
\	R	
\	R	MK H4//HO	  OrE   c              #    #    U" U 5       Hr  nUR                   S   nUR                   S   n[        U[        5      (       d  M8  UR                  S:g  nUR	                  5       S:H  n[        UR                  XV5      v   Mt     g7f)zuExtract only limited content from text boxes.

We do this to save memory and ensure that our objects are pickleable.
r   rU   u   �N)_objsr   r-   
rendermodeget_textr   r   )
miner_pagetextbox_getterbox
first_line
first_charvisiblecorrupts          rF   simplify_textboxesrp    sx      j)YYq\
%%a(
*&677''1,%%'83#((G55 *s   B Bc                    [        5         [        R                  " S5      R                  U5        U c0  [        R
                  " U5      qS n[        R                  " U5        g g )Npdfminerc                 ,    [         R                  5         g r   )
worker_pdfcloser4   rE   rF   on_process_close1_pdf_pageinfo_sync_init.<locals>.on_process_close  s    rE   )	r,   logging	getLoggersetLevelr   openrt  atexitregister)rK  infilepdfminer_loglevelrv  s       rF   _pdf_pageinfo_sync_initr    sP    j!**+<= {XXf%
	 	() rE   c              #     #    U b  U v   g [         b	  [         v   g [        R                  " U5       nUv   S S S 5        g ! , (       d  f       g = f7fr   )rt  r   r{  )
thread_pdfr~  rK  s      rF   _pdf_pageinfo_sync_pdfr    s;     		XXfI s   /A?	A
A	APageInfoc           	     h    [        X5       n[        X`X#XE5      sS S S 5        $ ! , (       d  f       g = fr   )r  r  )pagenor  r~  check_pagesdetailed_analysisminer_staterK  s          rF   _pdf_pageinfo_syncr    s*     
 
	3s.?
 
4	3	3s   #
1c	                p  ^^^^^^ S /[        U R                  5      -  mSU4S jjn	Uc
  [        5       n[        U R                  5      n
[        S[        T5      S-  -   U5      nUS:X  a  SnU(       a  US:  a  SnU(       a  U OS mUUUUU4S j[	        U
5       5       nU(       a  US:X  d   S5       eUS:  d   S5       e[
        R                  SU S	3U(       a  S
OS-   S-   5        U" UU[        U
SSU(       + S9[        [        TT[        R                  " S5      R                  5      [        UU	S9  T$ )Npagec                h   > U (       d  [        S5      eU TU R                  '   UR                  5         g )NzCould read a page in the PDF)r)   r  update)r  pbarpagess     rF   update_pageinfo1_pdf_pageinfo_concurrent.<locals>.update_pageinfo  s)     !?@@!dkkrE   rT   rV   Tc              3  4   >#    U  H  nUTTTTT4v   M     g 7fr   r4   )r]   r   r  r  r~  initial_pdfr  s     rF   r`   +_pdf_pageinfo_concurrent.<locals>.<genexpr>+  s&      A 
K.?Ms   zNot multithreadablezGathering info with r   threadprocessz workerszScanning contents)totaldescunitdisablerr  )use_threadsmax_workersprogress_kwargsworker_initializertasktask_argumentstask_finished)r  r  r  r'   )r   r  r+   minr   r   debugdictr   r  rx  ry  levelr  )rK  executorr  r  r~  progbarr  r  r  r  r  	n_workerscontextsr  r  s       ` ```    @@rF   _pdf_pageinfo_concurrentr    sG    %)6C		N#:E )+		NEAE
a'5IA~ y1} 	
 %#$K uH )9>S>SS<i1nS>SS<
LL
yk+"8		3
	
 1G
 ##j)//	
  %  LrE   c                  H    \ rS rSr% SrS\S'    S\S'    S\S'    S\S'   Srg	)
PageResolutionProfileiH  z,Information about the resolutions of a page.rc   weighted_dpimax_dpiaverage_to_max_dpi_ratio
area_ratior4   Nrs   r4   rE   rF   r  r  H  s.    6SN2## rE   r  c                     \ rS rSr% SrS\S'   S\S'   / rS\S'     S            S!S	 jjr            S!S
 jr\	S"S j5       r
\	S#S j5       r\	S#S j5       r\	S#S j5       r\	S$S j5       r\	S$S j5       r\	S"S j5       r\	S"S j5       r\	S"S j5       r\R&                  S 5       r\	S%S j5       r\	S%S j5       r\	S%S j5       r\	S&S j5       rS'S(S jjr\	S)S j5       r\	S$S j5       r\	S*S j5       rS+S jrS rSrg),r  ib  z9Information about type of contents on each page in a PDF.bool | None	_has_text_has_vectorlist[ImageInfo]_imagesNc                N    X l         X0l        XPl        U R                  XX4XV5        g)zInitialize a PageInfo object.N)_pageno_infile_detailed_analysis_gather_pageinfo)r   rK  r  r~  r  r  r  s          rF   r   PageInfo.__init__i  s*     "3.?	
rE   c                j   UR                   U   nUR                  R                  5        Vs/ s H  n[        U5      PM     n	nU	S   U	S   -
  n
U	S   U	S   -
  nUR                  R                  5        Vs/ s H  n[        U5      PM     snU l        UR                  R                  5        Vs/ s H  n[        U5      PM     snU l        UR                  R                  5        Vs/ s H  n[        U5      PM     snU l	        X$;   nU(       af  U(       a_  UR                  U5      nUb  [        [        U[        5      5      U l        O/ U l        S U R                   5       n[        XU5      U l        O/ U l        S U l        UR#                  [$        R&                  [        S5      5      n[)        U[        5      (       d  [        U5      nXl        X-  [        S5      -  U l        X-  [        S5      -  U l        [1        [3        UR4                  SS5      5      U l        USSUSS4nU(       a  S	U l        S	U l        / U l        [=        XUS
9 Hy  n[)        U[>        5      (       a	  SU l        M!  [)        U[@        5      (       a	  SU l        M?  [)        U[B        5      (       a  U R:                  RE                  U5        Mq  [G        5       e   OS U l        S U l        / U l        S U l$        U R:                  (       a  [K        SS5      RM                  S U R:                   5       5      nUU l$        [1        [O        URP                  [        U R,                  5      -  5      5      U l)        [1        [O        URT                  [        U R.                  5      -  5      5      U l+        g g s  snf s  snf s  snf s  snf )Nr   r   rU   rT   c              3  8   #    U  H  oR                   v   M     g 7fr   r   )r]   rk  s     rF   r`   ,PageInfo._gather_pageinfo.<locals>.<genexpr>  s     :/3hh/   rX   r   RotateFrJ  TrY   c              3  ^   #    U  H#  oR                   (       d  M  UR                  v   M%     g 7fr   )r"  r   )r]   r   s     rF   r`   r    s      0'3e7G7G			|s   --),r  mediaboxas_listr   cropboxrc   _cropbox	_mediaboxtrimbox_trimboxr/   listrp  r0   
_textboxesrd  r  r   r   UserUnitr   	_userunit_width_inches_height_inchesrq   getattrr   _rotater  r  rN  r   r   r   r   r   _dpir*   take_maxroundr'  _width_pixelsr(  _height_pixels)r   rK  r  r~  r  r  r  r  r   r  width_pt	height_ptcheck_this_pagepage_analysisbboxesuserunituserunit_shorthandinfor   s                      rF   r  PageInfo._gather_pageinfoz  s    YYv&(,(=(=(?@(?1GAJ(?@A;!,QK(1+-	 ,0<<+?+?+AB+Aaq+AB,0MM,A,A,CD,Cq%(,CD+/<<+?+?+AB+Aaq+AB /0'99&AM("&&}nE# #%:$//:F+FiHDN DO!DN88DMM73<8(G,,x(H!%074=@'2WT]B7488Xq9:&1h1=$D"DNDL03E dL11'+D$j11%)DNi00LL''--//  $D!DNDL	<<S#&// 0'+||0 C DI!$U35559K9K3L+L%M!ND"%eCEEE$:M:M4N,N&O"PD u A CDBs   N!5N&.N+'N0c                    U R                   $ )zReturn page number (0-based).)r  r  s    rF   r  PageInfo.pageno  r  rE   c                ,    [        U R                  5      $ )z6Return True if page has text, False if not or unknown.)r~   r  r  s    rF   rc  PageInfo.has_text  s     DNN##rE   c                r    U R                   (       d  [        S5      e[        S U R                   5       5      $ )z>Return True if page has corrupt text, False if not or unknown.zDid not do detailed analysisc              3  8   #    U  H  oR                   v   M     g 7fr   )r   )r]   tboxs     rF   r`   ,PageInfo.has_corrupt_text.<locals>.<genexpr>  s     ?t??r  )r  r   anyr  r  s    rF   has_corrupt_textPageInfo.has_corrupt_text  s.     &&%&DEE?t???rE   c                ,    [        U R                  5      $ )zReturn True if page has vector graphics, False if not or unknown.

Vector graphics are sometimes used to draw fonts, so it may not be
obvious on visual inspection whether a page has text or not.
)r~   r  r  s    rF   
has_vectorPageInfo.has_vector  s     D$$%%rE   c                    U R                   $ )zReturn width of page in inches.)r  r  s    rF   width_inchesPageInfo.width_inches       !!!rE   c                    U R                   $ )z Return height of page in inches.)r  r  s    rF   height_inchesPageInfo.height_inches       """rE   c                ~    [        [        [        U R                  5      U R                  R
                  -  5      5      $ )zReturn width of page in pixels.)rq   r  rc   r  r   r'  r  s    rF   width_pixelsPageInfo.width_pixels  s,     5t001DHHJJ>?@@rE   c                ~    [        [        [        U R                  5      U R                  R
                  -  5      5      $ )z Return height of page in pixels.)rq   r  rc   r  r   r(  r  s    rF   height_pixelsPageInfo.height_pixels  s,     5t112TXXZZ?@AArE   c                    U R                   $ )zDReturn rotation of page in degrees.

Will only be a multiple of 90.
)r  r  s    rF   rotationPageInfo.rotation  s     ||rE   c                2    US;   a  Xl         g [        S5      e)N)r   Z      i  ih  iiLiz!rotation must be a cardinal angle)r  r   )r   values     rF   r  r     s    ;; L@AArE   c                    U R                   $ )z*Return cropbox of page in PDF coordinates.)r  r  s    rF   r  PageInfo.cropbox       }}rE   c                    U R                   $ )z+Return mediabox of page in PDF coordinates.)r  r  s    rF   r  PageInfo.mediabox       ~~rE   c                    U R                   $ )z*Return trimbox of page in PDF coordinates.)r  r  s    rF   r  PageInfo.trimbox  r  rE   c                    U R                   $ )zReturn images.)r  r  s    rF   imagesPageInfo.images  r  rE   c                   ^^^         SS jmU R                   (       d  Tb  Tb  [        S5      eU R                   $ UUU4S jU R                    5       $ )z?Return textareas bounding boxes in PDF coordinates on the page.c                ^    SnUb  U R                   U:w  a  SnUb  U R                  U:w  a  SnU$ )NTF)r   r   )r   want_visiblewant_corruptresults       rF   	predicate)PageInfo.get_textareas.<locals>.predicate  s<     F'>>\1"F'>>\1"FMrE   z#Incomplete information on textboxesc              3  ^   >#    U  H"  nT" UTT5      (       d  M  UR                   v   M$     g 7fr   r  )r]   r   ro  r  rn  s     rF   r`   )PageInfo.get_textareas.<locals>.<genexpr>+  s"     XOSygw7WOs   --)r   r   r  r  r  r  r1  r~   )r  r   )r   rn  ro  r  s    ``@rF   get_textareasPageInfo.get_textareas  sa    
	
	,7
	GR
	
	 "w':)*OPP??"XDOOXXrE   c                L    U R                   c  [        SS5      $ U R                   $ )z3Return DPI needed to render all images on the page.rY   )r  r*   r  s    rF   r   PageInfo.dpi-  s%     99c3''yyrE   c                    U R                   $ )zReturn user unit of page.)r  r  s    rF   r  PageInfo.userunit4  r
  rE   c                     U R                   b  gg)z6Return minimum PDF version needed to render this page.z1.6z1.5r  r  s    rF   min_versionPageInfo.min_version9  s     ==$rE   c                   / n/ nU R                    HZ  nUR                  (       d  M  UR                  UR                  R	                  5       5        UR                  UR
                  5        M\     [        U5      nUS:X  a  gU Vs/ s H  oUU-  PM	     nn[        R                  " X5      n[        U5      nXx-  n	UR                  U5      n
X*   U-  n[        UUU	U5      $ s  snf )a|  Return information about the DPIs of the page.

This is useful to detect pages with a small proportion of high-resolution
content that is forcing us to use a high DPI for the whole page. The ratio
is weighted by the area of each image. If images overlap, the overlapped
area counts.

Vector graphics and text are ignored.

Returns None if there is no meaningful DPI for the page.
r   N)r  r"  r   r   	to_scalarr)  sum
statisticsharmonic_meanr   r?   r  )r   
image_dpisimage_areasr   total_drawn_areaareaweightsr  r  dpi_average_max_ratioarg_max_dpimax_area_ratios               rF   page_dpi_profilePageInfo.page_dpi_profileA  s     
\\E##eii1134u112	 " {+q 7BC{t**{C!//
Dj/ , 6 &&w/$14DD$!	
 	
 Ds   C c                    SU R                    SU R                   SU R                   SU R                   SU R                   SU R
                   S3$ )Return string representation.z<PageInfo pageno=r   z"xz" rotation=z dpi=z
 has_text=r-  )r  r  r  r  r   rc  r  s    rF   r.  PageInfo.__repr__h  sZ    kk]!D$5$5#6b9K9K8L MeDHH:ZaQ	
rE   )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  FN)rK  r   r  rq   r~  r   r  Container[int]r  r~   r  PdfMinerState | Noner0  r2  )r1  r   )r1  r[  )r1  r  )NN)rn  r  ro  r  r3  r1  rn   )r1  zPageResolutionProfile | None)r5   r6   r7   r8   r9   rt   r  r   r  r5  r  rc  r  r  r  r  r  r  r  setterr  r  r  r  r  r   r  r!  r0  r.  rD   r4   rE   rF   r  r  b  s!   C!G_! #(,0

 
 	

 $
  
 *
"JQJQ JQ 	JQ
 $JQ  JQ *JQX   $ $ @ @ & & " " # # A A B B   __B B        Y,      %
N
rE   c                  ,   \ rS rSr% SrSrS\S'   SrS\S'   SrS\S'   SSSS	S\	S
.           SS jjr
\SS j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       rSS jrS rS rSrg)PdfInfoit  zExtract summary information about a PDF without retaining the PDF itself.

Crucially this lets us get the information in a pure Python format so that
it can be pickled and passed to a worker process.
Fr~   _has_acroform_has_signature_needs_renderingNT)r  r  r  r  r  r  c                  Xl         Uc  [        SS5      n[        R                  " U5       nUR                  (       a
  [        5       e[        UR                  R                  [        R                  S5      5      R                  S5      n	U(       a  [        X5      O	[        5       U l        U R                   n
[        UUUUUUUUU
S9	U l        SSS5        UR"                  R                  [        R$                  S5      U l        [        R(                  UR"                  ;   a  [+        UR"                  R(                  R                  [        R,                  / 5      5      S:  a  SU l        O/[        R0                  UR"                  R(                  ;   a  SU l        [3        UR"                  R(                  R                  [        R4                  S5      S	-  5      U l        [3        UR"                  R                  [        R8                  0 5      R                  [        R:                  S5      5      U l        SSS5        g! , (       d  f       GNx= f! , (       d  f       g= f)
zInitialize.Nr   i ʚ;r   PScript5)r  r  r  FTrT   )r  r   r   r{  is_encryptedr(   rn   docinfor   r   Creator
startswithr.   r   _miner_stater  _pagesRootNeedsRenderingr>  AcroFormr   Fieldsr<  XFAr~   SigFlagsr=  MarkInfoMarked
_is_tagged)r   r~  r  r  r  r  r  r  rK  pscript5_moder  s              rF   r   PdfInfo.__init__  s    =1KXXf'))b ABMMM
 % f4 ] 
 ""k6 +&7 +
 # %(HHLL1D1De$LD!}}(sxx((,,T[["=>B)-D&XX!2!22)-D&&*388+<+<+@+@PQ+RUV+V&W#"T]]B/33DKKGDO=  #" s%   BI7H?E(I?
I		I
Ic                    U R                   $ )z9Return list of PageInfo objects, one per page in the PDF.rF  r  s    rF   r  PdfInfo.pages  r  rE   c                :    [        S U R                   5       5      $ )z5Return minimum PDF version needed to render this PDF.c              3  J   #    U  H  o(       d  M  UR                   v   M     g 7fr   )r!  r]   r  s     rF   r`   &PdfInfo.min_version.<locals>.<genexpr>  s     C
d#4##
s   
##)r   r  r  s    rF   r!  PdfInfo.min_version  s     C

CCCrE   c                :    [        S U R                   5       5      $ )z(Return True if any page has a user unit.c              3  P   #    U  H  o(       d  M  UR                   S :g  v   M     g7f)rX   Nr   rW  s     rF   r`   'PdfInfo.has_userunit.<locals>.<genexpr>  s     GJD$'4==C'Js   
&&)r  r  r  s    rF   has_userunitPdfInfo.has_userunit  s     GDJJGGGrE   c                    U R                   $ )z4Return True if the document catalog has an AcroForm.)r<  r  s    rF   has_acroformPdfInfo.has_acroform  r  rE   c                    U R                   $ )z@Return True if the document annotations has a digital signature.)r=  r  s    rF   has_signaturePdfInfo.has_signature  r  rE   c                    U R                   $ )zCReturn True if the document catalog indicates this is a Tagged PDF.)rO  r  s    rF   	is_taggedPdfInfo.is_tagged  s     rE   c                |    [        U R                  [        [        -  5      (       d  [	        S5      eU R                  $ )zReturn filename of PDF.zcan't get filename from stream)r   r  rn   r   r   r  s    rF   filenamePdfInfo.filename  s/     $,,d
33%&FGG||rE   c                    U R                   $ )zReturn True if PDF contains XFA forms.

XFA forms are not supported by most standard PDF renderers, so we
need to detect and suppress them.
)r>  r  s    rF   needs_renderingPdfInfo.needs_rendering  s     $$$rE   c                     U R                   U   $ )z.Return PageInfo object for page number `item`.rS  )r   items     rF   __getitem__PdfInfo.__getitem__  s    {{4  rE   c                ,    [        U R                  5      $ )zReturn number of pages in PDF.)r   rF  r  s    rF   __len__PdfInfo.__len__  s    4;;rE   c                     S[        U 5       S3$ )r3  z<PdfInfo('...'), page count=r-  )r   r  s    rF   r.  PdfInfo.__repr__  s    -c$i[::rE   )r<  r=  r  rO  rE  r>  rF  )r~  r   r  r~   r  r~   r  r   r  r~   r  r%   )r1  Sequence[PageInfo | None]r8  r2  )r1  z
str | Path)r1  r  )r5   r6   r7   r8   r9   r<  rt   r=  r>  DEFAULT_EXECUTORr   r5  r  r!  r]  r`  rc  rf  ri  rl  rp  rs  r.  rD   r4   rE   rF   r;  r;  t  s/     M4 ND "d" #("& -00  	0
 0  0 0 0d   D D
 H H " " # #     % %! ;rE   r;  c                     SSK n SSKJn  U R                  5       nUR                  S5        UR	                  5       n[        UR                  5      nU" U5        UR                   H&  nU" U5        UR                   H  nU" U5        M     M(     g)zRun as a script.r   N)pprintr~  )	argparserz  ArgumentParseradd_argument
parse_argsr;  r~  r  r  )r{  rz  parserargspdfinfor  ims          rF   mainr    sq    $$&F
!Ddkk"G
7Ot++B2J  rE   __main__)r   r   r3  )r8  rz   r1  Iterator[ImageInfo])r1  zIterator[tuple[Object, str]])r@  r   r8  rz   r1  r  )rK  r   r@  r   r8  rz   )rK  r   r@  r   r1  z/Iterator[VectorMarker | TextMarker | ImageInfo])r\  zIterable[FloatRect]r1  r~   )ri  r   rj  z'Callable[[LTPage], Iterator[LTTextBox]]r1  zIterator[TextboxInfo])rK  r   r~  r   )r  
Pdf | Noner~  r   )r  rq   r  r  r~  r   r  r6  r  r~   r  r7  r1  r  r5  )r  r%   r  rq   r  r~   r  r~   r  r7  r1  rw  )r9   
__future__r   r|  rx  r   r&  collectionsr   collections.abcr   r   r   r   r	   r
   
contextlibr   r   decimalr   enumr   r   	functoolsr   mathr   r   r   osr   pathlibr   typingr   warningsr   pdfminer.layoutr   r   pikepdfr   r   r   r   r   r   r    r!   r"   r#   r$   ocrmypdf._concurrentr%   r&   ocrmypdf._progressbarr'   ocrmypdf.exceptionsr(   r)   ocrmypdf.helpersr*   r+   r,   ocrmypdf.pdfinfo.layoutr-   r.   r/   r0   ry  r   r2   rH   tuplerc   r[  r:   r;   r<   r=   r>   r?   r@   rA   rB   rR   rt   rJ   rK   rC   rL   rM   rN   rO   rP   rQ   rS   rW   re   rj   rl   rv   rz   r   r   r   r   r   r   r   r9  rD  rH  rQ  rN  rd  rp  rt  r  r  r  r  r  r  rx  r;  r  r5   r4   rE   rF   <module>r     s  
 6 "   	  # V V 2    $ $     -    : - A Q Q  
			  t  %u,-	 ??..~~??NN  >>""
//NN__


. * &  ~~--##NNNNMMLL>>* &  OOQNNAOOQNNAa($  -Aj Z 4: 4* N NL L' BM Yx:$z
 
D
"2YY%1YY2B /32G2G"2G42Gj:66(O66" 
*"  


 
  	

 
 &
 
, $(,BB B 	B B &B BJJ 4L
 L
^ "# v; v;r" zF rE   