
    TAiR&                    6   S r SSKJr  SSKrSSKrSSKJr  SSKJ	r	  SSK
Jr  SSK
Jr  SSK
Jr  SS	K
Jr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJr  SSKJr  SSKJr  SSKJr  SSKJr  SSKJr  SSKJr  SSKJr  SSKJ r   SSKJ!r!  \\\\S.r" " S S5      r#g)z0Functions to handle all operations on the PDF's.    )annotationsN)Path)Any)LTChar)LTImage)LTTextLineHorizontal)LTTextLineVertical)	PdfReader)	PdfWriter)StrByteType   )	TableList)Hybrid)Lattice)Network)Stream)TemporaryDirectory)download_url)get_image_char_and_text_objects)get_page_layout)get_rotation)is_url)latticestreamnetworkhybridc                      \ rS rSrSr   S
 SS jjrS r        SS jr    S       SS jjr      SS jr	S	r
g)
PDFHandler'   a*  Handles all operations on the PDF's.

Handles all operations like temp directory creation, splitting
file into single page PDFs, parsing each PDF and then removing the
temp directory.

Parameters
----------
filepath : str
    Filepath or URL of the PDF file.
pages : str, optional (default: '1')
    Comma-separated page numbers.
    Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
    Password for decryption.
debug : bool, optional (default: False)
    Whether the parser should store debug information during parsing.
Nc                :   X@l         [        U5      (       a  [        [        U5      5      nXl        [        U[        5      (       a/  UR                  5       R                  S5      (       d  [        S5      eUc  SU l	        OX0l	        U R                  U5      U l        g )N.pdfzFile format not supported )debugr   r   strfilepath
isinstancelowerendswithNotImplementedErrorpassword
_get_pagespages)selfr%   r,   r*   r#   s        L/var/www/html/land-ocr/venv/lib/python3.13/site-packages/camelot/handlers.py__init__PDFHandler.__init__;   sy     
(#CM2H2:h$$X^^-=-F-Fv-N-N%&ABBDM$M__U+
    c                   / nUS:X  a  UR                  SSS.5        GO[        U R                  SS9nUR                  (       a  UR	                  U R
                  5        US:X  a(  UR                  S[        UR                  5      S.5        OUR                  S5       H  nSU;   aV  UR                  S5      u  pVUS	:X  a  [        UR                  5      nUR                  [        U5      [        U5      S.5        M_  UR                  [        U5      [        U5      S.5        M     / nU H'  nUR                  [        US
   US	   S-   5      5        M)     [        [        U5      5      $ )a!  Convert pages string to list of integers.

Parameters
----------
filepath : str
    Filepath or URL of the PDF file.
pages : str, optional (default: '1')
    Comma-separated page numbers.
    Example: '1,3,4' or '1,4-end' or 'all'.

Returns
-------
P : list
    List of int page numbers.

1r   )startendFstrictall,-r5   r4   )appendr
   r%   is_encrypteddecryptr*   lenr,   splitintextendrangesortedset)	r-   r,   page_numbersinfilerabresultps	            r.   r+   PDFHandler._get_pagesP   s"   " C<!A 67t}}U;F""t}}-~##aFLL8I$JKS)Aax wws|: #FLL 1A$++c!fSV,LM$++c!fSV,LM * AMM%'
AeHqL9: c&k""r1   c                   [        USS9nUR                  (       a  UR                  U R                  5        [        R
                  R                  USU S35      n[        R
                  R                  U5      u  pxUR                  US-
     n	[        5       n
U
R                  U	5        [        US5       nU
R                  U5        SSS5        [        U40 UD6u  p[        U5      u  pnn[        UUU5      nUS:w  Ga/  SR                  UR!                  S	S
5      SU/5      n[        R"                  " UU5        [        US5      n[        USS9nUR                  (       a  UR                  U R                  5        [        5       n
UR                  S   n	US:X  a  U	R%                  S5        OUS:X  a  U	R%                  S5        U
R                  U	5        [        US5       nU
R                  U5        SSS5        [        U40 UD6u  p[        U5      u  pnnUR'                  5         XXUU4$ XXUU4$ ! , (       d  f       GNu= f! , (       d  f       NY= f)a  Saves specified page from PDF into a temporary directory.

Parameters
----------
filepath : str
    Filepath or URL of the PDF file.
page : int
    Page number.
temp : str
    Tmp directory.


Returns
-------
layout : object

dimensions : tuple
    The dimensions of the pdf page

filepath : str
    The path of the single page PDF - either the original, or a
    normalized version.

Fr6   page-r!   r   wbNr"   pagerK   _rotatedrbr   anticlockwiseZ   	clockwisei)r
   r<   r=   r*   ospathjoinsplitextr,   r   add_pageopenwriter   r   r   replacerenamerotateclose)r-   r%   rP   templayout_kwargsrF   fpathfrootfextrK   outfileflayout
dimensionsimagescharshorizontal_textvertical_textrotation	fpath_newinstreams                        r.   
_save_pagePDFHandler._save_page|   s   D 8E2NN4==)TU4&#56gg&&u-LL"+%!MM! ,UDmD8W9
5  Fr>vs!;Z NOIIIeY'It,Hx6F""t}}-kGQA?*[(QeT"aa  # "1!H-!HF/7 :F?M NNvo}TT6/=PP? , #"s   5H5"I5
I
Ic                   Uc  0 n/ n[         U   nU" SSU R                  0UD6n[        5        n	[        R                  " 5       n
U(       a  [        U R                  5      S:  a  U
S:  a  [        R                  " S5      R                  U
S9 n/ nU R                   H3  nUR                  U R                  XXU45      nUR                  U5        M5     U H$  nUR                  5       nUR                  U5        M&     SSS5        O7U R                   H'  nU R                  XXU5      nUR                  U5        M)     SSS5        [        [        U5      5      $ ! , (       d  f       N*= f! , (       d  f       N3= f)a  Extract tables by calling parser.get_tables on all single page PDFs.

Parameters
----------
flavor : str (default: 'lattice')
    The parsing method to use.
    Lattice is used by default.
suppress_stdout : bool (default: False)
    Suppress logs and warnings.
parallel : bool (default: False)
    Process pages in parallel using all available cpu cores.
layout_kwargs : dict, optional (default: {})
    A dict of `pdfminer.layout.LAParams
    <https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_ kwargs.
kwargs : dict
    See camelot.read_pdf kwargs.

Returns
-------
tables : camelot.core.TableList
    List of tables found in PDF.

Nr#   r   spawn)	processes )PARSERSr#   r   mp	cpu_countr>   r,   get_contextPoolapply_async_parse_pager;   getrA   r   rC   )r-   flavorsuppress_stdoutparallelrb   kwargstables
parser_objparsertempdirry   pooljobsrK   jts                   r.   parsePDFHandler.parse   sI   >  MV_
7$**77!WI C

Oa/IM^^G,11I1F$D!ZZ ,, ,,-P A ( "EEGa( " GF A((F]A MM!$	 $% "0 ((' GF "!s&   AE+A0E>A E+
E(	$E++
E9c                    U R                   " U R                  X40 UD6u  pgpp[        R                  R	                  USU S35      nUR                  UUUUUU
UUS9  UR                  5       nU$ )a  Extract tables by calling parser.get_tables on a single page PDF.

Parameters
----------
page : int
    Page number to parse
parser : Lattice, Stream, Network or Hybrid
    The parser to use.
suppress_stdout : bool
    Suppress logs and warnings.
layout_kwargs : dict, optional (default: {})
    A dict of `pdfminer.layout.LAParams
    <https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_ kwargs.

Returns
-------
tables : camelot.core.TableList
    List of tables found in PDF.

rN   r!   )rb   )rq   r%   rV   rW   rX   prepare_page_parseextract_tables)r-   rP   r   r   r   rb   rh   ri   rj   rk   rl   rm   	page_pathr   s                 r.   r}   PDFHandler._parse_page  s    0 OODMM4JMJ 	JF? GGLLE$t*<=	!!' 	" 		
 &&(r1   )r#   r%   r,   r*   )r3   NF)r%   zStrByteType | Path | str)r%   zStrByteType | PathrP   r@   ra   r$   returnzrtuple[Any, tuple[float, float], list[LTImage], list[LTChar], list[LTTextLineHorizontal], list[LTTextLineVertical]])r   FFN)r   r$   r   boolr   r   rb   zdict[str, Any] | None)rP   r@   r   r$   r   r   )__name__
__module____qualname____firstlineno____doc__r/   r+   rq   r   r}   __static_attributes__rv   r1   r.   r   r   '   s    , ,*,**#XIQ*IQ25IQ=@IQ
IQZ   %/3?)?) ?) 	?)
 -?)B&&"%&@D&r1   r   )$r   
__future__r   multiprocessingrx   rV   pathlibr   typingr   pdfminer.layoutr   r   r   r	   pypdfr
   r   pypdf._utilsr   corer   parsersr   r   r   r   utilsr   r   r   r   r   r   rw   r   rv   r1   r.   <module>r      sn    6 "  	   " # 0 .   $      %  2 "   	G Gr1   