
    SAiM                        S SK r S SKrS SKrS SKJr  S SKJrJrJr  S SK	J
r
JrJrJrJrJrJrJrJrJrJr  S SKJr  S SKJr  S SKJrJr  S SKJr  S	S
KJrJ r   S	SK!J"r"J#r#  \RH                  " \%5      r&\
(       a  S	SK'J(r(  S	SK)J*r*  \S/\+4   r,S\S   S\\-\\-   \,4   S\S   4S jr. " S S5      r/\ " S S\/5      5       r0 " S S\15      r2 " S S\/5      r3g)    N)deque)asdict	dataclassfield)TYPE_CHECKINGAnyCallableDictIterableIteratorListOptionalPatternTupleUnion)
NumberTree)	PDFParser)	PDFObjRefresolve1)	PSLiteral   )T_bboxT_obj)decode_textgeometry)Page)PDFPDFStructElementelementsmatcherreturnc              #     ^#    SSS[         4U4S jjnSSS[         4U4S jjn[        T[        5      (       a  UnO$[        T[        R                  5      (       a  UnOTn[        U 5      nU(       aO  UR                  5       nU" U5      (       a  Uv   UR                  [        UR                  5      5        U(       a  MN  gg7f)z5
Common code for `find_all()` in trees and elements.
xr   r!   c                 "   > U R                   T:H  $ )zMatch an element name.)typer#   r    s    P/var/www/html/land-ocr/venv/lib/python3.13/site-packages/pdfplumber/structure.py	match_tag_find_all.<locals>.match_tag/   s    vv      c                 :   > TR                  U R                  5      $ )z,Match an element name by regular expression.)matchr%   r&   s    r'   match_regex_find_all.<locals>.match_regex3   s    }}QVV$$r*   N)
bool
isinstancestrrer   r   popleft
extendleftreversedchildren)r   r    r(   r-   
match_funcdels    `     r'   	_find_allr:   '   s     !' !D !%) %d % '3
	GRZZ	(	( 

hA
YY[b>>H	Xbkk*+	 !s   B;C Cc                   ~    \ rS rSr% Sr\S   \S'   S\\\	\   \
4   S\S   4S jrS\\\	\   \
4   S\S   4S jrS	rg
)FindableE   zNfind() and find_all() methods that can be inherited to avoid
repeating oneselfr   r6   r    r!   c                 .    [        U R                  U5      $ )zIterate depth-first over matching elements in subtree.

The `matcher` argument is either an element name, a regular
expression, or a function taking a `PDFStructElement` and
returning `True` if the element matches.
)r:   r6   selfr    s     r'   find_allFindable.find_allK   s     00r*   c                 b     [        [        U R                  U5      5      $ ! [         a     gf = f)zFind the first matching element in subtree.

The `matcher` argument is either an element name, a regular
expression, or a function taking a `PDFStructElement` and
returning `True` if the element matches.
N)nextr:   r6   StopIterationr?   s     r'   findFindable.findV   s/    		$--9:: 		s   ! 
.. N)__name__
__module____qualname____firstlineno____doc__r   __annotations__r   r1   r   	MatchFuncr   rA   r   rF   __static_attributes__rH   r*   r'   r<   r<   E   sk     %&&	1S'#,	9:	1	$	%	1S'#,	9:	$	%r*   r<   c                   8   \ rS rSr% \\S'   \\   \S'   \\   \S'   \\   \S'   \\   \S'   \\   \S'   \\   \S'   \\   \S	'   \" \	S
9r
\\\4   \S'   \" \S
9r\\   \S'   \" \S
9r\S    \S'   S\S    4S jrS\\\\   \4      4S jrS\\\4   4S jrSrg)r   e   r%   revisionidlangalt_textactual_texttitlepage_number)default_factory
attributesmcidsr6   r!   c                 ,    [        U R                  5      $ Niterr6   r@   s    r'   __iter__PDFStructElement.__iter__s       DMM""r*   c              #   L  #    U R                    H  nU R                  U4v   M     [        U R                  5      nU(       aa  UR	                  5       nUR                    H  nUR                  U4v   M     UR                  [        UR                  5      5        U(       a  M`  gg7f)zqCollect all MCIDs (with their page numbers, if there are
multiple pages in the tree) inside a structure element.
N)r\   rY   r   r6   r3   r4   r5   )r@   mcidr8   r9   s       r'   	all_mcidsPDFStructElement.all_mcidsv   s{     
 JJD""D(( $-- Bnnd** !LL"++./	 as   BB$"B$c                    [        U 5      n[        U/5      nU(       al  UR                  5       n[        UR	                  5       5       H  nX4   b  X4   / :X  d
  X4   0 :X  d  M  X4	 M     SU;   a  UR                  US   5        U(       a  Ml  U$ )z'Return a compacted dict representation.r6   )r   r   r3   listkeysextend)r@   rr8   r9   ks        r'   to_dictPDFStructElement.to_dict   s|    4L1#JB"'')_5=BERK25B; % RJ( a r*   rH   N)rI   rJ   rK   rL   r1   rN   r   intr   dictr[   r
   r   rj   r\   r   r6   r   rb   r   rg   ro   rP   rH   r*   r'   r   r   e   s    
Ism
3-sm#C=#!&t!<JS#X<T2E492).t)DHd%&D#(#56 #08E(3-*<$=> 0c3h r*   c                       \ rS rSrSrg)StructTreeMissing   rH   N)rI   rJ   rK   rL   rP   rH   r*   r'   rt   rt      s    r*   rt   c                      \ rS rSr% Sr\S   \S'   SSSS\S   4S jjrS	\\	\
4   S
\\   S\\	\
4   4S jrS	\
S\\\   \\
   4   4S jrS\\
   SS4S jrS	\\	\
4   S\4S jrSS jrS\\	\
4   SS4S jrS\\   4S jrS\S\4S jrSrg)PDFStructTree   aN  Parse the structure tree of a PDF.

The constructor takes a `pdfplumber.PDF` and optionally a
`pdfplumber.Page`.  To avoid creating the entire tree for a large
document it is recommended to provide a page.

This class creates a representation of the portion of the
structure tree that reaches marked content sections, either for a
single page, or for the whole document.  Note that this is slightly
different from the behaviour of other PDF libraries which will
also include structure elements with no content.

If the PDF has no structure, the constructor will raise
`StructTreeMissing`.

r   pageNdocr   c                 j  ^ UR                   U l         SU R                   R                  ;  a  [        S5      e[        U R                   R                  S   5      U l        [        U R                  R                  S0 5      5      U l        [        U R                  R                  S0 5      5      U l        / U l        Ub  X l	        UR                  U0U l        S U l        U R                  R                  S5      nUc  U R                  5         g [        U5      nSU R                  R                  R                   ;  a  g U R                  R                  R                   S   m[        [#        U4S jUR$                   5       5      5      nU R'                  U5        g S U l	        UR                   Vs0 s H  o"R                  U_M     snU l        U R                  R%                  5        Vs0 s H#  o"R                  R(                  UR                  _M%     snU l        U R                  5         g s  snf s  snf )NStructTreeRootzPDF has no structureRoleMapClassMap
ParentTreeStructParentsc              3   <   >#    U  H  u  pUT:X  d  M  Uv   M     g 7fr^   rH   ).0numarray	parent_ids      r'   	<genexpr>)PDFStructTree.__init__.<locals>.<genexpr>   s     X1C:3siGW1Cs   	)rz   catalogrt   r   rootgetrole_map	class_mapr6   ry   rY   pages	page_dict_parse_struct_treer   page_objattrsrD   values_parse_parent_treepageid)r@   rz   ry   parent_tree_objparent_treeparent_arrayr   s         @r'   __init__PDFStructTree.__init__   s   77488#3#33#$:;;TXX--.>?@	 y"!=>!$))--
B"?@02 I**D1DJ!DN #iimmL9O&'')(9 #$))*<*<*B*BB II..44_E	'X1C1CXX  ''5DI=@YYGYT**D0YGDJCG::CTCTCVCV4$$d&6&66CVDN ##%	 Hs   .H+(*H0objrS   r!   c                 
   / nS HP  nXA;  a  M
  [        X   5      n[        U[        5      (       a  UR                  U5        M?  UR	                  U5        MR     / nS nU HT  n[        U[
        5      (       a  X:X  a  Ub  UR	                  U5        S nM5  Ub  UR	                  U5        [        U5      nMV     Ub  UR	                  U5        0 n	U H  n[        U[        5      (       aK  [        UR                  5      nX@R                  ;  a  [        R                  SU5        MT  U R                  U   nUR                  5        H9  u  p[        U[        5      (       a  [        UR                  5      X'   M3  X   X'   M;     M     U	$ )N)CAzUnknown attribute class %s)r   r0   rj   rl   appendrq   r   r   namer   loggerwarningitems)r@   r   rS   attr_obj_listkeyattr_obj	attr_objsprev_objarefattrrn   vs               r'   _make_attributesPDFStructTree._make_attributes   s[    C~)H(D))$$X.$$X.  	!D $$$#(<$$X.'$$X.#D> " X& C#y))!#((+nn,NN#?EnnS)		a++)!&&1DG!fDG	 $  r*   c                    SU;  d
   SU-  5       eSU;  d
   SU-  5       eS nU R                   b=  SU;   a7  US   R                  nX0R                   ;   d
   SU-  5       eU R                   U   nSnSU;   aI  [        US   R                  5      nX@R                  ;   a"  [        U R                  U   R                  5      nS	U;   a  [        US	   5      O/ n[        U[        5      (       a  U/nO[        U[        5      (       a  US	   /nUR                  S
5      nU R                  X5      nSU;   a  [        [        US   5      5      OS nSU;   a  [        [        US   5      5      OS n	SU;   a  [        [        US   5      5      OS n
SU;   a  [        [        US   5      5      OS nSU;   a  [        [        US   5      5      OS n[        UUUUU
U	UUUS9	nX4$ )NMCIDzUncaught MCR: %sObjzUncaught OBJR: %sPgzObject on unparsed page: %s SKRIDTLangAlt
ActualText)	r%   rT   rY   rS   rU   rX   rV   rW   r[   )r   objidr   r   r   r   r0   rq   rr   r   r   r   )r@   r   rY   
page_objidobj_tagr6   rS   r[   
element_idrX   rU   rV   rW   elements                 r'   _make_elementPDFStructTree._make_element
  s   S :"4s":: C:!4s!::>>%$#+TJ/T1NQT1TT/..4K#:!#c(--0G--'%dmmG&<&A&AB),8CH%h$$ zH$''CzH773<**39
9=[#d)!45$
36#:HSX./45;s]{8CK018=;xE
34$8D8KK\!234QU 	 ###!

   r*   r   c                    [        U5      n0 nSnU(       a  UR                  5       nU[        R                  :X  a  M-  [	        U5      U;   a  M>  [        U5      nSU;   a  [        US   R                  5      S:X  a  SnO;U R                  U5      u  pxUc   eXx4U[	        U5      '   UR                  US   5        U(       a  M  U(       d   eU R                  U5        g)zQPopulate the structure tree using the leaves of the parent tree for
a given page.FTyper|   TNP)r   r3   r   KEYWORD_NULLreprr   r   r   r   r   _resolve_children)	r@   r   r8   s
found_rootrefr   r   r6   s	            r'   r    PDFStructTree._parse_parent_tree4  s     ,
))+C i,,,CyA~3-C}S[-=-=!>BR!R!
 %)$6$6s$;!***&0$s)S"' a* zq!r*   c                     SU;  a  gUS   R                   nU R                  b  X R                  ;   $ U R                  b$  X R                  R                  R                  :w  a  gg)Nr   TF)r   r   ry   r   r   )r@   r   r   s      r'   on_parsed_pagePDFStructTree.on_parsed_pageS  sV    s?Y__
>>%//99 YY//666r*   c                 X  ^ ^^	 [        T R                  S   5      n[        U[        5      (       a  T R                  S   /n[	        U5      n0 m	U(       Ga  UR                  5       n[        U5      T	;   a  M)  [        U5      n[        U[        5      (       a.  SU;   a(  T R                  U5      (       d  Mg  US   n[        U5      nT R                  U5      u  pVXV4T	[        U5      '   U Hw  n[        U5      n[        U[        5      (       a,  T R                  U5      (       d  M;  SU;   a  US   nOSU;   a  MO  [        U[        5      (       d  Mf  UR                  U5        My     U(       a  GM  S[        [           S[        [           4UU	U 4S jjmT" U5        T R                  T	5        g)z_Populate the structure tree starting from the root, skipping
unparsed pages and empty elements.r   r   r   r   r!   c                   > / nU  H  n[        U5      n[        U[        5      (       a  UR                  U5        M6  [        U[        5      (       a?  TR                  U5      (       d  Mc  SU;   a  UR                  US   5        M  SU;   a  US   nT[        U5         u  pET" U5      nUb  U(       d  T[        U5      	 M  XE4T[        U5      '   UR                  U5        M     U$ )Nr   r   )r   r0   rq   r   rr   r   r   )	r   next_elementsr   r   r   r6   pruner   r@   s	         r'   r   /PDFStructTree._parse_struct_tree.<locals>.prune  s    Msmc3''!((-T**..s33 }%,,S[9 #!%j$%d3iL! ??($s)#*#4Ad3iL!((-)  * ! r*   N)r   r   r0   rr   r   r3   r   r   r   r   r   r   r   r   )
r@   r   r8   r   r   r   r6   childr   r   s
   `       @@r'   r    PDFStructTree._parse_struct_tree_  sk    		#' dD!!IIcN#D$K))+CCyA~3-C#t$$#**3//%jsm $ 2 23 7G #,Ad3iL!uoc4((..s33 | #E
3 eY//HHUO "! a<	!DI 	!$s) 	! 	!2 	dq!r*   seenc                 b   [        U R                  S   5      n[        U[        5      (       a  U R                  S   /n/ U l        / nU Hh  n[        U5      n[        U[        5      (       a#  SU;   a  U R                  U5      (       d  MA  US   n[        U5      U;   d  MW  UR                  U5        Mj     [        U5      nU(       Ga=  UR                  5       nU[        U5         u  pxUc   S5       eU GH  n	[        U	5      n[        U[        5      (       a  UR                  R                  U5        O][        U[        5      (       aH  U R                  U5      (       d  Mm  SU;   a  UR                  R                  US   5        OSU;   a  US   n	[        U	[        5      (       d  M  UR                  [        U	5      S5      u  pU
c  M  UR                  R                  U
5        UR                  U	5        GM     U(       a  GM=  U Vs/ s H  oA[        U5         S   PM     snU l        gs  snf )zlResolve children starting from the tree root based on references we
saw when traversing the structure tree.
r   r   NzUnparsed elementr   )NNr   )r   r   r0   rr   r6   r   r   r   r   r3   rq   r\   r   r   )r@   r   r   parsed_rootr   r   r8   r   r6   r   child_element_s               r'   r   PDFStructTree._resolve_children  s    		#'dD!!IIcN#DC3-C#t$$#**3//%jCyD ""3'  +))+C $T#YG&:(::&!uoc3''MM((-T**..s33 },,S[9# #E
eY//'+xxU\'J$M$0((//># "	 a, 8CC{d3i+{CCs   	H,c                 ,    [        U R                  5      $ r^   r_   ra   s    r'   rb   PDFStructTree.__iter__  rd   r*   r9   c                    SnU R                   b  U R                   nO&UR                  b  U R                  UR                     nUR                  R	                  SS5      nUb  Ub  SSK JnJnJn  U" U" U5      UR                  S   UR                  S   -
  5      n[        X$5      (       aS  [        R                  " U5      nUR                  U/5      nU(       d  [        S5      e[        R                  " US   5      $ U$ / n	UR                  5        H  u  pU
c>  Ub8  [         R"                  R%                  UR&                  R)                  5       5      nOG/ nOD[         R"                  R%                  U R                  U
   R&                  R)                  5       5      nU H  nUS   U:X  d  M  U	R+                  U5        M!     M     U	(       d  [        S	5      e[        R,                  " U	5      $ )
z9Get the bounding box for an element for visual debugging.NBBoxr   )CroppedPage_invert_box_normalize_box   zElement no longer on pager   rf   zNo objects found)ry   rY   r   r[   r   r   r   r   mediaboxr0   r   bbox_to_rect_crop_fn
IndexErrorobj_to_bboxrg   	itertoolschainfrom_iterableobjectsr   r   objects_to_bbox)r@   r9   ry   bboxr   r   r   rectrects	mcid_objsrY   rf   r   cs                 r'   element_bboxPDFStructTree.element_bbox  s   99 99D^^'::bnn-D}}  . 0FF
 t$dmmA&6q9I&ID $,,,,T2tf-$%@AA++E!H55 I%'\\^!&'"+//"?"?@S@S@U"V"$'oo;;

;/77>>@G !AyD(!((+ ! &4  !344++I66r*   )r6   r   rz   ry   r   r   r   r   r^   )r!   N)rI   rJ   rK   rL   rM   r   rN   r   r
   r1   r   rq   r   r   r   r   r   r   r/   r   r   r   r   rb   r   r   rP   rH   r*   r'   rw   rw      s   " 6
,&E ,&&)9 ,&\/S>/-5c]/	c3h/b(! (!x8H/I4PS9/T)U (!T"tCy "T ">
$sCx. 
T 
B"H*Dd38n *D *DX#(#34 #-7/ -7F -7r*   rw   )4r   loggingr2   collectionsr   dataclassesr   r   r   typingr   r   r	   r
   r   r   r   r   r   r   r   pdfminer.data_structuresr   pdfminer.pdfparserr   pdfminer.pdftypesr   r   pdfminer.psparserr   _typingr   r   utilsr   r   	getLoggerrI   r   ry   r   pdfr   r/   rO   r1   r:   r<   r   
ValueErrorrt   rw   rH   r*   r'   <module>r     s      	  0 0    0 ( 1 ' " (			8	$  ()4/0	,)*,3i/0,  !,< @ *x * *Z	
 	h7H h7r*   