
    j?                        U d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 ddlmZ dd	lmZmZmZmZmZmZmZ d
dgZddd eed                   ddddd eed                   dddZeeeeef         f         ed<    G d dej                  Z G d dej                  Z G d dej                  Zd0de dedefdZ!d0de dedefd Z"d!ede d"ee         dedef
d#Z#	 	 	 	 d1d$e$d%e$d&e$d'e%d(ee%         d)e$d*e$d+e d,e$dz  dej        fd-Z&	 	 	 	 d1d$e$d%e$d&e$d'e%d(ee%         d)e$d*e$d+e d,e$dz  dej        fd.Z'	 	 	 	 d1d$e$d%e$d&e$d'e%d(ee%         d)e$d*e$d+e d,e$dz  dej        fd/Z(dS )2    )deepcopy)AnyN)VOCABS)AdaptiveAvgPool2d   load_pretrained_params   )CrossShapedWindowAttentionMultiHeadSelfAttention	OSRABlock
PatchEmbedPatchMergingPermuteLayerSqueezeLayervip_tinyvip_base)gh|?5?g=
ףp=?gV-?)gA`"?gl?g$C?r       r   frenchzLhttps://doctr-static.mindee.com/models?id=v0.11.0/vip_tiny-033ed51c.pt&src=0)meanstdinput_shapeclassesurlzLhttps://doctr-static.mindee.com/models?id=v0.11.0/vip_base-f6ea2ff5.pt&src=0)r   r   default_cfgsc                   P     e Zd ZdZdedef fdZdej        dej        fdZ xZ	S )ClassifierHeadzKClassification head which averages the features and applies a linear layer.in_featuresout_featuresc                 |    t                                                       t          j        ||          | _        d S )N)super__init__nnLinearfc)selfr   r    	__class__s      j/var/www/html/Carbon-Document/venv/lib/python3.11/site-packages/doctr/models/classification/vip/pytorch.pyr#   zClassifierHead.__init__1   s0    )K66    xreturnc                 T    |                      |                    d                    S )Nr
   dim)r&   r   )r'   r+   s     r)   forwardzClassifierHead.forward5   s     wwqvv!v}}%%%r*   )
__name__
__module____qualname____doc__intr#   torchTensorr0   __classcell__r(   s   @r)   r   r   .   sw        UU7C 7s 7 7 7 7 7 7& &%, & & & & & & & &r*   r   c                        e Zd ZdZ	 	 	 	 ddedej        dej        dz  dej        dz  ded	edz  f fd
Z	de
j        de
j        fdZ xZS )VIPBlockzNUnified block for Local, Global, and Mixed feature mixing in VIP architecture.NF	embed_dim
local_unitglobal_unitproj
downsampleout_dimc                     t                                                       |r|t          d          || _        || _        || _        |rt          ||          nd| _        dS )aH  
        Args:
            embed_dim: dimension of embeddings
            local_unit: local mixing block(s)
            global_unit: global mixing block(s)
            proj: projection layer used for mixed mixing
            downsample: whether to downsample at the end
            out_dim: out channels if downsampling
        Nz0`out_dim` must be specified if `downsample=True`)r/   rA   )r"   r#   
ValueErrorr=   r>   r?   r   r@   )r'   r<   r=   r>   r?   r@   rA   r(   s          r)   r#   zVIPBlock.__init__<   sp    $ 	 	Q'/OPPP$&	JT^,9gFFFFZ^r*   r+   r,   c                 0   |j         \  }}}}| j        I| j        D ]?}|                    |d|          } ||||f          }|                    |||d          }@nt	          | j        | j                  D ]\  }}|                    |d|          }t          j        |dd          \  }	}
 ||	||f          }	 ||
||f          }
t          j        |	|
gd          }|                    dd          	                                                    |d||          }| 
                    |          |z   }|                    dddd          	                                }t          | j        t          j                  r|                     |          }|S )	z
        Forward pass for VIPBlock.

        Args:
            x: input tensor (B, H, W, C)

        Returns:
            Transformed tensor
        N   )chunksr/   r.   r
   r   r   )shaper>   r=   reshapezipr6   chunkcat	transpose
contiguousr?   permute
isinstancer@   r$   Module)r'   r+   bhwCblklblkgblkx1x2s              r)   r0   zVIPBlock.forwardW   s    W
1a # + +IIaQ''CAq6NNIIaAr**	+ "$/43CDD 	7 	7
dIIaQ''Qqa888BT"q!f%%T"q!f%%Ir2hA...KK1%%0022::1b!QGGIIaLL1$IIaAq))4466dory11 	#""Ar*   )NNFN)r1   r2   r3   r4   r5   r$   
ModuleListrQ   boolr#   r6   r7   r0   r8   r9   s   @r)   r;   r;   9   s        XX -1!% "_ __ M_ ]T)	_
 i$_ _ t_ _ _ _ _ _6$ $%, $ $ $ $ $ $ $ $r*   r;   c                        e Zd ZdZ	 	 	 	 ddededee         d	ee         d
ee         dee         dee         dee         deeeef         dededee	e
f         dz  ddf fdZd Zde	de
ddfdZ xZS )VIPNetzU
    VIP (Vision Permutable) encoder architecture, adapted for text recognition.
    r     TNin_channelsrA   
embed_dimsdepths	num_heads
mlp_ratiossplit_sizes	sr_ratiosr   num_classesinclude_topcfgr,   c                    || _         d t          j        ddt                              D             fdt	          t                              D             }t          ||d                   g}t          t          t          g}t          |          D ]\  }}||         }|         }||         }||         }||         }||         }||         }|t          |          dz
  k     r||dz            nd} |||||||||du|	  	        }|                    |           |                    t          j        t          j        |d	         d
          t          d          t!          |d	         df          t#          d                               t          j        t          j        |d	         |d          t          j                    t          j        d                    }|                    |           |r#|                    t+          ||
                      t-                      j        |  |                     | j                   dS )a|  
        Args:
            in_channels: number of input channels
            out_dim: final embedding dimension
            embed_dims: list of embedding dims per stage
            depths: number of blocks per stage
            num_heads: number of heads for attention blocks
            mlp_ratios: ratio for MLP expansion
            split_sizes: local window split sizes
            sr_ratios: used for some global block adjustments
            input_shape: (C, H, W)
            num_classes: number of output classes
            include_top: if True, append a classification head
            cfg: optional config dictionary
        c                 6    g | ]}|                                 S  )item).0r+   s     r)   
<listcomp>z#VIPNet.__init__.<locals>.<listcomp>   s     EEEAqvvxxEEEr*   r   g?c           
      ~    g | ]9}t          d |                   t          d |dz                               :S )Nr
   )sum)rn   irb   dprs     r)   ro   z#VIPNet.__init__.<locals>.<listcomp>   sE    ^^^ac#fRaRj//CwQw,@,@@A^^^r*   )r`   r<   r
   N)	r<   depthrc   	mlp_ratio
split_sizesr_ratio	drop_pathr@   rA   rE   gư>)eps)r   rF   r   r
   r   r.   F)bias)p)ri   r6   linspacerq   rangelenr   _vip_local_mixer_vip_mixed_mixer_vip_global_mha_mixer	enumerateappendr$   
Sequential	LayerNormr   r   r   r%   	HardswishDropoutr   r"   r#   apply_init_weights)r'   r`   rA   ra   rb   rc   rd   re   rf   r   rg   rh   ri   
drop_pathslayersmixer_functionsrr   mixer_fnr<   depth_inum_headru   sp_sizerw   rx   next_dimblockmlp_headrs   r(   s       `                       @r)   r#   zVIPNet.__init__   sv   < EE3F!D!DEEE^^^^^5QTU[Q\Q\K]K]^^^
'K:VW=YYYZ
 !
 %_55 	! 	!KAx"1IQiG |H"1I!!nG |H"1I,-J!0C,C,Cz!a%((HH#"#"!#$D0 
 
 
E MM%     	MZ^666\**!:b>1"566###	 	
 	
 	
 =IjngE:::LNNJ
 

 	h 	@MM.+>>???&!!

4%&&&&&r*   c                 P   t          |t          j                  rVt          j                            |j        d           |j        't          j                            |j        d           d S d S t          |t          j                  r)t          j        	                    |j        dd           d S t          |t          j
        t          j        f          rLt          j                            |j        d           t          j                            |j        d           d S d S )Ng{Gz?)r   r   fan_outrelu)modenonlinearityg      ?)rP   r$   r%   inittrunc_normal_weightrz   	constant_Conv2dkaiming_normal_r   BatchNorm2d)r'   ms     r)   r   zVIPNet._init_weights   s    a## 	-G!!!(!555v!!!!&!,,,,, "!29%% 	-G##AH96#RRRRRBL".9:: 	-Gafa(((Gah,,,,,	- 	-r*   path_or_urlkwargsc                 "    t          | |fi | dS )zLoad pretrained parameters onto the model

        Args:
            path_or_url: the path or URL to the model parameters (checkpoint)
            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
        Nr   )r'   r   r   s      r)   from_pretrainedzVIPNet.from_pretrained   s"     	t[;;F;;;;;r*   )r   r_   TN)r1   r2   r3   r4   r5   listtupler\   dictstrr   r#   r   r   r8   r9   s   @r)   r^   r^   ~   sW         -8 %)Y' Y'Y' Y' I	Y'
 S	Y' 9Y' IY' #YY' 9Y' 3S=)Y' Y' Y' #s(^d"Y' 
Y' Y' Y' Y' Y' Y'v	- 	- 	-<3 <# <$ < < < < < < < <r*   r^   F
pretrainedr   r,   c                 N    t          d| fddg dg dg dg dg dg d	d
dgd	|S )aX  
    VIP-Tiny encoder architecture.Corresponds to SVIPTRv2-T variant in the paper (VIPTRv2 function
    in the official implementation:
    https://github.com/cxfyxl/VIPTR/blob/main/modules/VIPTRv2.py)

    Args:
        pretrained: whether to load pretrained weights
        **kwargs: optional arguments

    Returns:
        VIPNet model
    r   r      )@         )r   r   r   )rF         )r   r   r   r
   rF   r   r   rF   rF   6.fc.weight	6.fc.bias	r`   rA   ra   rb   rc   rd   re   rf   ignore_keys_vipr   r   s     r)   r   r      sj      !>>yy))99II))"K0    r*   c                 N    t          d| fddg dg dg dg dg dg d	d
dgd	|S )aZ  
    VIP-Base encoder architecture. Corresponds to SVIPTRv2-B variant in the paper (VIPTRv2B function
    in the official implementation:
    https://github.com/cxfyxl/VIPTR/blob/main/modules/VIPTRv2.py)

    Args:
        pretrained: whether to load pretrained weights
        **kwargs: optional arguments

    Returns:
        VIPNet model
    r   r   r   )r   r   i  )r      	   )r   r      )r   r   r   r   r   r   r   r   r   r   s     r)   r   r     sj      "??yy**99II))"K0    r*   archr   c                    |                     dt          t          |          d                             |d<   |                     dt          |          d                   |d<   |                     dt          |          d                   |d<   t          t          |                    }|d         |d<   |d         |d<   |d         |d<   |                    d           t          dd|i|}|rV|d         t          t          |          d                   k    r|nd}|                    t          |          d         |           |S )	a	  
    Internal constructor for the VIPNet models.

    Args:
        arch: architecture key
        pretrained: load pretrained weights?
        ignore_keys: layer keys to ignore
        **kwargs: arguments passed to VIPNet

    Returns:
        VIPNet instance
    rg   r   r   ri   Nr   )r   rl   )getr~   r   r   popr^   r   )r   r   r   r   _cfgmodel_ignore_keyss          r)   r   r   -  s:   $ #JJ}c,t:LY:W6X6XYYF="JJ}l46H6WXXF=

9l4.@.KLLF9L&''D /D /DY'DO
JJy&&t&v&&E S '-]&;s<PTCUV_C`?a?a&a&a{{gkl407\RRRLr*   r<   rt   rc   ru   rx   rv   rw   r@   rA   c	                      t          j         fdt          |          D                       }	t           |	||          S )a  Builds a VIPBlock performing local (cross-shaped) window attention.

    Args:
        embed_dim: embedding dimension.
        depth: number of attention blocks in this stage.
        num_heads: number of attention heads.
        mlp_ratio: ratio used to expand the hidden dimension in MLP.
        split_size: size of the local window splits.
        sr_ratio: parameter needed for cross-compatibility between different mixers
        drop_path: list of per-block drop path rates.
        downsample: whether to apply PatchMerging at the end.
        out_dim: output embedding dimension if downsampling.

    Returns:
        A VIPBlock (local attention) for one stage of the VIP network.
    c                 F    g | ]}t          d |                   S )Tr/   rc   ru   qkv_biasrv   rx   r   rn   rr   rx   r<   ru   rc   rv   s     r)   ro   z$_vip_local_mixer.<locals>.<listcomp>p  sP     
 
 
  	#!l	
 	
 	

 
 
r*   r=   r@   rA   r$   r[   r}   r;   
r<   rt   rc   ru   rx   rv   rw   r@   rA   blockss
   ` ````    r)   r   r   U  sv    6 ] 
 
 
 
 
 
 
 
 u
 
 
 
 
F I&ZQXYYYYr*   c	                      t          j         fdt          |          D                       }	t           |	||          S )a  Builds a VIPBlock performing global multi-head self-attention.

    Args:
        embed_dim: embedding dimension.
        depth: number of attention blocks in this stage.
        num_heads: number of attention heads.
        mlp_ratio: ratio used to expand the hidden dimension in MLP.
        drop_path: list of per-block drop path rates.
        split_size: parameter needed for cross-compatibility between different mixers
        sr_ratio: parameter needed for cross-compatibility between different mixers
        downsample: whether to apply PatchMerging at the end.
        out_dim: output embedding dimension if downsampling.

    Returns:
        A VIPBlock (global MHA) for one stage of the VIP network.
    c           
      D    g | ]}t          d |                   S )T)r/   rc   ru   r   drop_path_rate)r   )rn   rr   rx   r<   ru   rc   s     r)   ro   z)_vip_global_mha_mixer.<locals>.<listcomp>  sM     	 	 	  	$Q<	
 	
 	
	 	 	r*   r   r   r   s
   ` ```     r)   r   r     s{    6 ] 	 	 	 	 	 	 	 u	 	 	 	 	F 	   r*   c	                     t          d dz            }	t          j        t          j          dd           t          j                    t          j                   t          j         |	d          t          j                    t          j        |	          t          j        |	 d          t          j                             }
t          j         fdt          |          D                       }t          j         fdt          |          D                       }t           |||
||	          S )
a  Builds a VIPBlock performing mixed local+global attention.

    Args:
        embed_dim: embedding dimension.
        depth: number of attention blocks in this stage.
        num_heads: total number of attention heads.
        mlp_ratio: ratio used to expand the hidden dimension in MLP.
        drop_path: list of per-block drop path rates.
        split_size: size of the local window splits (for the local half).
        sr_ratio: reduce spatial resolution in the global half (OSRA).
        downsample: whether to apply PatchMerging at the end.
        out_dim: output embedding dimension if downsampling.

    Returns:
        A VIPBlock (mixed local+global) for one stage of the VIP network.
       r   r   r
   )kernel_sizepaddinggroups)r   c                 L    g | ] }t          d z  d|                   !S )rF   Tr   r   r   s     r)   ro   z$_vip_mixed_mixer.<locals>.<listcomp>  sT     
  
  
   	#Q!l	
 	
 	

  
  
 r*   c           
      P    g | ]"}t          d z  d z  |                   #S )rF   )r/   rw   rc   ru   rx   )r   )rn   rr   rx   r<   ru   rc   rw   s     r)   ro   z$_vip_mixed_mixer.<locals>.<listcomp>  sU     	! 	! 	!  	Q1nl	
 	
 	
	! 	! 	!r*   )r=   r>   r?   r@   rA   )	maxr$   r   r   GELUr   r[   r}   r;   )r<   rt   rc   ru   rx   rv   rw   r@   rA   	inner_dimr?   r=   r>   s   ` `````      r)   r   r     s|   8 B	Q''I=
	)YAqSSS
		
y!!
	)YA666
		
y!!
	)YA666
y!!	 	D  
  
  
  
  
  
  
  
  u
  
  
  
 
J - 	! 	! 	! 	! 	! 	! 	! 	! u	! 	! 	! 	 	K    r*   )F)r
   r
   FN))copyr   typingr   r6   torch.nnr$   doctr.datasetsr   doctr.models.modules.layersr   utilsr	   r   r   r   r   r   r   r   r   __all__r   r   r   r   __annotations__rQ   r   r;   r   r^   r\   r   r   r   r5   floatr   r   r   rl   r*   r)   <module>r      s                       ! ! ! ! ! ! 9 9 9 9 9 9 + + + + + +                  z
" &$"4x())]  &$"4x())] + +d3S#X&'   $& & & & &RY & & &B B B B Bry B B BJr< r< r< r< r<R] r< r< r<j       :       :"
"" c" 	"
 " " " "\ &Z &Z&Z&Z &Z 	&Z
 E{&Z &Z &Z &Z 4Z&Z Y&Z &Z &Z &Zd * *** * 	*
 E{* * * * 4Z* Y* * * *l H HHH H 	H
 E{H H H H 4ZH YH H H H H Hr*   