
    j                        U d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ dd	lmZ d
dgZddd eed                   ddddd eed                   dddZeeeeef         f         ed<    G d dej                  Z G d dej                  Z	 d"dededee         dz  dedef
dZd#dededefd Zd#dededefd!ZdS )$    )deepcopy)AnyN)nn)VOCABS)EncoderBlock)PatchEmbedding   load_pretrained_paramsvit_svit_b)gh|?5?g=
ףp=?gV-?)gA`"?gl?g$C?r	       r   frenchzHhttps://doctr-static.mindee.com/models?id=v0.6.0/vit_s-5d05442d.pt&src=0)meanstdinput_shapeclassesurlzHhttps://doctr-static.mindee.com/models?id=v0.6.0/vit_b-0fbef167.pt&src=0)r   r   default_cfgsc                   T     e Zd ZdZdededdf fdZdej        dej        fdZ xZ	S )	ClassifierHeadzClassifier head for Vision Transformer

    Args:
        in_channels: number of input channels
        num_classes: number of output classes
    in_channelsnum_classesreturnNc                 |    t                                                       t          j        ||          | _        d S N)super__init__r   Linearhead)selfr   r   	__class__s      j/var/www/html/Carbon-Document/venv/lib/python3.11/site-packages/doctr/models/classification/vit/pytorch.pyr   zClassifierHead.__init__/   s2    
 	Ik;77			    xc                 @    |                      |d d df                   S )Nr   )r!   )r"   r&   s     r$   forwardzClassifierHead.forward8   s     yy111a4!!!r%   )
__name__
__module____qualname____doc__intr   torchTensorr(   __classcell__r#   s   @r$   r   r   '   s         88 8 
	8 8 8 8 8 8" "%, " " " " " " " "r%   r   c                        e Zd ZdZ	 	 	 	 	 	 dded	ed
ededeeef         deeeef         dedededee	e
f         dz  ddf fdZde	de
ddfdZ xZS )VisionTransformera  VisionTransformer architecture as described in
    `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
    <https://arxiv.org/pdf/2010.11929.pdf>`_.

    Args:
        d_model: dimension of the transformer layers
        num_layers: number of transformer layers
        num_heads: number of attention heads
        ffd_ratio: multiplier for the hidden dimension of the feedforward layer
        patch_size: size of the patches
        input_shape: size of the input image
        dropout: dropout rate
        num_classes: number of output classes
        include_top: whether the classifier head should be instantiated
       r5   r             TNd_model
num_layers	num_heads	ffd_ratio
patch_sizer   dropoutr   include_topcfgr   c           
          t          |||          t          |||||z  |t          j                              g}|	r#|                    t          ||                      t                      j        |  |
| _        d S r   )	r   r   r   GELUappendr   r   r   r?   )r"   r8   r9   r:   r;   r<   r   r=   r   r>   r?   _layersr#   s               r$   r   zVisionTransformer.__init__N   s     ;<<Y99LgWYW^W`W`aa$
  	ANN>';??@@@'""r%   path_or_urlkwargsc                 "    t          | |fi | dS )zLoad pretrained parameters onto the model

        Args:
            path_or_url: the path or URL to the model parameters (checkpoint)
            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
        Nr
   )r"   rD   rE   s      r$   from_pretrainedz!VisionTransformer.from_pretrainede   s"     	t[;;F;;;;;r%   )r4   r   r6   r7   TN)r)   r*   r+   r,   r-   tuplefloatbooldictstrr   r   rG   r0   r1   s   @r$   r3   r3   =   s        , '-,7 %)   	
  #s(O 3S=)    #s(^d" 
     .<3 <# <$ < < < < < < < <r%   r3   arch
pretrainedignore_keysrE   r   c                    |                     dt          t          |          d                             |d<   |                     dt          |          d                   |d<   |                     dt          |          d                   |d<   t          t          |                    }|d         |d<   |d         |d<   |d         |d<   |                    d           t          dd|i|}|rV|d         t          t          |          d                   k    r|nd }|                    t          |          d         |           |S )Nr   r   r   r?   r   )rO    )getlenr   r   popr3   rG   )rM   rN   rO   rE   _cfgmodel_ignore_keyss          r$   _vitrX   o   s<    #JJ}c,t:LY:W6X6XYYF="JJ}l46H6WXXF=

9l4.@.KLLF9L&''D /D /DY'DO
JJy 11$1&11E S '-]&;s<PTCUV_C`?a?a&a&a{{gkl407\RRRLr%   Fc           	      .    t          d| fddddddgd|S )	a  VisionTransformer-S architecture
    `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
    <https://arxiv.org/pdf/2010.11929.pdf>`_. Patches: (H, W) -> (H/8, W/8)

    NOTE: unofficial config used in ViTSTR and ParSeq

    >>> import torch
    >>> from doctr.models import vit_s
    >>> model = vit_s(pretrained=False)
    >>> input_tensor = torch.rand((1, 3, 32, 32), dtype=tf.float32)
    >>> out = model(input_tensor)

    Args:
        pretrained: boolean, True if model is pretrained
        **kwargs: keyword arguments of the VisionTransformer architecture

    Returns:
        A feature extractor model
    r   i        r5   2.head.weight2.head.biasr8   r9   r:   r;   rO   rX   rN   rE   s     r$   r   r      sF    ( 	 $m4	 	 	 	 	r%   c           	      .    t          d| fddddddgd|S )a]  VisionTransformer-B architecture as described in
    `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
    <https://arxiv.org/pdf/2010.11929.pdf>`_. Patches: (H, W) -> (H/8, W/8)

    >>> import torch
    >>> from doctr.models import vit_b
    >>> model = vit_b(pretrained=False)
    >>> input_tensor = torch.rand((1, 3, 32, 32), dtype=tf.float32)
    >>> out = model(input_tensor)

    Args:
        pretrained: boolean, True if model is pretrained
        **kwargs: keyword arguments of the VisionTransformer architecture

    Returns:
        A feature extractor model
    r   i   rZ   r5   r\   r]   r^   r_   r`   s     r$   r   r      sF    $ 	 $m4	 	 	 	 	r%   r   )F)copyr   typingr   r.   r   doctr.datasetsr    doctr.models.modules.transformerr   'doctr.models.modules.vision_transformerr   utilsr   __all__listr   rK   rL   __annotations__Moduler   
Sequentialr3   rJ   rX   r   r   rQ   r%   r$   <module>rm      s5                       ! ! ! ! ! ! 9 9 9 9 9 9 B B B B B B + + + + + +G

 &$"4x())Y  &$"4x())Y + +d3S#X&'   $" " " " "RY " " ",/< /< /< /< /< /< /< /<j %) 
 cT! 	
    8 d c 6G    @ d c 6G      r%   