
    j`'                        U d dl mZ d dlmZ d dlmZ d dlZd dlmZ d dlm	Z
 d dlmZ d dlmZ d	d
lmZmZ d	dlmZmZ ddlmZmZ g dZddded         ddddded         dddZeeeeef         f         ed<    G d deej                  Z G d de          Z	 d'dede dee gej        f         ded e!e         dz  d!ed"efd#Z"d(de d!ed"efd%Z#d(de d!ed"efd&Z$dS ))    )Callable)deepcopy)AnyN)nn)
functional)IntermediateLayerGetter)VOCABS   )vit_bvit_s)_bf16_to_float32load_pretrained_params   )_ViTSTR_ViTSTRPostProcessor)ViTSTRvitstr_smallvitstr_base)gh|?5?g=
ףp=?gV-?)gA`"?gl?g$C?r
          frenchzOhttps://doctr-static.mindee.com/models?id=v0.7.0/vitstr_small-fcd12655.pt&src=0)meanstdinput_shapevocaburlzNhttps://doctr-static.mindee.com/models?id=v0.7.0/vitstr_base-50b21df2.pt&src=0)r   r   default_cfgsc                   &    e Zd ZdZ	 	 	 	 ddededed	eeeef         d
edeee	f         dz  ddf fdZ
dede	ddfdZ	 	 	 ddej        dee         dz  dededeee	f         f
dZedej        dej        dej        dej        fd            Z xZS )r   av  Implements a ViTSTR architecture as described in `"Vision Transformer for Fast and
    Efficient Scene Text Recognition" <https://arxiv.org/pdf/2105.08582.pdf>`_.

    Args:
        feature_extractor: the backbone serving as feature extractor
        vocab: vocabulary used for encoding
        embedding_units: number of embedding units
        max_length: maximum word length handled by the model
        dropout_prob: dropout probability of the encoder LSTM
        input_shape: input shape of the image
        exportable: onnx exportable returns only logits
        cfg: dictionary containing information about the model
    r   r   FNr   embedding_units
max_lengthr   
exportablecfgreturnc                 &   t                                                       || _        || _        || _        |dz   | _        || _        t          j        |t          | j                  dz             | _
        t          | j                  | _        d S )N   r   )r   )super__init__r   r"   r#   r!   feat_extractorr   LinearlenheadViTSTRPostProcessorpostprocessor)	selffeature_extractorr   r    r!   r   r"   r#   	__class__s	           j/var/www/html/Carbon-Document/venv/lib/python3.11/site-packages/doctr/models/recognition/vitstr/pytorch.pyr(   zViTSTR.__init__8   s~     	
$$q./Ios4:/BCC	0tzBBB    path_or_urlkwargsc                 "    t          | |fi | dS )zLoad pretrained parameters onto the model

        Args:
            path_or_url: the path or URL to the model parameters (checkpoint)
            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
        N)r   )r/   r4   r5   s      r2   from_pretrainedzViTSTR.from_pretrainedM   s"     	t[;;F;;;;;r3   xtargetreturn_model_outputreturn_predsc                                           |          d         }|                     |          \  }}t          j        |                              t          j                  t          j        |          }	}|                    |j                  |	                    |j                  }	} j        r|t          d          |d d d  j
        f         }|                                \  }
}}|                    |
|z  |          }                     |                              |
|t           j                  dz             }t#          |d d dd f                   }i } j        r||d<   |S |r||d<   ||r[t          j        j        dt          j        dt,          t.          t0          t2          f                  f fd	            } ||          |d
<   |                     |||	          |d<   |S )Nfeaturesdtypez&Need to provide labels during trainingr   logitsout_mapdecoded_featuresr$   c                 .                         |           S N)r.   )rB   r/   s    r2   _postprocessz$ViTSTR.forward.<locals>._postprocessx   s    ))*:;;;r3   predsloss)r)   build_targettorch
from_numpytolongtensordevicetraining
ValueErrorr!   sizereshaper,   viewr+   r   r   r"   compilerdisableTensorlisttuplestrfloatcompute_loss)r/   r8   r9   r:   r;   r=   _gt_seq_lengtseq_lenBNEr@   rB   outrE   s   `                r2   forwardzViTSTR.forwardV   s    &&q))*5 --f55MC*3//222DDelS[F\F\B%%//7::ah+?+?B= 	GV^EFFF AAA0001--//1a##AE1--8$$))!QDJ!0CDD+F111abb5M:: ? 	,CMJ 	.-C	N>\>^#<u| <U3PU:EV@W < < < < < $#< (<(899CL++,<b'JJCK
r3   model_outputr^   r_   c                    | j         d         }|dz   }t          j        |                     ddd          |ddddf         d          }t	          j        || j                  dddf         |dddf         k    }d||<   |                    d          |                    | j	                  z  }|
                                S )	al  Compute categorical cross-entropy loss for the model.
        Sequences are masked after the EOS character.

        Args:
            model_output: predicted logits of the model
            gt: the encoded tensor with gt labels
            seq_len: lengths of each gt word inside the batch

        Returns:
            The loss of the model on the batch
        r   r   r&   Nnone)	reduction)rN   r>   )shapeFcross_entropypermuterI   arangerN   sumrK   r?   r   )re   r^   r_   	input_lenccemask_2dce_losss          r2   r[   zViTSTR.compute_loss   s    $ !&q)	A+ ol221a;;R122YRXYYY,y1DEEEdAAAgNRYZ[Z[Z[]aZaRbbG''!**wzz0BzCCC||~~r3   )r   r   FN)NFF)__name__
__module____qualname____doc__rY   intrX   booldictr   r(   r7   rI   rV   rW   rd   staticmethodr[   __classcell__)r1   s   @r2   r   r   )   s        & ,8 %)C C C 	C
 C 3S=)C C #s(^d"C 
C C C C C C*<3 <# <$ < < < < $($)", ,<, S	D , "	,
 , 
c3h, , , ,\ lL  
	   \    r3   r   c                   H    e Zd ZdZdej        deeee	f                  fdZ
dS )r-   zPost processor for ViTSTR architecture

    Args:
        vocab: string containing the ordered sequence of supported characters
    r@   r$   c                 b    |                     d          }t          j        |d                              d          d          fd|                                                                D             }fdt          |          D             }t          t          ||                    S )N)dimr   c                     g | ]<}d                      fd|D                                           d          d         =S ) c              3   2   K   | ]}j         |         V  d S rD   )
_embedding).0idxr/   s     r2   	<genexpr>z:ViTSTRPostProcessor.__call__.<locals>.<listcomp>.<genexpr>   s*      @@SDOC(@@@@@@r3   z<eos>r   )joinsplit)r   encoded_seqr/   s     r2   
<listcomp>z0ViTSTRPostProcessor.__call__.<locals>.<listcomp>   s^     
 
 
 GG@@@@K@@@@@FFwOOPQR
 
 
r3   c                     g | ]Z\  }}|rQ|d t          |          f                             dd                                                                          nd[S )Nr   r   g        )r+   clipr   item)r   iword
preds_probs      r2   r   z0ViTSTRPostProcessor.__call__.<locals>.<listcomp>   sq     
 
 
U\UVX\4PJq+CII+~&++Aq116688==???S
 
 
r3   )	argmaxrI   softmaxmaxcpunumpy	enumeraterW   zip)r/   r@   out_idxsword_valuesprobsr   s   `    @r2   __call__zViTSTRPostProcessor.__call__   s    
 ==$$]62..22r2::1=

 
 
 
'||~~3355
 
 


 
 
 
`iju`v`v
 
 
 CU++,,,r3   N)rs   rt   ru   rv   rI   rV   rW   rX   rY   rZ   r    r3   r2   r-   r-      sT         -- 
eCJ	 - - - - - -r3   r-   arch
pretrainedbackbone_fnlayerignore_keysr5   r$   c                 b   t          t          |                    }|                    d|d                   |d<   |                    d|d                   |d<   |                    dd          }|d         |d<   |d         |d<   t           |d|d         |          |di          }|                    dd            |                    dd            t          |fd	|i|}	|rI|d         t          |          d         k    r|nd }
|	                    t          |          d
         |
           |	S )Nr   r   
patch_size      F)r   r   r=   pretrained_backboner#   r   )r   )r   r   getr   popr   r7   )r   r   r   r   r   r5   _cfgr   r)   model_ignore_keyss              r2   _vitstrr      sK    L&''DJJwW66DM **]D4GHHDL&11J7mF7O /F= -EtM':zRRR	
 N JJ|T"""
JJ$d+++ >66t6v66E S '+7m|D7I'7R&R&R{{X\l407\RRRLr3   Fc                 8    t          d| t          dfddddgd|S )a7  ViTSTR-Small as described in `"Vision Transformer for Fast and Efficient Scene Text Recognition"
    <https://arxiv.org/pdf/2105.08582.pdf>`_.

    >>> import torch
    >>> from doctr.models import vitstr_small
    >>> model = vitstr_small(pretrained=False)
    >>> input_tensor = torch.rand((1, 3, 32, 128))
    >>> out = model(input_tensor)

    Args:
        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
        kwargs: keyword arguments of the ViTSTR architecture

    Returns:
        text recognition architecture
    r   1i  r   head.weight	head.biasr    r   r   )r   r   r   r5   s     r2   r   r      sF    " 		
 "K0	 	 	 	 	r3   c                 8    t          d| t          dfddddgd|S )a4  ViTSTR-Base as described in `"Vision Transformer for Fast and Efficient Scene Text Recognition"
    <https://arxiv.org/pdf/2105.08582.pdf>`_.

    >>> import torch
    >>> from doctr.models import vitstr_base
    >>> model = vitstr_base(pretrained=False)
    >>> input_tensor = torch.rand((1, 3, 32, 128))
    >>> out = model(input_tensor)

    Args:
        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
        kwargs: keyword arguments of the ViTSTR architecture

    Returns:
        text recognition architecture
    r   r   i   r   r   r   r   )r   r   r   s     r2   r   r     sF    " 		
 "K0	 	 	 	 	r3   rD   )F)%collections.abcr   copyr   typingr   rI   r   torch.nnr   rj   torchvision.models._utilsr   doctr.datasetsr	   classificationr   r   utilsr   r   baser   r   __all__r   ry   rY   __annotations__Moduler   r-   rx   rW   r   r   r   r   r3   r2   <module>r      s   % $ $ $ $ $ $                    $ $ $ $ $ $ = = = = = = ! ! ! ! ! ! * * * * * * * * = = = = = = = = / / / / / / / /
3
3
3 &$#!`  &$#!_ + +d3S#X&'   $x x x x xWbi x x xv- - - - -. - - -B %)$ $
$$ 4&")+,$ 	$
 cT!$ $ $ $ $ $N T S V    : D C F      r3   