
    j$                        U d dl mZ d dlmZ d dlmZ d dlmZ d dlZd dl	m
Z
 d dlm
c mZ d dlmZ d dlmZmZ dd	lmZ dd
lmZmZ ddlmZmZ ddgZdddded         ddiZeeeeef         f         ed<    G d de          Z  G d dee
j!                  Z"	 	 d%dede#dee#ge
j!        f         dede#de$e         dz  d ed!e"fd"Z%d&de#d ed!e"fd$Z&dS )'    )Callable)deepcopy)groupby)AnyN)IntermediateLayerGetter)VOCABSdecode_sequence   )vip_tiny)_bf16_to_float32load_pretrained_params   )RecognitionModelRecognitionPostProcessorVIPTR
viptr_tiny)gh|?5?g=
ףp=?gV-?)gA`"?gl?g$C?r
          frenchzNhttps://doctr-static.mindee.com/models?id=v0.11.0/viptr_tiny-1cb2515e.pt&src=0)meanstdinput_shapevocaburldefault_cfgsc                       e Zd ZdZeed         dfdej        dede	de
eeef                  fd            Zdej        de
eeef                  fd	Zd
S )VIPTRPostProcessorzPostprocess raw prediction of the model (logits) to a list of words using CTC decoding

    Args:
        vocab: string containing the ordered sequence of supported characters
    r   r   logitsr   blankreturnc                 :   t          j        | d                              d          j                            d          j        }fdt          j        | d          D             }t          t          ||	                                                    S )am  Implements best path decoding as shown by Graves (Dissertation, p63), highly inspired from
        <https://github.com/githubharald/CTCDecoder>`_.

        Args:
            logits: model output, shape: N x T x C
            vocab: vocabulary to use
            blank: index of blank label

        Returns:
            A list of tuples: (word, confidence)
        dim   c           	          g | ]=}t          fd t          |                                          D                       >S )c                 &    g | ]\  }}|k    |S  r)   ).0k_r    s      i/var/www/html/Carbon-Document/venv/lib/python3.11/site-packages/doctr/models/recognition/viptr/pytorch.py
<listcomp>z?VIPTRPostProcessor.ctc_best_path.<locals>.<listcomp>.<listcomp>A   s"    OOO41aAJJQJJJ    )r	   r   tolist)r*   seqr    r   s     r-   r.   z4VIPTRPostProcessor.ctc_best_path.<locals>.<listcomp>@   sY     
 
 
 OOOO73::<<+@+@OOOQVWW
 
 
r/   )
Fsoftmaxmaxvaluesmintorchargmaxlistzipr0   )r   r   r    probswordss    ``  r-   ctc_best_pathz VIPTRPostProcessor.ctc_best_path+   s    $ 	&b)))--"-55<@@Q@GGN
 
 
 
 
|F333
 
 

 Cu||~~..///r/   c                 `    |                      || j        t          | j                            S )a9  Performs decoding of raw output with CTC and decoding of CTC predictions
        with label_to_idx mapping dictionary

        Args:
            logits: raw output of the model, shape (N, C + 1, seq_len)

        Returns:
            A tuple of 2 lists: a list of str (words) and a list of float (probs)

        )r   r   r    )r=   r   len)selfr   s     r-   __call__zVIPTRPostProcessor.__call__G   s)     !!tzTZ!YYYr/   N)__name__
__module____qualname____doc__staticmethodr   r7   Tensorstrintr9   tuplefloatr=   rA   r)   r/   r-   r   r   $   s           H%0 000 0 
eCJ	 	0 0 0 \06Zu| ZU3:5F0G Z Z Z Z Z Zr/   r   c                   .    e Zd ZdZ	 	 	 ddej        dedeeeef         de	d	e
eef         dz  f
 fd
ZdededdfdZ	 	 	 ddej        dee         dz  de	de	de
eef         f
dZe	 ddej        dej        dej        dedej        f
d            Z xZS )r   a  Implements a VIPTR architecture as described in `"A Vision Permutable Extractor for Fast and Efficient
    Scene Text Recognition" <https://arxiv.org/abs/2401.10110>`_.

    Args:
        feature_extractor: the backbone serving as feature extractor
        vocab: vocabulary used for encoding
        input_shape: input shape of the image
        exportable: onnx exportable returns only logits
        cfg: configuration dictionary
    r   FNfeature_extractorr   r   
exportablecfgc                 L   t                                                       || _        || _        || _        d| _        t          |          | _        || _        t          j
                    5  |                     t          j        dg|R                     d         j        d         }d d d            n# 1 swxY w Y   t          | j                  | _        t          j        |t          | j                  dz             | _        |                                 D ]\  }}|                    d          rt)          |t          j                  rQt          j                            |j        d           |j        $t          j                            |j                   d S )	Nr   r&   featuresr#   )r   zfeat_extractor.g{Gz?)r   )super__init__r   rN   rO   
max_lengthr?   
vocab_sizefeat_extractorr7   inference_modezerosshaper   postprocessornnLinearheadnamed_modules
startswith
isinstanceinittrunc_normal_weightbiaszeros_)
r@   rM   r   r   rN   rO   embedding_unitsnm	__class__s
            r-   rS   zVIPTR.__init__b   s    	
$e**/!## 	h 	h"11%+q>O;>O>O2P2PQQR\]cdfgO	h 	h 	h 	h 	h 	h 	h 	h 	h 	h 	h 	h 	h 	h 	h 0djAAAIos4:/BCC	&&(( 	+ 	+DAq||-.. !RY'' +%%ahD%9996%GNN16***	+ 	+s   ,=B55B9<B9path_or_urlkwargsr!   c                 "    t          | |fi | dS )zLoad pretrained parameters onto the model

        Args:
            path_or_url: the path or URL to the model parameters (checkpoint)
            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
        N)r   )r@   rj   rk   s      r-   from_pretrainedzVIPTR.from_pretrained   s"     	t[;;F;;;;;r/   xtargetreturn_model_outputreturn_predsc                     |                      |          \  }}t          j        |                              t          j                  t          j        |          }}|                    |j                  |                    |j                  }} j        r|t          d           	                    |          d         }	|	
                                \  }
}}                     |	                              |
|t           j                  dz             }t          |          }i } j        r||d<   |S |r||d<   ||r[t          j        j        dt          j        dt(          t*          t,          t.          f                  f fd	            } ||          |d
<   |-                     |||t           j                            |d<   |S )N)dtypez&Need to provide labels during trainingrQ   r&   r   out_mapdecoded_featuresr!   c                 .                         |           S )N)rZ   )ru   r@   s    r-   _postprocessz#VIPTR.forward.<locals>._postprocess   s    ))*:;;;r/   predsloss)build_targetr7   
from_numpytolongtensordevicetraining
ValueErrorrV   sizer]   viewr?   r   r   rN   compilerdisablerG   r9   rJ   rH   rK   compute_loss)r@   rn   ro   rp   rq   _gt_seq_lengtseq_lenrQ   BNEr   ru   outrw   s   `                r-   forwardzVIPTR.forward   s     --f55MC*3//222DDelS[F\F\B%%//7::ah+?+?B= 	GV^EFFF&&q))*5--//1a8$$))!QDJ!0CDD+F33 ? 	,CMJ 	.-C	N>\>^#<u| <U3PU:EV@W < < < < < $#< (<(899CL++,<b'3tz??[[CK
r/   r   model_outputr   r   	blank_idxc                    | j         d         }| j         d         t          j        |ft          j                  z  }|                     ddd          }t          j        |d          }t          j        |||||d          }|S )	a>  Compute CTC loss for the model.

        Args:
            model_output: predicted logits of the model
            gt: ground truth tensor
            seq_len: sequence lengths of the ground truth
            blank_idx: index of the blank label

        Returns:
            The loss of the model on the batch
        r   r&   )r   rs   r   r#   r$   T)zero_infinity)rY   r7   onesint32permuter2   log_softmaxctc_loss)	r   r   r   r   	batch_leninput_lengthr   r;   r   s	            r-   r   zVIPTR.compute_loss   s    $ !&q)	#)!,uz	|SXS^/_/_/__%%aA..f"---:
 
 
 r/   )r   FN)NFF)r   )rB   rC   rD   rE   r[   ModulerH   rJ   rI   booldictr   rS   rm   r7   rG   r9   r   rF   r   __classcell__)ri   s   @r-   r   r   V   s       	 	 -9 %)+ +9+ + 3S=)	+
 + #s(^d"+ + + + + +><3 <# <$ < < < < $($)") )<) S	D ) "	)
 ) 
c3h) ) ) )V 
 	 lL  	
 
   \    r/   Tarch
pretrainedbackbone_fnlayerpretrained_backboneignore_keysrk   r!   c                    |o| }t          t          |                    }|                    d|d                   |d<   |                    d|d                   |d<   t           |||d                   |di          }|d         |d<   |d         |d<   t	          |fd|i|}	|rI|d         t          |          d         k    r|nd }
|	                    t          |          d         |
           |	S )Nr   r   )r   rQ   rO   r   )r   )r   r   getr   r   rm   )r   r   r   r   r   r   rk   _cfgrV   model_ignore_keyss              r-   _viptrr      s    .@j. L&''DJJwW66DM **]D4GHHD -'T-5HIII	
 N
 7mF7O /F=.55d5f55E  S '+7m|D7I'7R&R&R{{X\l407\RRRLr/   Fc                 4    t          d| t          dfdddgi|S )a9  VIPTR-Tiny as described in `"A Vision Permutable Extractor for Fast and Efficient Scene Text Recognition"
    <https://arxiv.org/abs/2401.10110>`_.

    >>> import torch
    >>> from doctr.models import viptr_tiny
    >>> model = viptr_tiny(pretrained=False)
    >>> input_tensor = torch.rand((1, 3, 32, 128))
    >>> out = model(input_tensor)

    Args:
        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
        **kwargs: keyword arguments of the VIPTR architecture

    Returns:
        VIPTR: a VIPTR model instance
    r   5r   zhead.weightz	head.bias)r   r   )r   rk   s     r-   r   r      s@    " 	 
 #K0   r/   )TN)F)'collections.abcr   copyr   	itertoolsr   typingr   r7   torch.nnr[   torch.nn.functional
functionalr2   torchvision.models._utilsr   doctr.datasetsr   r	   classificationr   utilsr   r   corer   r   __all__r   r   rH   __annotations__r   r   r   r   r9   r   r   r)   r/   r-   <module>r      st   % $ $ $ $ $ $                                   = = = = = = 2 2 2 2 2 2 2 2 & & & & & & = = = = = = = = = = = = = = = =L
! %$#!_ +d3S#X&'   /Z /Z /Z /Z /Z1 /Z /Z /Zd    bi   N !%$(" "
"" 4&")+," 	"
 " cT!" " " " " "J 4 3 5      r/   