
    RЦi                     V   S r SSKJr  SSKJrJrJrJrJr  SSK	r	SSK
Jr  SSKJs  Jr  SSK	Jr  SSKJrJr  SSKJrJrJrJrJrJrJrJr  SSKJrJr  S	S
KJr  S	SK J!r!  S	SK"J#r#  S	SK$J%r%J&r&  S	SK'J(r(J)r)  S/r* " S S\RV                  5      r, " S S\RV                  5      r- " S S\RV                  5      r. " S S\RV                  5      r/ " S S\RV                  5      r0 " S S\RV                  5      r1S\S\\2\24   4S jr3\#S\S\\2\24   S \2S!\24S" j5       r4 " S# S$\RV                  5      r5 " S% S&\RV                  5      r6 " S' S(\RV                  5      r7 " S) S\RV                  5      r8S>S* jr9S+ r:S?S, jr;S@S- jr<\(" \<" S.S/9\<" S.S/9\<" S.S/9\<" 5       \<" 5       \<" 5       \<" S0SS1S29\<" S3SS1S29S4.5      r=\)S?S5\84S6 jj5       r>\)S?S5\84S7 jj5       r?\)S?S5\84S8 jj5       r@\)S?S5\84S9 jj5       rA\)S?S5\84S: jj5       rB\)S?S5\84S; jj5       rC\)S?S5\84S< jj5       rD\)S?S5\84S= jj5       rEg)Aae  DaViT: Dual Attention Vision Transformers

As described in https://arxiv.org/abs/2204.03645

Input size invariant transformer architecture that combines channel and spacial
attention in each block. The attention mechanisms used are linear in complexity.

DaViT model defs and weights adapted from https://github.com/dingmyu/davit, original copyright below

    )partial)ListOptionalTupleTypeUnionN)TensorIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)DropPathcalculate_drop_path_rates	to_2tupletrunc_normal_MlpLayerNorm2dget_norm_layeruse_fused_attn)NormMlpClassifierHeadClassifierHead   )build_model_with_cfg)feature_take_indices)register_notrace_function)
checkpointcheckpoint_seq)generate_default_cfgsregister_modelDaVitc                   R   ^  \ rS rSr    S	S\S\S\4U 4S jjjrS\4S jrSr	U =r
$ )

ConvPosEnc"   dimkactc                    > XES.n[         TU ]  5         [        R                  " UU4USUS-  US.UD6U l        U(       a  [        R
                  " 5       U l        g [        R                  " 5       U l        g )Ndevicedtyper      )kernel_sizestridepaddinggroups)super__init__nnConv2dprojGELUIdentityr%   )selfr#   r$   r%   r(   r)   dd	__class__s          P/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/models/davit.pyr0   ConvPosEnc.__init__#   sl     /II
 F
 
	 !$2779    xc                 N    U R                  U5      nXR                  U5      -   nU$ N)r3   r%   )r6   r<   feats      r9   forwardConvPosEnc.forward9   s$    yy|r;   )r%   r3   )   FNN__name__
__module____qualname____firstlineno__intboolr0   r	   r@   __static_attributes____classcell__r8   s   @r9   r!   r!   "   sH     77 7 	7 7,  r;   r!   c            
       x   ^  \ rS rSrSrSSS\SS4S\S\S	\S
\\R                     4U 4S jjjr
S\4S jrSrU =r$ )Stem?   zSize-agnostic implementation of 2D image to patch embedding,
allowing input size to be adjusted during model forward operation
rB   `      Nin_chsout_chsr,   
norm_layerc                    > XVS.n[         TU ]  5         [        U5      nX0l        Xl        X l        US   S:X  d   e[        R                  " UU4SUSS.UD6U l        U" U40 UD6U l	        g )Nr'   r   rQ      rB   r+   r,   r-   )
r/   r0   r   r,   rR   rS   r1   r2   convnorm)	r6   rR   rS   r,   rT   r(   r)   r7   r8   s	           r9   r0   Stem.__init__D   s     /6"ayA~~II
 
 
	 w-"-	r;   r<   c                 b   UR                   u  p#pEU R                  S   XPR                  S   -  -
  U R                  S   -  nU R                  S   X@R                  S   -  -
  U R                  S   -  n[        R                  " USUSU45      nU R	                  U5      nU R                  U5      nU$ )Nr   r   )shaper,   FpadrX   rY   )r6   r<   BCHWpad_rpad_bs           r9   r@   Stem.forward^   s    WW
aQ!kk!n"44AFQ!kk!n"44AFEE!a5)*IIaLIIaLr;   )rX   rR   rY   rS   r,   )rD   rE   rF   rG   __doc__r   rH   r   r1   Moduler0   r	   r@   rJ   rK   rL   s   @r9   rN   rN   ?   se     *5.. . 	.
 RYY. .4  r;   rN   c            
       p   ^  \ rS rSrS\SS4S\S\S\S\\R                     4U 4S jjjr	S	\
4S
 jrSrU =r$ )
Downsampleh   rB   NrR   rS   r+   rT   c                    > XVS.n[         TU ]  5         Xl        X l        U" U40 UD6U l        US-  S:H  U l        [        R                  " UU4USU R
                  (       a  SOUS-  S.UD6U l        g )Nr'   r*   r   rW   )	r/   r0   rR   rS   rY   even_kr1   r2   rX   )	r6   rR   rS   r+   rT   r(   r)   r7   r8   s	           r9   r0   Downsample.__init__i   s     /v,,	!Ao*II
 $A+*:
 
	r;   r<   c                    UR                   u  p#pEU R                  U5      nU R                  (       aG  U R                  R                  u  pgXuU-  -
  U-  nXdU-  -
  U-  n	[
        R                  " USUSU	45      nU R                  U5      nU$ )Nr   )r\   rY   rl   rX   r+   r]   r^   )
r6   r<   r_   r`   ra   rb   k_hk_wrc   rd   s
             r9   r@   Downsample.forward   s    WW
aIIaL;;yy,,HCs7]c)Es7]c)Ea!UQ./AIIaLr;   )rX   rl   rR   rY   rS   )rD   rE   rF   rG   r   rH   r   r1   rg   r0   r	   r@   rJ   rK   rL   s   @r9   ri   ri   h   sZ    
  !*5

 
 	

 RYY
 
2	 	 	r;   ri   c            	       P   ^  \ rS rSr     S	S\S\S\S\4U 4S jjjrS rSrU =r	$ )
ChannelAttentionV2   r#   	num_headsqkv_biasdynamic_scalec                    > XVS.n[         TU ]  5         X l        X-  U l        X@l        [
        R                  " XS-  4SU0UD6U l        [
        R                  " X40 UD6U l        g )Nr'   rB   bias)	r/   r0   r.   head_dimrw   r1   Linearqkvr3   )	r6   r#   ru   rv   rw   r(   r)   r7   r8   s	           r9   r0   ChannelAttentionV2.__init__   s`     /(*99S'??B?IIc-"-	r;   c                 .   UR                   u  p#nU R                  U5      R                  X#SU R                  X@R                  -  5      R	                  SSSSS5      nUR                  S5      u  pgnU R                  (       a  XcS-  -  nOX`R                  S-  -  nUR                  SS5      U-  n	U	R                  SS	9n	XR                  SS5      -  R                  SS5      nUR                  SS5      R                  X#U5      nU R                  U5      nU$ )
NrB   r*   r   r   rQ         r#   )r\   r|   reshaper.   permuteunbindrw   rz   	transposesoftmaxr3   
r6   r<   r_   Nr`   r|   qr$   vattns
             r9   r@   ChannelAttentionV2.forward   s    ''ahhqk!!!4;;[[8HIQQRSUVXY[\^_`**Q-aIAMMT))A{{2r"Q&|||#KKB''222r:KK1%%aA.IIaLr;   )rw   r.   rz   r3   r|   )   TTNN)
rD   rE   rF   rG   rH   rI   r0   r@   rJ   rK   rL   s   @r9   rs   rs      sN    
 !"&.. . 	.
  . .$ r;   rs   c                   R   ^  \ rS rSr    S	S\S\S\4U 4S jjjrS\4S jrSr	U =r
$ )
ChannelAttention   r#   ru   rv   c                    > XES.n[         TU ]  5         X l        X-  nUS-  U l        [        R
                  " XS-  4SU0UD6U l        [        R
                  " X40 UD6U l        g )Nr'   r   rB   ry   )r/   r0   ru   scaler1   r{   r|   r3   )	r6   r#   ru   rv   r(   r)   r7   rz   r8   s	           r9   r0   ChannelAttention.__init__   sd     /"#%
99S'??B?IIc-"-	r;   r<   c                    UR                   u  p#nU R                  U5      R                  X#SU R                  X@R                  -  5      R	                  SSSSS5      nUR                  S5      u  pgnXpR                  -  nUR                  SS5      U-  n	U	R                  SS9n	XR                  SS5      -  R                  SS5      nUR                  SS5      R                  X#U5      nU R                  U5      nU$ )	NrB   r*   r   r   rQ   r   r   r   )
r\   r|   r   ru   r   r   r   r   r   r3   r   s
             r9   r@   ChannelAttention.forward   s    ''ahhqk!!!4>>1;NOWWXY[\^_abdef**Q-a

N{{2r"Q&|||#KKB''222r:KK1%%aA.IIaLr;   )ru   r3   r|   r   )r   FNNrC   rL   s   @r9   r   r      sH    
 ".. . 	. ."  r;   r   c                      ^  \ rS rSrSSS\R
                  \R                  SSSSS4
S\S\S	\S
\	S\S\
\R                     S\
\R                     S\	S\	S\	4U 4S jjjrS\4S jrSrU =r$ )ChannelBlock         @F        TNr#   ru   	mlp_ratiorv   	drop_path	act_layerrT   ffncpe_actv2c                 >  > XS.n[         TU ]  5         [        SUSU	S.UD6U l        Xl        U" U40 UD6U l        U
(       a  [        O[        nU" U4UUS.UD6U l        US:  a  [        U5      O[        R                  " 5       U l        [        SUSU	S.UD6U l        U R                  (       aY  U" U40 UD6U l        [        SU[!        X-  5      US.UD6U l        US:  a  [        U5      O[        R                  " 5       U l        g S U l        S U l        S U l        g Nr'   rB   )r#   r$   r%   )ru   rv   r   )in_featureshidden_featuresr    )r/   r0   r!   cpe1r   norm1rs   r   r   r   r1   r5   
drop_path1cpe2norm2r   rH   mlp
drop_path2)r6   r#   ru   r   rv   r   r   rT   r   r   r   r(   r)   r7   
attn_layerr8   s                  r9   r0   ChannelBlock.__init__   s    /?3!?B?	*r*
+-'3C


 	
	 2;R(9-R[[]?3!?B?	88#C.2.DJ  #CO 4# 	DH 6?^hy1DODJDH"DOr;   r<   c                 R   UR                   u  p#pEU R                  U5      R                  S5      R                  SS5      nU R	                  U5      nU R                  U5      nXR                  U5      -   nU R                  UR                  SS5      R                  X#XE5      5      nU R                  bt  UR                  S5      R                  SS5      nXR                  U R                  U R                  U5      5      5      -   nUR                  SS5      R                  X#XE5      nU$ )Nr*   r   )r\   r   flattenr   r   r   r   r   viewr   r   r   )r6   r<   r_   r`   ra   rb   curs          r9   r@   ChannelBlock.forward  s    WW
aIIaL  #--a3jjmiin$$IIakk!Q',,Q18988		!&&q!,AOODHHTZZ]$;<<AAq!&&qQ2Ar;   )	r   r   r   r   r   r   r   r   r   )rD   rE   rF   rG   r1   r4   	LayerNormrH   floatrI   r   rg   r0   r	   r@   rJ   rK   rL   s   @r9   r   r      s      ""!)+*,,,!+#+# +# 	+#
 +# +# BII+# RYY+# +# +# +# +#Z  r;   r   r<   window_sizec                     U R                   u  p#pEU R                  X#US   -  US   XAS   -  US   U5      n U R                  SSSSSS5      R                  5       R                  SUS   US   U5      nU$ )z
Args:
    x: (B, H, W, C)
    window_size (int): window size
Returns:
    windows: (num_windows*B, window_size, window_size, C)
r   r   rB   r*   rQ      r   r\   r   r   
contiguous)r<   r   r_   ra   rb   r`   windowss          r9   window_partitionr     s     JA!	q{1~%{1~qN7JKXYN\]^Aii1aAq)446;;BAP[\]P^`abGNr;   r   ra   rb   c                     U R                   S   nU R                  SX!S   -  X1S   -  US   US   U5      nUR                  SSSSSS5      R                  5       R                  SX#U5      nU$ )z
Args:
    windows: (num_windows*B, window_size, window_size, C)
    window_size (int): Window size
    H (int): Height of image
    W (int): Width of image
Returns:
    x: (B, H, W, C)
r   r   r   rB   r*   rQ   r   r   )r   r   ra   rb   r`   r<   s         r9   window_reverser   (  sy     	bAR!n,aq>.A;q>S^_`SacdeA			!Q1a#..055b!BAHr;   c            	          ^  \ rS rSr% Sr\R                  R                  \   \	S'      SS\
S\\
\
4   S\
S\4U 4S jjjrS	\4S
 jrSrU =r$ )WindowAttentioni9  a  Window based multi-head self attention (W-MSA) module with relative position bias.
It supports both of shifted and non-shifted window.
Args:
    dim (int): Number of input channels.
    window_size (tuple[int]): The height and width of the window.
    num_heads (int): Number of attention heads.
    qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True

fused_attnr#   r   ru   rv   c                 4  > XVS.n[         T	U ]  5         Xl        X l        X0l        X-  nUS-  U l        [        5       U l        [        R                  " XS-  4SU0UD6U l
        [        R                  " X40 UD6U l        [        R                  " SS9U l        g )Nr'   r   rB   ry   r   r   )r/   r0   r#   r   ru   r   r   r   r1   r{   r|   r3   Softmaxr   )
r6   r#   r   ru   rv   r(   r)   r7   rz   r8   s
            r9   r0   WindowAttention.__init__D  s     /&"#%
(*99S'??B?IIc-"-	zzb)r;   r<   c                 
   UR                   u  p#nU R                  U5      R                  X#SU R                  X@R                  -  5      R	                  SSSSS5      nUR                  S5      u  pgnU R                  (       a  [        R                  " XgU5      nO7X`R                  -  nXgR                  SS5      -  n	U R                  U	5      n	X-  nUR                  SS5      R                  X#U5      nU R                  U5      nU$ )NrB   r*   r   r   rQ   r   r   )r\   r|   r   ru   r   r   r   r]   scaled_dot_product_attentionr   r   r   r3   )
r6   r<   B_r   r`   r|   r   r$   r   r   s
             r9   r@   WindowAttention.forward[  s    77qhhqk!!"DNNA<OPXXYZ\]_`bcefg**Q-a??..qQ7AJJAB++D<<%DAKK1%%bQ/IIaLr;   )r#   r   ru   r3   r|   r   r   r   )TNN)rD   rE   rF   rG   rf   torchjitFinalrI   __annotations__rH   r   r0   r	   r@   rJ   rK   rL   s   @r9   r   r   9  sq     		%% "** sCx* 	*
 * *.  r;   r   c                      ^  \ rS rSrSrSSSS\R                  \R                  SSSS4
S	\S
\S\S\	S\
S\	S\\R                     S\\R                     S\
S\
4U 4S jjjrS\4S jrSrU =r$ )SpatialBlockin  a  Windows Block.
Args:
    dim (int): Number of input channels.
    num_heads (int): Number of attention heads.
    window_size (int): Window size.
    mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
    qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
    drop_path (float, optional): Stochastic depth rate. Default: 0.0
    act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
    norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
rV   r   Tr   FNr#   ru   r   r   rv   r   r   rT   r   r   c                 |  > XS.n[         TU ]  5         Xl        Xl        X l        [        U5      U l        X@l        [        SUSU
S.UD6U l	        U" U40 UD6U l
        [        UU R                  4UUS.UD6U l        US:  a  [        U5      O[        R                  " 5       U l        [        SUSU
S.UD6U l        U R                  (       a[  U" U40 UD6U l        ['        X-  5      n[)        SUUUS.UD6U l        US:  a  [        U5      O[        R                  " 5       U l        g S U l        S U l        S U l        g r   )r/   r0   r#   r   ru   r   r   r   r!   r   r   r   r   r   r1   r5   r   r   r   rH   r   r   r   )r6   r#   ru   r   r   rv   r   r   rT   r   r   r(   r)   r7   mlp_hidden_dimr8   s                  r9   r0   SpatialBlock.__init__{  s>    /"$[1"?3!?B?	*r*
#
  	

 
	 2;R(9-R[[]?3!?B?	88#C.2.DJ 1N  .# 	DH 6?^hy1DODJDH"DOr;   r<   c           	         UR                   u  p#pEU R                  U5      R                  S5      R                  SS5      nU R	                  U5      nUR                  X$XS5      nS=pxU R                  S   XPR                  S   -  -
  U R                  S   -  n	U R                  S   X@R                  S   -  -
  U R                  S   -  n
[        R                  " USSXyX45      nUR                   u  pp[        XR                  5      nUR                  SU R                  S   U R                  S   -  U5      nU R                  U5      nUR                  SU R                  S   U R                  S   U5      n[        XR                  X5      nUS S 2S U2S U2S S 24   R                  5       nUR                  X$U-  U5      nX`R                  U5      -   nU R                  UR                  SS5      R                  X#XE5      5      nU R                  bt  UR                  S5      R                  SS5      nXR!                  U R                  U R#                  U5      5      5      -   nUR                  SS5      R                  X#XE5      nU$ )Nr*   r   r   r   )r\   r   r   r   r   r   r   r]   r^   r   r   r   r   r   r   r   r   r   )r6   r<   r_   r`   ra   rb   shortcutpad_lpad_trc   rd   _HpWp	x_windowsattn_windowss                   r9   r@   SpatialBlock.forward  sO   WW
a99Q<''*44Q:JJx FF1!!!$q+;+;A+>'>>$BRBRSTBUU!!!$q+;+;A+>'>>$BRBRSTBUUEE!aE%78wwr$Q(8(89	NN2t'7'7':T=M=Ma=P'PRST	 yy+ $((T-=-=a-@$BRBRSTBUWXY<)9)92B a!RaRlO&&(FF1!eQq))IIakk!Q',,Q18988		!&&q!,AOODHHTZZ]$;<<AAq!&&qQ2Ar;   )r   r   r   r#   r   r   r   r   r   r   r   ru   r   )rD   rE   rF   rG   rf   r1   r4   r   rH   r   rI   r   rg   r0   r	   r@   rJ   rK   rL   s   @r9   r   r   n  s    
   !!!!)+*,,,!0#0# 0# 	0#
 0# 0# 0# BII0# RYY0# 0# 0# 0#d% % %r;   r   c            #       <  ^  \ rS rSrSSSSSSSS\\R                  SS	S
S	S	SS4S\S\S\S\S\	\
S4   S\S\S\S\S\	\S4   S\\R                     S\\R                     S\S\S\S\S\4"U 4S jjjr\R                   R"                  S#S j5       rS \4S! jrS"rU =r$ )$
DaVitStagei  r   TspatialchannelrB   rV   r   )r   r   Fr*   NrR   rS   depth
downsample
attn_types.ru   r   r   rv   drop_path_ratesrT   norm_layer_clr   r   down_kernel_sizenamed_blockschannel_attn_v2c                   > UUS.n[         TU ]  5         SU l        U(       a  [        X4XS.UD6U l        O[
        R                  " 5       U l         / n[        U5       H  nSSKJ	n  / n[        U5       Hh  u  nnUS:X  a+  UR                  S[        SUUUU	U
U   UUUUS.	UD645        M7  US	:X  d  M?  UR                  S
[        SUUUU	U
U   UUUUS.	UD645        Mj     U(       a-  UR                  [
        R                  " U" U5      5      5        M  UR                  [
        R                  " U Vs/ s H  nUS   PM
     sn6 5        M     [
        R                  " U6 U l        g s  snf )Nr'   F)r+   rT   r   )OrderedDictr   spatial_block)	r#   ru   r   rv   r   rT   r   r   r   r   channel_block)	r#   ru   r   rv   r   rT   r   r   r   r   r   )r/   r0   grad_checkpointingri   r   r1   r5   rangecollectionsr   	enumerateappendr   r   
Sequentialblocks)r6   rR   rS   r   r   r   ru   r   r   rv   r   rT   r   r   r   r   r   r   r(   r)   r7   stage_blocks	block_idxr   dual_attention_blockattn_idx	attn_typebr8   s                               r9   r0   DaVitStage.__init__  s   , /"' (tFVtqstDO kkmDO	 uI/#% '0'<#)	)(//, C#"+"+!)"1)"<#0 '$/C C 1  )+(//, C#"+"+!)"1)"<#0 '*C C 1  (=6 ##BMM+>R2S$TU##BMMBV3WBVQAaDBV3W$XYC &D mm\2 4Xs   4E(c                     Xl         g r>   )r   )r6   enables     r9   set_grad_checkpointing!DaVitStage.set_grad_checkpointing#  s    "(r;   r<   c                     U R                  U5      nU R                  (       a;  [        R                  R	                  5       (       d  [        U R                  U5      nU$ U R                  U5      nU$ r>   )r   r   r   r   is_scriptingr   r   r6   r<   s     r9   r@   DaVitStage.forward'  sV    OOA""599+A+A+C+Ct{{A.A  AAr;   )r   r   r   T)rD   rE   rF   rG   r   r1   r   rH   rI   r   strr   r   rg   r0   r   r   ignorer   r	   r@   rJ   rK   rL   s   @r9   r   r     sR   
 #*@ !!17*5-/\\!$%!&$))K3K3 K3 	K3
 K3 c3hK3 K3 K3 K3 K3 #5#:.K3 RYYK3  		?K3 K3 K3  "!K3" #K3$ "%K3 K3Z YY) )  r;   r   c            +         ^  \ rS rSrSr                       S/S\S\\S4   S\\S4   S\\S4   S\S	\S
\S\	S\	S\S\\	S4   S\S\S\S\S\S\S\S\S\	S\4*U 4S jjjr
S r\R                  R                  S0S j5       r\R                  R                  S1S j5       r\R                  R                  S\R$                  4S j5       rS2S\S\\	   4S jjr     S3S \R,                  S!\\\\\   4      S"\S#\S$\	S%\S\\\R,                     \\R,                  \\R,                     4   4   4S& jjr   S4S!\\\\   4   S'\S(\4S) jjrS* rS0S+\4S, jjrS- rS.rU =r$ )5r   i0  a  DaViT
    A PyTorch implementation of `DaViT: Dual Attention Vision Transformers`  - https://arxiv.org/abs/2204.03645
    Supports arbitrary input sizes and pyramid feature extraction

Args:
    in_chans (int): Number of input image channels. Default: 3
    num_classes (int): Number of classes for classification head. Default: 1000
    depths (tuple(int)): Number of blocks in each stage. Default: (1, 1, 3, 1)
    embed_dims (tuple(int)): Patch embedding dimension. Default: (96, 192, 384, 768)
    num_heads (tuple(int)): Number of attention heads in different layers. Default: (3, 6, 12, 24)
    window_size (int): Window size. Default: 7
    mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
    qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
    drop_path_rate (float): Stochastic depth rate. Default: 0.1
    norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
in_chansdepths.
embed_dimsru   r   r   rv   rT   r   norm_epsr   r   r   r   r   r   	drop_ratedrop_path_ratenum_classesglobal_poolhead_norm_firstc                   > [         T U ]  5         UUS.n[        U5      nU[        U5      s=:X  a  [        U5      :X  d   e   e[        [	        U5      U
S9n[        [	        U	5      U
S9n	UU l        Xl        US   =U l        U l        UU l	        SU l
        / U l        [        XS   4SU0UD6U l        US   n[        UUSS9n/ n[        U5       Hl  nUU   n[!        UU4UU   US:  UUU   UUUUU   UU	UUUUUS	.UD6nUnUR#                  U5        U =R                  [%        US
US
-   -  SU 3S9/-  sl        Mn     [&        R(                  " U6 U l        U(       aB  U" U R                  40 UD6U l        [/        U R                  U4UU R                  S.UD6U l        OD[&        R2                  " 5       U l        [5        U R                  U4UU R                  US.UD6U l        U R7                  U R8                  5        g )Nr'   )epsr   Fr   rT   T)	stagewise)r   r   r   ru   r   r   rv   r   rT   r   r   r   r   r   r   r*   zstages.)num_chs	reductionmodule)	pool_typer  )r  r  rT   )r/   r0   lenr   r   r  r  num_featureshead_hidden_sizer  r   feature_inforN   stemr   r   r   r   dictr1   r   stagesnorm_prer   headr5   r   apply_init_weights)!r6   r  r	  r
  ru   r   r   rv   rT   r   r  r   r   r   r   r   r   r  r  r  r  r  r(   r)   r7   
num_stagesrR   dprr  irS   stager8   s!                                   r9   r0   DaVit.__init__B  sF   4 	/_
S^:s6{:::::^J7XF
} =8L& 4>rNBD1""'a=NZN2N	A'$Oz"A mG Qiq5%#A,'#! #A%+!1 /)#$ %E( FMM% $w!ac(U\]^\_S`"a!bb1 #4 mmV,
 &t'8'8?B?DM&!! &..	
 DI KKMDM-!! &..% DI 	

4%%&r;   c                    [        U[        R                  5      (       am  [        UR                  SS9  [        U[        R                  5      (       a9  UR
                  b+  [        R                  R                  UR
                  S5        g g g g )Ng{Gz?)stdr   )
isinstancer1   r{   r   weightry   init	constant_)r6   ms     r9   r"  DaVit._init_weights  s`    a##!((,!RYY''AFF,>!!!&&!, -?' $r;   c                 0    [        SU(       a  SS9$ / SQS9$ )Nz^stemz^stages\.(\d+)))z^stages\.(\d+).downsample)r   )z^stages\.(\d+)\.blocks\.(\d+)N)z	^norm_pre)i )r  r   )r  )r6   coarses     r9   group_matcherDaVit.group_matcher  s'    (.$
 	
5
 	
r;   c                 T    Xl         U R                   H  nUR                  US9  M     g )N)r   )r   r  r   )r6   r   r&  s      r9   r   DaVit.set_grad_checkpointing  s'    "([[E(((7 !r;   returnc                 .    U R                   R                  $ r>   )r   fc)r6   s    r9   get_classifierDaVit.get_classifier  s    yy||r;   c                 F    Xl         U R                  R                  X5        g r>   )r  r   reset)r6   r  r  s      r9   reset_classifierDaVit.reset_classifier  s    &		1r;   r<   indicesrY   
stop_early
output_fmtintermediates_onlyc                    US;   d   S5       e/ n[        [        U R                  5      U5      u  pU R                  U5      n[        U R                  5      S-
  n
[        R
                  R                  5       (       d  U(       d  U R                  nOU R                  SU	S-    n[        U5       H  u  pU R                  (       a/  [        R
                  R                  5       (       d  [        X5      nOU" U5      nX;   d  MT  U(       a  X:X  a  U R                  U5      nOUnUR                  U5        M     U(       a  U$ WU
:X  a  U R                  U5      nX4$ )a  Forward features that returns intermediates.

Args:
    x: Input image tensor
    indices: Take last n blocks if int, all if None, select matching indices if sequence
    norm: Apply norm layer to compatible intermediates
    stop_early: Stop iterating over blocks when last desired intermediate hit
    output_fmt: Shape of intermediate feature outputs
    intermediates_only: Only return intermediate features
Returns:

)NCHWzOutput shape must be NCHW.r   N)r   r  r  r  r   r   r  r   r   r   r  r   )r6   r<   r?  rY   r@  rA  rB  intermediatestake_indices	max_indexlast_idxr  feat_idxr&  x_inters                  r9   forward_intermediatesDaVit.forward_intermediates  s!   * Y&D(DD&"6s4;;7G"Q IIaLt{{#a'99!!##:[[F[[)a-0F(0OH&&uyy/E/E/G/Gu(!H'H0"mmA.GG$$W-  1   xa Ar;   
prune_norm
prune_headc                     [        [        U R                  5      U5      u  pEU R                  SUS-    U l        U(       a  [        R                  " 5       U l        U(       a  U R                  SS5        U$ )z?Prune layers not required for specified intermediates.
        Nr   r    )r   r  r  r1   r5   r  r=  )r6   r?  rM  rN  rF  rG  s         r9   prune_intermediate_layersDaVit.prune_intermediate_layers  s[     #7s4;;7G"Qkk.9q=1KKMDM!!!R(r;   c                    U R                  U5      nU R                  (       a:  [        R                  R	                  5       (       d  [        U R                  U5      nOU R                  U5      nU R                  U5      nU$ r>   )r  r   r   r   r  r   r  r  r  s     r9   forward_featuresDaVit.forward_features  sZ    IIaL""599+A+A+C+Ct{{A.AAAMM!r;   
pre_logitsc                 R    U(       a  U R                  USS9$ U R                  U5      $ )NT)rV  )r   )r6   r<   rV  s      r9   forward_headDaVit.forward_head  s$    0:tyyty,L		!Lr;   c                 J    U R                  U5      nU R                  U5      nU$ r>   )rT  rX  r  s     r9   r@   DaVit.forward  s'    !!!$a r;   )r  r  r   r   r  r  r  r  r  r  r  )rB   r   r   rB   r   rP           rB            rV   rQ   Tlayernorm2d	layernormgh㈵>r   TFr*   FFr   r     avgFNNFr  r>   )NFFrD  F)r   FT) rD   rE   rF   rG   rf   rH   r   r   rI   r  r0   r"  r   r   r  r2  r   r1   rg   r9  r   r=  r	   r   r   rK  rQ  rT  rX  r@   rJ   rK   rL   s   @r9   r   r   0  s   & &2*=)7  !+!,"*@!$%$)!&!$&#$$)1^'^' #s(O^' c3h	^'
 S#X^' ^' ^' ^' ^' ^' ^' c3h^' ^' ^' "^'  "!^'" #^'$ %^'& "'^'( )^'* +^', "-^' ^'@- YY
 
 YY8 8
 YY		  2C 2hsm 2 8<$$',3 ||3  eCcN343  	3 
 3  3  !%3  
tELL!5tELL7I)I#JJ	K3 n ./$#	3S	>*  	 M$ M r;   c                 Z   SS K n0 nU R                  5        GH  u  pVUR                  U5      (       a  UR                  US5      nOM1  UR	                  SSU5      nUR	                  SSU5      nUR                  SS5      nUR                  S	S
5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nXdU'   GM     U$ )Nr   rP  zconvs.([0-9]+)stages.\1.downsamplezblocks.([0-9]+)stages.\1.blocksdownsample.projdownsample.convstages.0.downsampler  zwindow_attn.norm.znorm1.zwindow_attn.fn.zattn.zchannel_attn.norm.zchannel_attn.fn.z	ffn.norm.znorm2.zffn.fn.net.zmlp.zconv1.fn.dwz	cpe1.projzconv2.fn.dwz	cpe2.proj)reitems
startswithreplacesub)
state_dictmodelprefixrp  out_dictr$   r   s          r9   _convert_florence2ry    s    H  "<<		&"%AFF$&=qAFF%':A>II'):;II+V4 II)84II'1II*H5II('2IIk8,IImV,IIm[1IIm[1' #* Or;   c                    SU ;   a  U $ SU ;   a  U S   n SU ;   a  [        X5      $ SSKn0 nU R                  5        H  u  pEUR                  SSU5      nUR                  SS	U5      nUR	                  S
S5      nUR	                  SS5      nUR	                  SS5      nUR	                  SS5      nUR	                  SS5      nUR	                  SS5      nXSU'   M     U$ )zRemap MSFT checkpoints -> timm zhead.fc.weightru  z vision_tower.convs.0.proj.weightr   Nzpatch_embeds.([0-9]+)rk  zmain_blocks.([0-9]+)rl  rm  rn  ro  r  zhead.zhead.fc.znorms.z
head.norm.zcpe.0r   zcpe.1r   )ry  rp  rq  rt  rs  )ru  rv  rp  rx  r$   r   s         r9   checkpoint_filter_fnr{  4  s    :%z!-
)Z7!*44H  "FF+-DaHFF*,?CII'):;II+V4IIgz*IIh-IIgv&IIgv& # Or;   c           	         [        S [        UR                  SS5      5       5       5      nUR                  SU5      nUR                  SS5      nU R	                  S5      (       a  Sn[        [        U U4[        [        SUS	9US
.UD6nU$ )Nc              3   *   #    U  H	  u  pUv   M     g 7fr>   r   ).0r%  r   s      r9   	<genexpr> _create_davit.<locals>.<genexpr>O  s     \.[da.[s   r	  r\  out_indicespretrained_strictT_flF)flatten_sequentialr  )pretrained_filter_fnfeature_cfgr  )	tupler   getpopendswithr   r   r{  r  )variant
pretrainedkwargsdefault_out_indicesr  strictrv  s          r9   _create_davitr  N  s    \i

8\8Z.[\\**],?@KZZ+T2F  2DkJ  E Lr;   c                 2    U SSSSS[         [        SSSS	.UE$ )
Nrg  )rB      r  )rV   rV   gffffff?bicubicz	stem.convzhead.fcz
apache-2.0)urlr  
input_size	pool_sizecrop_pctinterpolationmeanr)  
first_conv
classifierlicenser
   )r  r  s     r9   _cfgr  c  s3    =v9%.B!  r;   ztimm/)	hf_hub_idzmicrosoft/Florence-2-base)rB   r`  r`  )r  r  r  zmicrosoft/Florence-2-large)zdavit_tiny.msft_in1kzdavit_small.msft_in1kzdavit_base.msft_in1kdavit_large
davit_hugedavit_giantzdavit_base_fl.msft_florence2zdavit_huge_fl.msft_florence2r6  c           	      F    [        SSSS9n[        SSU 0[        U40 UD6D6$ )Nr\  r]  ra  r	  r
  ru   r  )
davit_tinyr  r  r  r  
model_argss      r9   r  r    s0    \6IUcdJ[*[Z@ZSY@Z[[r;   c           	      F    [        SSSS9n[        SSU 0[        U40 UD6D6$ )Nr   r   	   r   r]  ra  r  r  )davit_smallr  r  s      r9   r  r    s0    \6IUcdJ\:\jA[TZA[\\r;   c           	      F    [        SSSS9n[        SSU 0[        U40 UD6D6$ )Nr              rQ   r          r  r  )
davit_baser  r  s      r9   r  r    s0    \6KWefJ[*[Z@ZSY@Z[[r;   c           	      F    [        SSSS9n[        SSU 0[        U40 UD6D6$ )Nr  )r^  r_  r`     )rb  rc  rd  0   r  r  )r  r  r  s      r9   r  r    s0    \6KWfgJ\:\jA[TZA[\\r;   c           	      F    [        SSSS9n[        SSU 0[        U40 UD6D6$ )Nr  r  r  r  i   r   r  r  @   r  r  )r  r  r  s      r9   r  r    s0    \6LXghJ[*[Z@ZSY@Z[[r;   c           	      F    [        SSSS9n[        SSU 0[        U40 UD6D6$ )N)r   r   rc  rB   )r_  r`  r  i   )rc  rd  r  rP   r  r  )r  r  r  s      r9   r  r    s0    ]7MYijJ\:\jA[TZA[\\r;   c           
      N    [        SSSSSSSS9n[        S	SU 0[        U40 UD6D6$ )
Nr  r  r  rc  rB   Tr	  r
  ru   r   r   r   r   r  )davit_base_flr  r  s      r9   r  r    s=    (=DtJ ^Z^4
C]V\C]^^r;   c           
      N    [        SSSSSSSS9n[        S	SU 0[        U40 UD6D6$ )
Nr  r  r  rc  rB   Tr  r  )davit_huge_flr  r  s      r9   r  r    s?     (>/DtJ ^Z^4
C]V\C]^^r;   )zvision_tower.ri  )rP  )Frf   	functoolsr   typingr   r   r   r   r   r   torch.nnr1   torch.nn.functional
functionalr]   r	   	timm.datar   r   timm.layersr   r   r   r   r   r   r   r   r   r   _builderr   	_featuresr   _features_fxr   _manipulater   r   	_registryr   r   __all__rg   r!   rN   ri   rs   r   r   rH   r   r   r   r   r   r   ry  r{  r  r  default_cfgsr  r  r  r  r  r  r  r  r   r;   r9   <module>r     s  	  5 5      A H  H  H = * + 3 3 <) :&299 &R# #L$ $Pryy D?299 ?D U38_  F sCx S S   2bii 2jd299 dNX XveBII eP84*	 % ! 6&6$(--%1 %).-%1& ( \e \ \
 ]u ] ]
 \e \ \
 ]u ] ]
 \e \ \
 ]u ] ] _ _ _ _ _ _r;   