
    RЦimJ                        S r SSKJr  SSKJrJrJrJr  SSKrSSK	J
r
  SSKJrJr  SSKJrJrJrJrJr  SSKJr  SS	KJrJrJr  SS
KJrJr  SSKJrJr  SSK J!r!   " S S\
RD                  5      r#S r$SbS jr%ScS jr&  SdS\\'\RP                  4   S\!S\'S\)S\\'\RP                  4   4
S jjr*SeS jr+SfS jr,\" 0 S\," SSSSS9_S \," S!SSS"S#SS$9_S%\," S&SSS'9_S(\," S)SS"S#SS*9_S+\," 5       _S,\," S-SS"S#S.9_S/\," S0SSS'9_S1\," S2SS"S#SS*9_S3\," S4SS5S6SSS79_S8\," S9SS5S6SS:9_S;\," SSS6S<9_S=\," S>SS5S6SS:9_S?\," \\S@SA9_SB\," \\S@SA9_SC\," \\S@SA9_SD\," \\S@SA9_SE\," SFSGSHSISJSKSLSM9_\," SNSOSISHSJSKSLSP9\," SSISJSKSLSHSQ9SR.E5      r-\SgS\!4SS jj5       r.\SgS\!4ST jj5       r/\SgS\!4SU jj5       r0\SgS\!4SV jj5       r1\SgS\!4SW jj5       r2\SgS\!4SX jj5       r3\SgS\!4SY jj5       r4\SgS\!4SZ jj5       r5\SgS\!4S[ jj5       r6\SgS\!4S\ jj5       r7\SgS\!4S] jj5       r8\SgS\!4S^ jj5       r9\SgS\!4S_ jj5       r:\SgS\!4S` jj5       r;\" \<S3S8S;S;S=S,Sa.5        g)ha  Hybrid Vision Transformer (ViT) in PyTorch

A PyTorch implement of the Hybrid Vision Transformers as described in:

'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale'
    - https://arxiv.org/abs/2010.11929

`How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers`
    - https://arxiv.org/abs/2106.10270

NOTE These hybrid model definitions depend on code in vision_transformer.py.
They were moved here to keep file sizes sane.

Hacked together by / Copyright 2020, Ross Wightman
    )partial)DictTupleTypeUnionN)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)StdConv2dSame	StdConv2dConvNormAct	to_ntupleHybridEmbed   )build_model_with_cfg)generate_default_cfgsregister_modelregister_model_deprecations)	resnet26d	resnet50d)ResNetV2create_resnetv2_stem)VisionTransformerc                     ^  \ rS rSrSSSSSS\R
                  \R                  SS4
S\S\S	\\\	\S
4   4   S\\\	\S
4   4   S\\\	\S
4   4   S\\
\\	\S
4   4   S\\R                     S\\R                     4U 4S jjjrSrU =r$ )ConvStem       @   )   r   r    Nin_chansdepthchannels.kernel_sizestridepadding
norm_layer	act_layerc                 @  > XS.n[         TU ]  5         [        U[        5      (       a0  [	        [        U5       Vs/ s H
  oSU-  -  PM     snS S S2   5      n[        U5      " U5      n[        U5      " U5      nU[        U5      s=:X  a   [        U5      s=:X  a  [        U5      :X  d   e   eUn[        [        U5      5       HQ  nU[        U5      S-
  :H  nU R                  U [        UX<   4XL   X\   Xl   UU(       + U(       + UUS.UD65        X<   nMS     g s  snf )Ndevicedtyper   r   )r#   r$   r%   bias
apply_norm	apply_actr&   r'   )
super__init__
isinstanceinttupleranger   len
add_moduler   )selfr    r!   r"   r#   r$   r%   r&   r'   r*   r+   ddiin_chs	last_conv	__class__s                  d/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/models/vision_transformer_hybrid.pyr1   ConvStem.__init__!   s'    /h$$eE1!Q$.EddKLH&{3E"7+FHs;'7H3x=HHHHHs8}%AS]Q..IOOqcK% (Ny
(='-%#% %  [F & Fs   D )__name__
__module____qualname____firstlineno__nnBatchNorm2dReLUr3   r   r   strr   Moduler1   __static_attributes____classcell__)r=   s   @r>   r   r       s     46782;8:*,..)+'!'! '! CsCx01	'!
 sE#s(O34'! #uS#X./'! 3U38_45'! RYY'! BII'! '!    r   c                  L    U R                  SS 5      U R                  SS 5      S.$ )Nr*   r+   r)   )get)kwargss    r>   _dd_from_kwargsrP   K   s#    jj406::gt;TUUrL   c                 T   UR                  SS5      nU(       a  SOSnU(       a  [        [        SS9O[        [        SS9n[	        U 5      (       a.  [        SU SSUR                  SS	5      S
UUS.[        S0 UD6D6nU$ [        UR                  SS	5      4US
US.[        S0 UD6D6nU$ )zResNet-V2 backbone helperpadding_sameTsamer   g:0yE>)epsr   r    r   F)layersnum_classesglobal_poolr    preact	stem_type
conv_layer)rY   rX   rZ   r@   )rN   r   r
   r   r6   r   rP   r   )rU   rO   rR   rY   rZ   backbones         r>   	_resnetv2r\   O   s    ::nd3L&BI5AD1wy^bGcJ
6{{ 	
ZZ
A.!	
 ''	
$ O (JJz1%
!	

 ''
 OrL   c                 4   0 nU R                  5        GH  u  pEUR                  U5      (       d  M  UR                  US5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS	5      nUR                  S
S5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUS:X  a  SnUR                  S5      nSU;   aU  UR                  SS5      nUR                  SS5      nUR                  n[
        R                  " UR                  S   5      X6'   XSU'   GM     U$ ) Nr   z
patch_emb.zpatch_embed.backbone.z
block.convconvz
block.normbnzpost_transformer_norm.znorm.zpre_norm_mha.0norm1zpre_norm_mha.1attnzpre_norm_ffn.0norm2zpre_norm_ffn.1zmlp.fc1zpre_norm_ffn.4zmlp.fc2z	qkv_proj.zqkv.z	out_proj.zproj.ztransformer.zblocks.zpos_embed.pos_embed.pos_embed	pos_embedr   zclassifier.projz	head.biaszhead.weight)items
startswithreplacesqueezeTtorchzerosshape)
state_dictmodelprefixoutkvbias_ks          r>   _convert_mobilecliprs   j   sy   
C  "||F##IIfb!IIl$;<IIlF+IIlD)II.8II&0II&/II&0II&	2II&	2IIk6*IIk7+IIni0//A		!A!YY0+>F		+];AA++aggaj1CKA1 #2 JrL   Trl   rm   interpolation	antialiasreturnc                 >    SSK Jn  SU ;   a  [        X5      n U" XX#S9$ )Nr   )checkpoint_filter_fnz1image_encoder.model.patch_emb.0.block.conv.weight)rt   ru   )vision_transformerrx   rs   )rl   rm   rt   ru   
_filter_fns        r>   rx   rx      s'     G:jH(;
j}ZZrL   c           	          UR                  SS5      nU=(       d    0 n[        [        4SU0UD6nUR                  SU5        UR                  SS5        [	        [
        U U4[        [        USS9S	.UD6$ )
Nout_indicesr   r[   embed_layer
patch_sizer   getter)r|   feature_cls)pretrained_filter_fnfeature_cfg)popr   r   
setdefaultr   r   rx   dict)variantr[   
embed_args
pretrainedrO   r|   r}   s          r>   !_create_vision_transformer_hybridr      s    **]A.K!rJ+GGJGK
m[1
lA& 2[hG  rL   c                 $    U SSS SSSSSSSS	S
.UE$ )Ni  )r      r   ?bicubicT)      ?r   r   zpatch_embed.backbone.stem.convheadz
apache-2.0)urlrV   
input_size	pool_sizecrop_pctrt   fixed_input_sizemeanstd
first_conv
classifierlicenser@   )r   rO   s     r>   _cfgr      s4    =t6f  rL   z*vit_tiny_r_s16_p8_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npzztimm/zpatch_embed.backbone.conv)r   	hf_hub_idcustom_loadr   z*vit_tiny_r_s16_p8_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz)r     r         ?)r   r   r   r   r   r   z*vit_small_r26_s32_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_light0-wd_0.03-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.03-res_224.npz)r   r   r   z*vit_small_r26_s32_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz)r   r   r   r   r   zvit_base_r26_s32_224.untrainedz'vit_base_r50_s16_384.orig_in21k_ft_in1kzthttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_resnet50_384-9fd3c705.pth)r   r   r   r   z*vit_large_r50_s32_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npzz*vit_large_r50_s32_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npzz"vit_tiny_r_s16_p8_224.augreg_in21kzohttps://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npziSU  r   )r   r   rV   r   r   r   z"vit_small_r26_s32_224.augreg_in21kzshttps://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.03-do_0.0-sd_0.0.npz)r   r   rV   r   r   zvit_base_r50_s16_224.orig_in21k)r   rV   r   z"vit_large_r50_s32_224.augreg_in21kzrhttps://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0.npzz!vit_small_resnet26d_224.untrainedzpatch_embed.backbone.conv1.0)r   r   r   z%vit_small_resnet50d_s16_224.untrainedz vit_base_resnet26d_224.untrainedz vit_base_resnet50d_224.untrainedzvit_base_mci_224.apple_mclip_ltzapple/mobileclip_b_lt_timmzYhttps://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_blt.ptz
apple-amlri   )        r   r   )r   r   r   zpatch_embed.backbone.0.conv)r   r   r   rV   r   r   r   zapple/mobileclip_b_timmzWhttps://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_b.pt)r   r   rV   r   r   r   r   )r   rV   r   r   r   r   )zvit_base_mci_224.apple_mclipz%vit_base_mci_224.apple_mclip2_dfndr2bc           	      h    [        SSS0UD6n[        SSSSS9n[         S	X S.[        U40 UD6D6nU$ )
z2R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 224 x 224.
    rU   r@            r   r~   	embed_dimr!   	num_headsr[   r   )vit_tiny_r_s16_p8_224r\   r   r   r   rO   r[   
model_argsrm   s        r>   r   r   	  W     --f-HcqIJ-i*2iMQR\Mg`fMgiELrL   c           	      h    [        SSS0UD6n[        SSSSS9n[         S	X S.[        U40 UD6D6nU$ )
z2R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 384 x 384.
    rU   r@   r   r   r   r   r   r   )vit_tiny_r_s16_p8_384r   r   s        r>   r   r     r   rL   c           	      b    [        S0 UD6n[        SSSS9n[         SX S.[        U40 UD6D6nU$ )R26+ViT-S/S32 hybrid.
    r   r      r   r!   r   r   )r   r   r   r   )vit_small_r26_s32_224r   r   s        r>   r   r     P     00H2;J-i*2iMQR\Mg`fMgiELrL   c           	      b    [        S0 UD6n[        SSSS9n[         SX S.[        U40 UD6D6nU$ )r   r   r   r   r   r   r   )vit_small_r26_s32_384r   r   s        r>   r   r   *  r   rL   c           	      b    [        S0 UD6n[        SSSS9n[         SX S.[        U40 UD6D6nU$ )zR26+ViT-B/S32 hybrid.
       r   r   r   r   )vit_base_r26_s32_224r   r   s        r>   r   r   5  sP     00H2<J-h)1hLPQ[Lf_eLfhELrL   c           	      b    [        S0 UD6n[        SSSS9n[         SX S.[        U40 UD6D6nU$ )zQR50+ViT-B/S16 hybrid from original paper (https://arxiv.org/abs/2010.11929).
    r   r   r   r   )r      	   )vit_base_r50_s16_224r   r   s        r>   r   r   @  sP     -f-H2<J-h)1hLPQ[Lf_eLfhELrL   c           	      b    [        S0 UD6n[        SSSS9n[         SX S.[        U40 UD6D6nU$ )zR50+ViT-B/16 hybrid from original paper (https://arxiv.org/abs/2010.11929).
ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
r   r   r   r   r   )vit_base_r50_s16_384r   r   s        r>   r   r   K  sP    
 -f-H2<J-h)1hLPQ[Lf_eLfhELrL   c           	      b    [        S0 UD6n[        SSSS9n[         SX S.[        U40 UD6D6nU$ )R50+ViT-L/S32 hybrid.
             r   r   )r   r   r   r   )vit_large_r50_s32_224r   r   s        r>   r   r   W  P     00HB"=J-i*2iMQR\Mg`fMgiELrL   c           	      b    [        S0 UD6n[        SSSS9n[         SX S.[        U40 UD6D6nU$ )r   r   r   r   r   r   r   )vit_large_r50_s32_384r   r   s        r>   r   r   b  r   rL   c           	          [        S
U UR                  SS5      SS/S.[        S
0 UD6D6n[        SSSSS9n[	         SX S	.[        U40 UD6D6nU$ )zKCustom ViT small hybrid w/ ResNet26D stride 32. No pretrained weights.
    r    r   Tr   r   r    features_onlyr|   r   r   r   r!   r   	mlp_ratior   r@   )vit_small_resnet26d_224r   rN   rP   r   r   r   s        r>   r   r   m  s      J*C	
 
#F
#H 1QGJ-!k,4kOST^OibhOikELrL   c           	          [        S	U UR                  SS5      SS/S.[        S	0 UD6D6n[        SSSSS9n[	         S
X S.[        U40 UD6D6nU$ )zUCustom ViT small hybrid w/ ResNet50D 3-stages, stride 16. No pretrained weights.
    r    r   Tr   r   r   r   r   r@   )vit_small_resnet50d_s16_224r   rN   rP   r   r   r   s        r>   r   r   ~  s      J*C	
 
#F
#H 1QGJ-%o08oSWXbSmflSmoELrL   c           	          [        S
U UR                  SS5      SS/S.[        S
0 UD6D6n[        SSSS9n[	         SX S	.[        U40 UD6D6nU$ )zJCustom ViT base hybrid w/ ResNet26D stride 32. No pretrained weights.
    r    r   Tr   r   r   r   r   r   r@   )vit_base_resnet26d_224r   r   s        r>   r   r           J*C	
 
#F
#H 2<J- j+3jNRS]NhagNhjELrL   c           	          [        S
U UR                  SS5      SS/S.[        S
0 UD6D6n[        SSSS9n[	         SX S	.[        U40 UD6D6nU$ )JCustom ViT base hybrid w/ ResNet50D stride 32. No pretrained weights.
    r    r   Tr   r   r   r   r   r   r@   )vit_base_resnet50d_224r   r   s        r>   r   r     r   rL   c                     [        SSSSSUR                  SS5      [        R                  S.[	        S0 UD6D6n[        SSSS	S
9n[         SU[        SS9U S.[        U40 UD6D6nU$ )r   )r   r   r   )r   r   r   r   r    r   )r"   r$   r#   r%   r    r'   r   r   T)r   r!   r   no_embed_classF)proj)r[   r   r   r@   )vit_base_mci_224)r   rN   rE   GELUrP   r   r   r   s        r>   r   r     s      &J*'' 
#F
#H 2DQJ-%-$E:J!%j!;F!;E LrL   )vit_tiny_r_s16_p8_224_in21kvit_small_r26_s32_224_in21kvit_base_r50_s16_224_in21kvit_base_resnet50_224_in21kvit_large_r50_s32_224_in21kvit_base_resnet50_384r   )zimage_encoder.model.)r   T)NF)r   )F)=__doc__	functoolsr   typingr   r   r   r   ri   torch.nnrE   	timm.datar   r	   timm.layersr
   r   r   r   r   _builderr   	_registryr   r   r   resnetr   r   resnetv2r   r   ry   r   
Sequentialr   rP   r\   rs   rH   Tensorboolrx   r   r   default_cfgsr   r   r   r   r   r   r   r   r   r   r   r   r   r   rA   r@   rL   r>   <module>r      s    + +   A U U * Y Y ( 4 1(!r}} (!VV6B '	[ell*+[ [ [ 	[
 
#u||
[ 	 % T&0$ f.	30T& 1$ f.=SVdh3jT& 1$ i3T&  1$ j 3D3B!T&( %df)T&* .t C 300+T&2 1$ i33T&< 1$ i 3D3=T&J )$}C4O]a+cKT&R )$ BCT+;ST&Z &t(%[T&b )$ ACT+;cT&n ("(<Ig*ioT&r ,T"(<Ig.isT&v '"(<Ig)iwT&z '"(<Ig)i{T&@ &t.g|8U(AT&N %)+e|8U% .2|8U	.]T& Tn 9J   9J   9J   9J   8I   8I   8I   9J   9J   ;L    ?P    :K    :K    4E  ( H#G#G"C#D#GF' rL   