
    RЦiI                        S r SSKrSSKJr  SSKJrJrJrJrJ	r	J
r
JrJrJr  SSKrSSKJr  SSKJs  Jr  SSKJrJr  SSKJrJrJrJrJrJrJrJrJ r J!r!J"r"J#r#  SSK$J%r%  SS	K&J'r'  SS
K(J)r)  SSK*J+r+J,r,  S/r-S5S\\.\.4   S\R^                  4S jjr0 " S S\Rb                  5      r2 " S S\Rb                  5      r3 " S S\Rb                  5      r4 " S S\Rb                  5      r5S6S\6S\\6\4   4S jjr7\+" \7" SS9\7" SSSS9\7" SSS9\7" SS9\7" SSSS9\7" SS SS9\7" SSS9\7" S\\S!9\7" S\\S!9\7" SS\\S"9\7" SS#\\S$9\7" SS#\\S$9\7" SS\\S"9S%.5      r8S7S&\\6\R^                  4   S'\Rb                  S(\6S)\9S\\6\R^                  4   4
S* jjr:S8S+\6S,\9S\54S- jjr;\,S8S,\9S\54S. jj5       r<\,S8S,\9S\54S/ jj5       r=\,S8S,\9S\54S0 jj5       r>\,S8S,\9S\54S1 jj5       r?\,S8S,\9S\54S2 jj5       r@\,S8S,\9S\54S3 jj5       rA\,S8S,\9S\54S4 jj5       rBg)9a  BEiT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)

Model from official source: https://github.com/microsoft/unilm/tree/master/beit

@inproceedings{beit,
title={{BEiT}: {BERT} Pre-Training of Image Transformers},
author={Hangbo Bao and Li Dong and Songhao Piao and Furu Wei},
booktitle={International Conference on Learning Representations},
year={2022},
url={https://openreview.net/forum?id=p-BhZSz59o4}
}

BEiT-v2 from https://github.com/microsoft/unilm/tree/master/beit2

@article{beitv2,
title={{BEiT v2}: Masked Image Modeling with Vector-Quantized Visual Tokenizers},
author={Zhiliang Peng and Li Dong and Hangbo Bao and Qixiang Ye and Furu Wei},
year={2022},
eprint={2208.06366},
archivePrefix={arXiv},
primaryClass={cs.CV}
}

At this point only the 1k fine-tuned classification weights and model configs have been added,
see original source above for pre-training models and procedure.

Modifications by / Copyright 2021 Ross Wightman, original copyrights below
    N)partial)	AnyCallableDictListOptionalSetTupleTypeUnion)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)
PatchEmbedMlpSwiGLU	LayerNormDropPathcalculate_drop_path_ratestrunc_normal_use_fused_attnresample_patch_embedresample_abs_pos_embedresize_rel_pos_bias_tablendgrid   )build_model_with_cfg)feature_take_indices)
checkpoint)generate_default_cfgsregister_modelBeitwindow_sizereturnc                 (   SU S   -  S-
  SU S   -  S-
  -  S-   nU S   U S   -  n[         R                  " [        [         R                  " U S   U[         R                  S9[         R                  " U S   U[         R                  S95      5      n[         R
                  " US5      nUSS2SS2S4   USS2SSS24   -
  nUR                  SSS5      R                  5       nUSS2SS2S4==   U S   S-
  -  ss'   USS2SS2S4==   U S   S-
  -  ss'   USS2SS2S4==   SU S   -  S-
  -  ss'   [         R                  " US-   4S-  XR                  S9nUR                  S5      USS2SS24'   US-
  USSS24'   US-
  USS2S4'   US-
  US	'   U$ )
a  Generate relative position index for window-based attention.

Creates a lookup table for relative position indices between all pairs of positions
within a window, including special handling for cls token interactions.

Args:
    window_size: Height and width of the attention window.

Returns:
    Relative position index tensor of shape (window_area+1, window_area+1)
    where +1 accounts for the cls token.
   r   r      devicedtypeN)sizer(   r)   )r   r   )torchstackr   arangelongflattenpermute
contiguouszerosr)   sum)r"   r(   num_relative_distancewindow_areacoordscoords_flattenrelative_coordsrelative_position_indexs           O/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/models/beit.pygen_relative_position_indexr<   I   s    Q/!3KN8JQ8NORSS a.;q>1K[[[^F%**E[^F%**E F ]]61-N$Q4Z0>!T1*3MMO%--aA6AACOAq!GA 22Aq!GA 22Aq!GKN 2Q 66#kka/AA/Ef\q\qr&5&9&9"&=ABF#%:Q%>AqrE"%:Q%>ABE"$9A$=D!""    c                   R  ^  \ rS rSr% Sr\R                  R                  \   \	S'            SS\
S\
S\S\S\S	\S
\\\
\
4      S\\
   4U 4S jjjrS\R                  4S jrSS\R                  S\\R                     S\R                  4S jjrSS jrSS jrSS jrSrU =r$ )	Attentionl   zMulti-head attention module with optional relative position bias.

Implements multi-head self-attention with support for relative position bias
and fused attention operations. Can use either standard or custom head dimensions.

fused_attndim	num_headsqkv_biasqkv_bias_separate	attn_drop	proj_dropr"   attn_head_dimc           	      t  > XS.n[         TU ]  5         X l        X-  nUb  UnXR                  -  nUS-  U l        [	        5       U l        X@l        [        R                  " XS-  4SS0UD6U l	        U(       a  [        R                  " [        R                  " U40 UD65      U l        U R                  S[        R                  " U40 UD6SS9  [        R                  " [        R                  " U40 UD65      U l        OSU l        SU l        SU l        U(       a  Xpl        S	US
   -  S-
  S	US   -  S-
  -  S-   U l        US
   US   -  n[        R                  " [        R                  " U R$                  U40 UD65      U l        U R                  S[        R                  " US-   US-   4U	[        R(                  S9SS9  OSU l        SU l        SU l        [        R,                  " U5      U l        [        R                  " X40 UD6U l        [        R,                  " U5      U l        U R5                  5         g)a  Initialize attention module.

Args:
    dim: Input feature dimension.
    num_heads: Number of attention heads.
    qkv_bias: If True, add learnable bias to query, key, value projections.
    qkv_bias_separate: If True, use separate bias for q, k, v projections.
    attn_drop: Dropout rate for attention weights.
    proj_drop: Dropout rate for output projection.
    window_size: Window size for relative position bias. If None, no relative position bias.
    attn_head_dim: Dimension per attention head. If None, uses dim // num_heads.
r'   Ng      r&   biasFk_bias
persistentr%   r   r   r:   )super__init__rC   scaler   rA   rE   nnLinearqkv	Parameterr,   emptyq_biasregister_bufferv_biasrK   r"   r5   relative_position_bias_tabler/   r:   DropoutrF   projrG   reset_parameters)selfrB   rC   rD   rE   rF   rG   r"   rH   r(   r)   ddhead_dimall_head_dimr6   	__class__s                  r;   rO   Attention.__init__t   s    2 /"#$$H..0%
(*!299S"2EE"E,,u{{<'F2'FGDK  5;;|+Jr+JW\ ],,u{{<'F2'FGDKDKDKDK**+k!n*<q*@QUVEWZ[E[)\_`)`D&%a.;q>9K02D66	HRH1JD-  )[1_kAo>vUZU_U_`  !   $D04D-+/D(I.IIl626	I. 	r=   r#   c                 X   U R                   U R                  R                  S5         R                  U R                  S   U R                  S   -  S-   U R                  S   U R                  S   -  S-   S5      nUR	                  SSS5      R                  5       nUR                  S5      $ )zGet relative position bias for the attention window.

Returns:
    Relative position bias tensor of shape (1, num_heads, window_area+1, window_area+1).
r+   r   r   r%   )rY   r:   viewr"   r1   r2   	unsqueezer]   relative_position_biass     r;   _get_rel_pos_biasAttention._get_rel_pos_bias   s     "&!B!B((--b1"3374Q$"2"21"559Q$"2"21"55924? 	 "8!?!?1a!H!S!S!U%//22r=   xshared_rel_pos_biasc                 R   UR                   u  p4nU R                  c  U R                  U5      nO[        R                  " U R                  U R
                  U R                  45      nU R                  (       a  U R                  U5      nXg-  nO)[        R                  " XR                  R                  US9nUR                  X4SU R                  S5      R                  SSSSS5      nUR                  S5      u  pn
U R                  (       ak  SnU R                   b  U R#                  5       nUb  X-   nOUb  Un[        R$                  " XU
UU R&                  (       a  U R(                  R*                  OS	S
9nOlXR,                  -  nXR/                  SS5      -  nU R                   b  XR#                  5       -   nUb  X-   nUR1                  SS9nU R)                  U5      nX-  nUR/                  SS5      R                  X4U5      nU R3                  U5      nU R5                  U5      nU$ )zForward pass of attention module.

Args:
    x: Input tensor of shape (batch_size, num_tokens, dim).
    shared_rel_pos_bias: Optional shared relative position bias from parent module.

Returns:
    Output tensor of shape (batch_size, num_tokens, dim).
N)weightrJ   r&   r+   r%   r   r              )	attn_mask	dropout_prB   )shaperV   rS   r,   catrK   rX   rE   Flinearrm   reshaperC   r1   unbindrA   rY   rh   scaled_dot_product_attentiontrainingrF   prP   	transposesoftmaxr[   rG   )r]   rj   rk   BNCrS   rD   qkvrel_pos_biasattns                r;   forwardAttention.forward   s    ''a;;((1+Cyy$++t{{DKK!HIH%%hhqkhhqxHkk!4>>26>>q!Q1M**Q-a??L00<#557&2#/#EL$02..a&.2mm$..**A JJAB++D00<4466".1<<B<'D>>$'DAKK1%%aA.IIaLNN1r=   c                 N   U R                   bR  [        R                  R                  U R                   5        [        R                  R                  U R                  5        U R
                  b)  [        R                  R                  U R
                  5        U R                  5         gz"Initialize parameters and buffers.N)rV   rQ   initzeros_rX   rY   _init_buffersr]   s    r;   r\   Attention.reset_parameters   sb    ;;"GGNN4;;'GGNN4;;',,8GGNN4<<=r=   c                     U R                   b  U R                   R                  5         U R                  bB  U R                  R                  [	        U R
                  U R                  R                  S95        gg).Compute and fill non-persistent buffer values.Nr(   )rK   zero_r:   copy_r<   r"   r(   r   s    r;   r   Attention._init_buffers  s]    ;;"KK''3((..+D,<,<TEaEaEhEhi 4r=   c                 $    U R                  5         gz"Initialize non-persistent buffers.Nr   r   s    r;   init_non_persistent_buffers%Attention.init_non_persistent_buffers      r=   )rF   rA   rK   rC   r5   r[   rG   rV   rS   rE   rY   r:   rP   rX   r"   )	   FFro   ro   NNNNNr#   N)__name__
__module____qualname____firstlineno____doc__r,   jitFinalbool__annotations__intfloatr   r
   rO   Tensorrh   r   r\   r   r   __static_attributes____classcell__ra   s   @r;   r?   r?   l   s   
 		%%
 "&+!!59+/C C  C  	C 
  $C  C  C  "%S/2C  $C=C  C J35<< 36 6HU\\<R 6^c^j^j 6p r=   r?   c                   R  ^  \ rS rSrSrSSSSSSSS\R                  \SSSS4S\S\S	\	S
\
S\	S\	S\
S\
S\
S\\
   S\\R                     S\\R                     S\\\\4      S\\   4U 4S jjjrSS jrSS\R$                  S\\R$                     S\R$                  4S jjrSrU =r$ )Blocki  zTransformer block with attention and MLP.

Standard transformer block consisting of multi-head self-attention and MLP
with residual connections and layer normalization. Supports layer scale and
stochastic depth regularization.
F      @ro   NrB   rC   rD   	mlp_ratio	scale_mlp
swiglu_mlprG   rF   	drop_pathinit_values	act_layer
norm_layerr"   rH   c           
        > UUS.n[         TU ]  5         U" U40 UD6U l        [        U4UUUUUUS.UD6U l        U	S:  a  [        U	5      O[        R                  " 5       U l        U" U40 UD6U l	        U(       a*  [        SU[        X-  5      U(       a  UOSUS.UD6U l        O*[        SU[        X-  5      UU(       a  UOSUS.UD6U l        U	S:  a  [        U	5      O[        R                  " 5       U l        Xl        U
(       aa  [        R                   " ["        R$                  " U40 UD65      U l        [        R                   " ["        R$                  " U40 UD65      U l        OSu  U l        U l        U R+                  5         g)	a  Initialize transformer block.

Args:
    dim: Input feature dimension.
    num_heads: Number of attention heads.
    qkv_bias: If True, add learnable bias to query, key, value projections.
    mlp_ratio: Ratio of MLP hidden dimension to input dimension.
    scale_mlp: If True, apply layer normalization in MLP.
    swiglu_mlp: If True, use SwiGLU activation in MLP.
    proj_drop: Dropout rate for projections.
    attn_drop: Dropout rate for attention.
    drop_path: Drop path rate for stochastic depth.
    init_values: Initial values for layer scale. If None, no layer scale.
    act_layer: Activation function class.
    norm_layer: Normalization layer class.
    window_size: Window size for relative position bias in attention.
    attn_head_dim: Dimension per attention head.
r'   )rC   rD   rF   rG   r"   rH   ro   N)in_featureshidden_featuresr   drop)r   r   r   r   r   NN )rN   rO   norm1r?   r   r   rQ   Identity
drop_path1norm2r   r   mlpr   
drop_path2r   rT   r,   rU   gamma_1gamma_2r\   )r]   rB   rC   rD   r   r   r   rG   rF   r   r   r   r   r"   rH   r(   r)   r^   ra   s                     r;   rO   Block.__init__  sr   J /*r*
	
#'	
 	
	 2;R(9-R[[]*r*
  #CO 4)2:	
 DH   #CO 4#)2: DH 2;R(9-R[[]&<<C(>2(>?DL<<C(>2(>?DL)3&DL$, 	r=   r#   c                     U R                   bi  [        R                  R                  U R                   U R                  5        [        R                  R                  U R
                  U R                  5        gg)zInitialize parameters.N)r   rQ   r   	constant_r   r   r   s    r;   r\   Block.reset_parametersp  sM    <<#GGdllD,<,<=GGdllD,<,<= $r=   rj   rk   c           	         U R                   cc  XR                  U R                  U R                  U5      US95      -   nXR	                  U R                  U R                  U5      5      5      -   nU$ XR                  U R                   U R                  U R                  U5      US9-  5      -   nXR	                  U R                  U R                  U R                  U5      5      -  5      -   nU$ )zForward pass of transformer block.

Args:
    x: Input tensor of shape (batch_size, num_tokens, dim).
    shared_rel_pos_bias: Optional shared relative position bias.

Returns:
    Output tensor of shape (batch_size, num_tokens, dim).
rk   )r   r   r   r   r   r   r   r   )r]   rj   rk   s      r;   r   Block.forwardv  s     <<OODIIdjjmQdI$effAOODHHTZZ]$;<<A  OODLL499TZZ]`s93t$tuuAOODLL488DJJqM3J$JKKAr=   )	r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   rQ   GELUr   r   r   r   r   r   Moduler
   rO   r\   r,   r   r   r   r   r   s   @r;   r   r     s:    #!#$!!!+/)+*359+/#Q Q  Q  	Q 
 Q  Q  Q  Q  Q  Q  "%Q  BIIQ  RYYQ  "%S/2Q  $C=Q  Q f> HU\\<R ^c^j^j  r=   r   c                      ^  \ rS rSrSrSS\\\4   S\4U 4S jjjrSS jrSS jr	SS	 jr
S\R                  4S
 jrSrU =r$ )RelativePositionBiasi  zRelative position bias module for window-based attention.

Generates learnable relative position biases for all pairs of positions
within a window, including special handling for cls token.
r"   rC   c           	        > X4S.n[         TU ]  5         Xl        US   US   -  U l        SUS   -  S-
  SUS   -  S-
  -  S-   n[        R
                  " [        R                  " Xb40 UD65      U l        U R                  S[        R                  " U R                  S-   U R                  S-   4U[        R                  S9SS9  U R                  5         g	)
zInitialize relative position bias module.

Args:
    window_size: Height and width of the attention window.
    num_heads: Number of attention heads.
r'   r   r   r%   r&   r:   FrL   N)rN   rO   r"   r6   rQ   rT   r,   rU   rY   rW   r/   r\   )r]   r"   rC   r(   r)   r^   r5   ra   s          r;   rO   RelativePositionBias.__init__  s     /&&q>KN:!"[^!3a!7AA<NQR<R SVW W,.LLEZ9lik9l,m)%KK))A-t/?/?!/CDV[`[e[ef 	 	
 	r=   r#   c                 v    [         R                  R                  U R                  5        U R	                  5         gr   )rQ   r   r   rY   r   r   s    r;   r\   %RelativePositionBias.reset_parameters  s$    
t889r=   c                     U R                   R                  [        U R                  U R                   R                  S95        g)r   r   N)r:   r   r<   r"   r(   r   s    r;   r   "RelativePositionBias._init_buffers  s2    $$**'(8(8A]A]AdAde	
r=   c                 $    U R                  5         gr   r   r   s    r;   r   0RelativePositionBias.init_non_persistent_buffers  r   r=   c                     U R                   U R                  R                  S5         R                  U R                  S-   U R                  S-   S5      nUR	                  SSS5      R                  5       $ )zGenerate relative position bias.

Returns:
    Relative position bias tensor of shape (num_heads, window_area+1, window_area+1).
r+   r   r%   r   )rY   r:   rd   r6   r1   r2   rf   s     r;   r   RelativePositionBias.forward  sm     "&!B!B4C_C_CdCdegCh!i!n!nq $"2"2Q"6"<%--aA6AACCr=   )rY   r6   r"   r   r   )r   r   r   r   r   r
   r   rO   r\   r   r   r,   r   r   r   r   r   s   @r;   r   r     sO     E#s(O      ,

D D Dr=   r   c            /       P  ^  \ rS rSrSrSSSSSSS	S	S
SSSSSSSS\SS
SSSSS4S\\\\\4   4   S\\\\\4   4   S\S\S\	S\S\S\S\
S\S\
S\
S\S\S\S\S \S!\\R                     S"\\   S#\
S$\
S%\
S&\4.U 4S' jjjrSES(\
S)S4S* jjrSFS+ jrSES,\R                  S(\
4S- jjr\R*                  R,                  S)\\	   4S. j5       r\R*                  R,                  SES/\
4S0 jj5       r\R*                  R,                  SGS1\
S)\\	\4   4S2 jj5       r\R*                  R,                  S)\R                  4S3 j5       rSHS\S\\	   4S4 jjr      SIS5\R>                  S6\\\\ \   4      S7\
S8\
S9\
S:\	S;\
S)\\ \R>                     \\R>                  \ \R>                     4   4   4S< jjr!   SJS6\\\ \   4   S=\
S>\
S)\ \   4S? jjr"S5\R>                  S)\R>                  4S@ jr#SGS5\R>                  SA\
S)\R>                  4SB jjr$S5\R>                  S)\R>                  4SC jr%SDr&U =r'$ )Kr!   i  zBEiT: BERT Pre-Training of Image Transformers.

Vision Transformer model with support for relative position bias and
shared relative position bias across layers. Implements both BEiT v1 and v2
architectures with flexible configuration options.
      r&     avg      Tr   Fro   NgMbP?img_size
patch_sizein_chansnum_classesglobal_pool	embed_dimdepthrC   rD   r   r   r   	drop_ratepos_drop_rateproj_drop_rateattn_drop_ratedrop_path_rater   r   use_abs_pos_embuse_rel_pos_biasuse_shared_rel_pos_biashead_init_scalec                   > UUS.n[         T U ]  5         X@l        X0l        XPl        U=U l        =U l        U l        SU l        SU l	        [        SUUUUS.UD6U l        U R                  R                  n[        U R                  S5      (       a  U R                  R                  5       OUn[        R                   " ["        R$                  " SSU40 UD65      U l        U(       a0  [        R                   " ["        R$                  " SUS-   U40 UD65      OSU l        [        R*                  " US9U l        U(       a(  [/        SU R                  R0                  US.UD6U l        OSU l        [5        UU5      n[        R6                  " [9        U5       Vs/ s H;  n[;        SUUU	U
UUUUUU   UUU(       a  U R                  R0                  OSS	.UD6PM=     sn5      U l        [9        U5       Vs/ s H  n[?        S
U 3UUS9PM     snU l         U R                  S:H  nU(       a  [        RB                  " 5       OU" U40 UD6U l"        U(       a	  U" U40 UD6O[        RB                  " 5       U l#        [        R*                  " U5      U l$        US:  a  [        RJ                  " Xd40 UD6O[        RB                  " 5       U l&        UU l'        U RQ                  SS9  gs  snf s  snf )a  Initialize BEiT model.

Args:
    img_size: Input image size.
    patch_size: Patch size for patch embedding.
    in_chans: Number of input image channels.
    num_classes: Number of classes for classification head.
    global_pool: Type of global pooling ('avg' or '').
    embed_dim: Embedding dimension.
    depth: Number of transformer blocks.
    num_heads: Number of attention heads.
    qkv_bias: If True, add learnable bias to query, key, value projections.
    mlp_ratio: Ratio of MLP hidden dimension to embedding dimension.
    swiglu_mlp: If True, use SwiGLU activation in MLP.
    scale_mlp: If True, apply layer normalization in MLP.
    drop_rate: Dropout rate.
    pos_drop_rate: Dropout rate for position embeddings.
    proj_drop_rate: Dropout rate for projections.
    attn_drop_rate: Dropout rate for attention.
    drop_path_rate: Stochastic depth rate.
    norm_layer: Normalization layer class.
    init_values: Initial values for layer scale.
    use_abs_pos_emb: If True, use absolute position embeddings.
    use_rel_pos_bias: If True, use relative position bias in attention.
    use_shared_rel_pos_bias: If True, share relative position bias across layers.
    head_init_scale: Scale factor for head initialization.
r'   r   F)r   r   r   r   
feat_ratioN)r|   )r"   rC   )rB   rC   rD   r   r   r   rG   rF   r   r   r   r"   zblocks.)modulenum_chs	reductionr   r   needs_resetr   ))rN   rO   r   r   r   num_featureshead_hidden_sizer   num_prefix_tokensgrad_checkpointingr   patch_embednum_patcheshasattrr   rQ   rT   r,   rU   	cls_token	pos_embedrZ   pos_dropr   	grid_sizer   r   
ModuleListranger   blocksdictfeature_infor   normfc_norm	head_droprR   headr   init_weights)!r]   r   r   r   r   r   r   r   rC   rD   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r(   r)   r^   r   rdpriuse_fc_normra   s!                                   r;   rO   Beit.__init__  s   n /& &ENNND1DN!""'% 
!	

 
 &&22-4T5E5E|-T-TD'')Zdekk!Q	&HR&HI[jekk![1_i&VSU&VWpt

]3" 4 ! ,,66#! !D !%D'>mm  5\!%#  "  #!##%((a&%':JD,,66PT  "!%# $$ QVV[P\^P\1D'!yAFP\^ &&%/%0BKKMj6Qb6Q	6Az)2r2r{{}I.?JQBIIi;;TVT_T_Ta	. 	e,9%#"^s   AK/K r   r#   c                    U R                  [        U R                  US95        U R                  b  [	        U R                  SS9  [	        U R
                  SS9  U R                  5         U R                  (       a  [        U R                  [        R                  5      (       a  [	        U R                  R                  SS9  [        R                  " 5          U R                  R                  R                  U R                  5        U R                  R                   R                  U R                  5        SSS5        ggg! , (       d  f       g= f)zInitialize model weights.

Args:
    needs_reset: If True, call reset_parameters() on modules that have it.
        Set to False when modules have already self-initialized in __init__.
r   N{Gz?std)applyr   _init_weightsr  r   r   fix_init_weightr   
isinstancer  rQ   rR   rm   r,   no_gradmul_rJ   )r]   r   s     r;   r  Beit.init_weights?  s     	

74--;GH>>%$..c2dnn#.Jtyy"))$D$D$))**4		  %%d&:&:;		##D$8$89 ! %E s   AE  
Ec                    [         R                  " 5          [        U R                  5       H  u  p[        R
                  " SUS-   -  5      nUR                  R                  R                  R                  U5        UR                  R                  R                  R                  U5        M     SSS5        g! , (       d  f       g= f)zFix initialization weights according to BEiT paper.

Rescales attention and MLP weights based on layer depth to improve
training stability.
g       @r   N)r,   r  	enumerater  mathsqrtr   r[   rm   div_r   fc2)r]   layer_idlayerrP   s       r;   r  Beit.fix_init_weightS  s{     ]]_#,T[[#9		#A"67

&&++E2		$$))%0 $: __s   BB88
Cmc                 0   [        U[        R                  5      (       aM  [        UR                  SS9  UR
                  b+  [        R                  R                  UR
                  S5        ggU(       a#  [        US5      (       a  UR                  5         ggg)zInitialize model weights.

Args:
    m: Module to initialize.
    needs_reset: If True, call reset_parameters() on modules that have it.
r  r  Nr   r\   )
r  rQ   rR   r   rm   rJ   r   r   r   r\   )r]   r'  r   s      r;   r  Beit._init_weights_  sm     a##!((,vv!!!!&&!, "WQ(:;;  <[r=   c                 r    SS1nU R                  5        H  u  p#SU;   d  M  UR                  U5        M      U$ )zyGet parameter names that should not use weight decay.

Returns:
    Set of parameter names to exclude from weight decay.
r  r   rY   )named_parametersadd)r]   nwdn_s       r;   no_weight_decayBeit.no_weight_decaym  s=     K())+DA-2
 , 
r=   enablec                     Xl         g)zeEnable or disable gradient checkpointing.

Args:
    enable: If True, enable gradient checkpointing.
N)r   )r]   r2  s     r;   set_grad_checkpointingBeit.set_grad_checkpointingz  s
     #)r=   coarsec                     [        SSS/S9nU$ )zCreate parameter group matcher for optimizer parameter groups.

Args:
    coarse: If True, use coarse grouping.

Returns:
    Dictionary mapping group names to regex patterns.
z-^cls_token|pos_embed|patch_embed|rel_pos_bias)z^blocks\.(\d+)N)z^norm)i )stemr  )r  )r]   r6  matchers      r;   group_matcherBeit.group_matcher  s!     A-/CD
 r=   c                     U R                   $ )zGGet the classifier head.

Returns:
    The classification head module.
)r  r   s    r;   get_classifierBeit.get_classifier  s     yyr=   c                     Xl         Ub  X l        US:  a'  [        R                  " U R                  U5      U l        g[        R
                  " 5       U l        g)z}Reset the classification head.

Args:
    num_classes: Number of classes for new head.
    global_pool: Global pooling type.
Nr   )r   r   rQ   rR   r   r   r  )r]   r   r   s      r;   reset_classifierBeit.reset_classifier  sB     '"*>IAoBIIdnnk:	SUS^S^S`	r=   rj   indicesreturn_prefix_tokensr	  
stop_early
output_fmtintermediates_onlyc           	         US;   d   S5       eUS:H  n/ n	[        [        U R                  5      U5      u  pUR                  u  ppU R	                  U5      n[
        R                  " U R                  R                  UR                  S   SS5      U4SS9nU R                  b  XR                  -   nU R                  U5      nU R                  b  U R                  5       OSn[
        R                  R                  5       (       d  U(       d  U R                  nOU R                  SUS-    n[        U5       H~  u  nnU R                  (       a/  [
        R                  R                  5       (       d  [!        UUUS	9nOU" UUS	9nUU
;   d  MU  U	R#                  U(       a  U R%                  U5      OU5        M     U R&                  (       aJ  U	 Vs/ s H  nUSS2SU R&                  24   PM     nnU	 Vs/ s H  nUSS2U R&                  S24   PM     n	nU(       ac  U R                  R)                  X45      u  nnU	 Vs/ s H7  nUR+                  UUUS5      R-                  SS
SS5      R/                  5       PM9     n	n[
        R                  R                  5       (       d  U(       a  [1        [3        U	W5      5      n	U(       a  U	$ U R%                  U5      nX4$ s  snf s  snf s  snf )a  Forward pass that returns intermediate feature maps.

Args:
    x: Input image tensor of shape (batch_size, channels, height, width).
    indices: Block indices to return features from. If int, returns last n blocks.
    return_prefix_tokens: If True, return both prefix and spatial tokens.
    norm: If True, apply normalization to intermediate features.
    stop_early: If True, stop at last selected intermediate.
    output_fmt: Output format ('NCHW' or 'NLC').
    intermediates_only: If True, only return intermediate features.

Returns:
    If intermediates_only is True, returns list of intermediate tensors.
    Otherwise, returns tuple of (final_features, intermediates).
)NCHWNLCz)Output format must be one of NCHW or NLC.rH  r   r+   r   rs   Nr   r&   r%   )r   lenr  rt   r   r,   ru   r   expandr  r  r   r   is_scriptingr  r   r   appendr	  r   dynamic_feat_sizerx   r1   r2   listzip)r]   rj   rB  rC  r	  rD  rE  rF  rx   intermediatestake_indices	max_indexr   r/  heightwidthr   r  r  blkyprefix_tokensHWs                           r;   forward_intermediatesBeit.forward_intermediates  sz   2 _,Y.YY,&"6s4;;7G"Q  ggfQIIt~~,,QWWQZR@!D!L>>%NN"AMM!.2.?.?.Kt((*QU99!!##:[[F[[)a-0F'FAs&&uyy/E/E/G/GsA<H|<L $$TTYYq\qA ( !!ERS]Qq!D$:$:"::;]MSDQRMqQq$"8"8"99:MMR##55voFDAq^kl^kYZQYYq!Q3;;Aq!QGRRT^kMlyy%%'',@ ]M!BCM  IIaL TR ms   K0K;>K!
prune_norm
prune_headc                    [        [        U R                  5      U5      u  pEU R                  SUS-    U l        U(       a  [        R                  " 5       U l        U(       a,  [        R                  " 5       U l        U R                  SS5        U$ )a  Prune layers not required for specified intermediate outputs.

Args:
    indices: Indices of blocks to keep.
    prune_norm: If True, remove final normalization.
    prune_head: If True, remove classification head.

Returns:
    List of indices that were kept.
Nr   r    )r   rJ  r  rQ   r   r	  r
  r@  )r]   rB  r]  r^  rR  rS  s         r;   prune_intermediate_layersBeit.prune_intermediate_layers  sh      #7s4;;7G"Qkk.9q=1DI;;=DL!!!R(r=   c                    U R                  U5      n[        R                  " U R                  R	                  UR
                  S   SS5      U4SS9nU R                  b  XR                  -   nU R                  U5      nU R                  b  U R                  5       OSnU R                   HI  nU R                  (       a/  [        R                  R                  5       (       d  [        X1US9nMC  U" XS9nMK     U R                  U5      nU$ )zForward pass through feature extraction layers.

Args:
    x: Input tensor of shape (batch_size, channels, height, width).

Returns:
    Feature tensor of shape (batch_size, num_tokens, embed_dim).
r   r+   r   rs   Nr   )r   r,   ru   r   rK  rt   r  r  r   r  r   r   rL  r   r	  )r]   rj   r   rV  s       r;   forward_featuresBeit.forward_features	  s     QIIt~~,,QWWQZR@!D!L>>%NN"AMM!.2.?.?.Kt((*QU;;C&&uyy/E/E/G/Gs<H<	 
 IIaLr=   
pre_logitsc                    U R                   (       a;  U R                   S:X  a"  USS2U R                  S24   R                  SS9OUSS2S4   nU R                  U5      nU R	                  U5      nU(       a  U$ U R                  U5      $ )a  Forward pass through classification head.

Args:
    x: Feature tensor of shape (batch_size, num_tokens, embed_dim).
    pre_logits: If True, return features before final linear layer.

Returns:
    Logits tensor of shape (batch_size, num_classes) or pre-logits.
r   Nr   rs   r   )r   r   meanr
  r  r  )r]   rj   rf  s      r;   forward_headBeit.forward_head!  s     =A=M=MQV=V!T++,,-22q29\]^_ab^b\cALLONN1q0DIIaL0r=   c                 J    U R                  U5      nU R                  U5      nU$ )zForward pass through the model.

Args:
    x: Input tensor of shape (batch_size, channels, height, width).

Returns:
    Logits tensor of shape (batch_size, num_classes).
)rd  ri  )r]   rj   s     r;   r   Beit.forward1  s)     !!!$a r=   )r  r   r   r
  r  r   r   r  r  r   r   r   r	  r   r   r   r   r  r  r   )Tr   Fr   )NFFFrH  F)r   FT)(r   r   r   r   r   r   r   r   r
   strr   r   r   rQ   r   r   rO   r  r  r  r,   r   ignorer	   r0  r4  r   r   r:  r=  r@  r   r   r[  ra  rd  ri  r   r   r   r   s   @r;   r!   r!     s    5868#$ !!$#!#%$&$&$&*3+/$(%*,1%*5u-CsCx01u- c5c?23u- 	u-
 u- u- u- u- u- u- u- u- u- u- !u-  "!u-" "#u-$ "%u-& RYY'u-( "%)u-* "+u-, #-u-. &*/u-0 #1u- u-n: : :(
1!ryy !t ! YY
S 
 
 YY)T ) ) YYD T#s(^   YY		  
aC 
ahsm 
a 8<).$$',F ||F  eCcN34F  #'	F 
 F  F  F  !%F  
tELL!5tELL7I)I#JJ	KF T ./$#	3S	>*  	
 
c2%,, 5<< 01ell 1 1 1  %,,  r=   urlc                 $    U SSSSSSSSSS	S
S.UE$ )zCreate a default configuration dictionary for BEiT models.

Args:
    url: Model weights URL.
    **kwargs: Additional configuration parameters.

Returns:
    Configuration dictionary.
r   )r&   r   r   Ng?bicubicT)      ?rs  rs  zpatch_embed.projr  z
apache-2.0)rp  r   
input_size	pool_sizecrop_pctinterpolationfixed_input_sizerh  r  
first_conv
classifierlicenser   )rp  kwargss     r;   _cfgr}  ?  s6     =t(  r=   ztimm/)	hf_hub_id)r&     r  g      ?)r~  rt  rv  iQU  )r~  r   )r&      r  )r~  rh  r  )r~  r   rh  r  gffffff?)r~  rv  rh  r  )z)beit_base_patch16_224.in22k_ft_in22k_in1kz)beit_base_patch16_384.in22k_ft_in22k_in1kz$beit_base_patch16_224.in22k_ft_in22kz*beit_large_patch16_224.in22k_ft_in22k_in1kz*beit_large_patch16_384.in22k_ft_in22k_in1kz*beit_large_patch16_512.in22k_ft_in22k_in1kz%beit_large_patch16_224.in22k_ft_in22kz*beitv2_base_patch16_224.in1k_ft_in22k_in1kz$beitv2_base_patch16_224.in1k_ft_in1kz%beitv2_base_patch16_224.in1k_ft_in22kz+beitv2_large_patch16_224.in1k_ft_in22k_in1kz%beitv2_large_patch16_224.in1k_ft_in1kz&beitv2_large_patch16_224.in1k_ft_in22k
state_dictmodelrw  	antialiasc           
      H   U R                  SU 5      n U R                  SU 5      n 0 nU R                  5        GHf  u  pVSU;   a  M  SU;   ab  UR                  R                  R                  R
                  u  pxpUR
                  S   U
:w  d  UR
                  S   U	:w  a  [        UX4UUSS9nOUS	:X  aO  UR
                  S
   UR                  R
                  S
   :w  a%  S
n[        UUR                  R                  UUUSS9nOUR                  S5      (       a  UR                  USS 5      nUR
                  UR                  R
                  :w  d   UR                  S   UR                  S
   :w  a)  [        UUR                  UR                  R
                  S9nXdU'   GMi     U$ )a  Filter and process checkpoint state dict for loading.

Handles resizing of patch embeddings, position embeddings, and relative position
bias tables when model size differs from checkpoint.

Args:
    state_dict: Checkpoint state dictionary.
    model: Target model to load weights into.
    interpolation: Interpolation method for resizing.
    antialias: If True, use antialiasing when resizing.

Returns:
    Filtered state dictionary.
r  r   r:   zpatch_embed.proj.weightr+   rr   T)rw  r  verboser  r   )new_sizer   rw  r  r  rY   Nir   )new_window_sizenew_bias_shape)getitemsr   r[   rm   rt   r   r  r   r  endswithget_submodulerY   r"   r   )r  r  rw  r  out_dictr   r   OIrY  rZ  r   r'  s                r;   checkpoint_filter_fnr    s    4J*5J H  "$)$)**//66<<JA!wwr{a1772;!#3(F"/'  +!''!*0E0Ea0H"H !&**44"3+#A ZZ677##AdsG,Aww!88>>>!--PQBRVWVcVcdeVfBf-$%MM#$#A#A#G#G
 A #B Or=   variant
pretrainedc           	      j    UR                  SS5      n[        [        X4[        [	        USS9S.UD6nU$ )zCreate a BEiT model.

Args:
    variant: Model variant name.
    pretrained: If True, load pretrained weights.
    **kwargs: Additional model arguments.

Returns:
    BEiT model instance.
out_indicesr&   getter)r  feature_cls)pretrained_filter_fnfeature_cfg)popr   r!   r  r  )r  r  r|  r  r  s        r;   _create_beitr    sF     **]A.K g1[hG 	E Lr=   c                 T    [        SSSSSSSSS9n[        S
S	U 0[        U40 UD6D6nU$ )z0BEiT base model @ 224x224 with patch size 16x16.r   r   r   rn   FT皙?r   r   r   rC   r   r   r   r   r  )beit_base_patch16_224r  r  r  r|  
model_argsr  s       r;   r  r    sF     B"#GJ fZf4PZKe^dKefELr=   c                 T    [        SSSSSSSSS9n[        S
S	U 0[        U40 UD6D6nU$ )z0BEiT base model @ 384x384 with patch size 16x16.r  r   r   r   FTr  r   r   r   r   rC   r   r   r   r  )beit_base_patch16_384r  r  s       r;   r  r    sF     s"#GJ fZf4PZKe^dKefELr=   c           
      R    [        SSSSSSSS9n[        S	SU 0[        U40 UD6D6nU$ )
z1BEiT large model @ 224x224 with patch size 16x16.r         FTh㈵>r   r   r   rC   r   r   r   r  )beit_large_patch16_224r  r  s       r;   r  r    sD     R2$HJ gjgDQ[Lf_eLfgELr=   c                 T    [        SSSSSSSSS9n[        S
S	U 0[        U40 UD6D6nU$ )z1BEiT large model @ 384x384 with patch size 16x16.r  r   r  r  FTr  r  r  )beit_large_patch16_384r  r  s       r;   r  r    F     t2$HJ gjgDQ[Lf_eLfgELr=   c                 T    [        SSSSSSSSS9n[        S
S	U 0[        U40 UD6D6nU$ )z1BEiT large model @ 512x512 with patch size 16x16.r  r   r  r  FTr  r  r  )beit_large_patch16_512r  r  s       r;   r  r    r  r=   c                 T    [        SSSSSSSSS9n[        S
S	U 0[        U40 UD6D6nU$ )z3BEiT v2 base model @ 224x224 with patch size 16x16.r   r   r   rn   FTr  r  r  )beitv2_base_patch16_224r  r  s       r;   r  r    sF     B"$HJ hzhTR\Mg`fMghELr=   c           
      R    [        SSSSSSSS9n[        S	SU 0[        U40 UD6D6nU$ )
z4BEiT v2 large model @ 224x224 with patch size 16x16.r   r  r  FTr  r  r  )beitv2_large_patch16_224r  r  s       r;   r  r    sD     R2$HJ i
idS]NhagNhiELr=   r   )r`  )rr  Trm  )Cr   r   	functoolsr   typingr   r   r   r   r   r	   r
   r   r   r,   torch.nnrQ   torch.nn.functional
functionalrv   	timm.datar   r   timm.layersr   r   r   r   r   r   r   r   r   r   r   r   _builderr   	_featuresr   _manipulater   	_registryr   r    __all__r   r   r<   r   r?   r   r   r!   rn  r}  default_cfgsr   r  r  r  r  r  r  r  r  r  r   r=   r;   <module>r     sN  P   O O O     A    + + # <( #U38_  #ell  #Ff		 fRqBII qh4D299 4Dn|299 |~c T#s(^ * %152 26 32
 -1-
 373 37 33
 37 33
 .2. 37"(<3
 -1"(<-
 .2 5;O.
 4817K4
 .217K.
 /3 5;O/u?& ?D5T#u||*;%< 5RYY 5_b 5{ 5  LP  QT  V[  Vb  Vb  Qb  Lc 5p# 4 d * d    d    t $   t $   t $    4    D  r=   