
    RЦijp                       S r SSKrSSKrSSKJrJrJr  SSKJr  SSK	J
r
JrJrJrJrJrJrJrJr  SSKrSSKJr  SSKJs  Jr  SSKJrJr  SSKJrJrJrJ r J!r!J"r"J#r#J$r$J%r%J&r&J'r'J(r(  SS	K)J*r*  SS
K+J,r,  SSK-J.r.J/r/  SSK0J1r1J2r2  SSK3J4r4J5r5  SSK6J7r7  SSK8J9r9J:r:  SS/r;\Rx                  " \=5      r>\ " S S5      5       r?S\?S\?4S jr@ SGS\R                  S\\B\B4   S\CS\\R                  \\B\B4   4   4S jjrDS\R                  4S jrE " S S5      rFS\?S\
4S jrG\/ " S  S!\R                  5      5       rI\.SSS\R                  4S"\R                  S#\BS$\CS%\\B   S&\R                  S\\R                     4S' jj5       rL\.    SHS\R                  S"\\R                     S(\MS#\BS)\CS\R                  4S* jj5       rN " S+ S\R                  5      rOS, rPSIS-\MS.\QS/\CS\
4S0 jjrRS1\\M\4   S2\OS\\M\4   4S3 jrSSJS4\MS\\M\4   4S5 jjrT\5" \T" S6S79\T" S6S79\T" S6S79\T" 5       \T" 5       \T" 5       \T" S6SS89\T" S6SS89S9.5      rUSKS:\MS;\CS\O4S< jjrV SKS:\MS;\CS\O4S= jjrW SKS:\MS;\CS\O4S> jjrX\4SKS;\CS\O4S? jj5       rY\4SKS;\CS\O4S@ jj5       rZ\4SKS;\CS\O4SA jj5       r[\4SKS;\CS\O4SB jj5       r\\4SKS;\CS\O4SC jj5       r]\4SKS;\CS\O4SD jj5       r^\4SKS;\CS\O4SE jj5       r_\4SKS;\CS\O4SF jj5       r`g)La  NaFlex Vision Transformer

An improved version of the Vision Transformer with:
1. Encapsulated embedding and position encoding in a single module
2. Support for linear patch embedding on pre-patchified inputs
3. Support for NaFlex variable aspect, variable resolution
4. Support for FlexiViT variable patch size
5. Support for NaViT fractional/factorized position embedding

Based on ideas from:
- Original Vision Transformer: https://arxiv.org/abs/2010.11929
- FlexiViT: https://arxiv.org/abs/2212.08013
- NaViT: https://arxiv.org/abs/2307.06304
- NaFlex (SigLip-2): https://arxiv.org/abs/2502.14786

Hacked together by / Copyright 2025, Ross Wightman, Hugging Face
    N)	dataclassfieldsreplace)partial)	CallableDictListOptionalSetTupleTypeUnionAnyIMAGENET_INCEPTION_MEANIMAGENET_INCEPTION_STD)AttentionPoolLatentMlp	LayerNormPatchDropoutWithIndicesPatchEmbedInterpolator_assert	to_2tupleget_act_layerget_norm_layerapply_keep_indices_nlcdisable_compilercalculate_drop_path_rates   )build_model_with_cfg)feature_take_indices)register_notrace_functionregister_notrace_module)
checkpointnamed_apply)register_modelgenerate_default_cfgs)EvaBlock)Blockglobal_pool_nlcNaFlexVitCfg	NaFlexVitc                   Z   \ rS rSr% SrSr\\\\\4   4   \	S'   Sr
\\	S'   Sr\\	S'   Sr\\	S	'   S
r\\	S'   Sr\\	S'   Sr\\	S'   Sr\\	S'   Sr\\	S'   Sr\\	S'   Sr\\	S'   Sr\\   \	S'   Sr\\	S'   Sr\\	S'   Sr\\	S'   Sr\\	S'   Sr\\	S'   Sr\\	S'   Sr\\	S'   Sr\ \	S '   S!r!\\\\4      \	S"'   S#r"\ \	S$'   Sr#\\	S%'   Sr$\\	S&'   S'r%\ \	S('   S)r&\\	S*'   Sr'\\\\4      \	S+'   Sr(\\	S,'   S-r)\ \	S.'   Sr*\\	S/'   Sr+\\	S0'   Sr,\\	S1'   Sr-\\   \	S2'   S3r.\ \	S4'   Sr/\\	S5'   Sr0\\   \	S6'   Sr1\\   \	S7'   S'r2\ \	S8'   Sr3\\	S9'   S:r4\ \	S;'   Sr5\\    \	S<'   Sr6\\    \	S='   Sr7\\    \	S>'   Sr8\\    \	S?'   Sr9\\    \	S@'   Sr:\\    \	SA'   Sr;\\    \	SB'   SCr<\ \	SD'   Sr=\\	SE'   Sr>\\	SF'   Sr?\\	SG'   SHr@g)Ir+   :   zConfiguration for FlexVit model.

This dataclass contains the bulk of model configuration parameters,
with core parameters (img_size, in_chans, num_classes, etc.) remaining
as direct constructor arguments for API compatibility.
   
patch_size   	embed_dim   depth	num_headsg      @	mlp_ratioFscale_mlp_normTqkv_biasqk_norm	proj_bias        attn_drop_ratescale_attn_inner_normNinit_values	drop_ratepos_drop_ratepatch_drop_rateproj_drop_ratedrop_path_rateclass_tokenr   
reg_tokenslearned	pos_embed)r/   r/   pos_embed_grid_sizebicubicpos_embed_interp_modepos_embed_ar_preservingpos_embed_use_grid_sample 	rope_type     @rope_temperaturerope_ref_feat_shaperope_grid_offsetijrope_grid_indexingdynamic_img_padpre_norm
final_normfc_normmapglobal_poolpool_include_prefixattn_pool_num_headsattn_pool_mlp_ratioweight_initfix_initlinearembed_proj_typeinput_norm_layerembed_norm_layer
norm_layer	act_layerblock_fn	mlp_layer
attn_layerstandard	attn_type
swiglu_mlp	qkv_fusedenable_patch_interpolator )A__name__
__module____qualname____firstlineno____doc__r0   r   intr   __annotations__r2   r4   r5   r6   floatr7   boolr8   r9   r:   r<   r=   r>   r
   r?   r@   rA   rB   rC   rD   rE   rG   strrH   rJ   rK   rL   rN   rP   rQ   rR   rT   rU   rV   rW   rX   rZ   r[   r\   r]   r^   r_   ra   rb   rc   rd   re   rf   rg   rh   rj   rk   rl   rm   __static_attributes__rn       T/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/models/naflexvit.pyr+   r+   :   s    /1Jc5c?*+0IsE3OIsIu ND  HdGTItNE"'4' $(K%'IuM5 OU NENE KJ Is5=%S/2=!*3*$)T)&+t+ Is%e%59%S/29 e "" "OT! HdJ"GXd^" K %%)-#-+/%/ KHd $OS#&*hsm*&*hsm* !%J$#Ix}#"Hhsm"#Ix}# $J$  IsJIt ',t+rz   cfgreturnc                     [        U R                  R                  5       5      nUR                  5        VVs0 s H  u  p4X2;   d  M  X4_M     nnnU(       a  [	        U 40 UD6n U $ s  snnf )zIOverlay kwargs onto config, replacing config values with provided kwargs.)set__dataclass_fields__keysitemsr   )r|   kwargsconfig_fieldskvconfig_kwargss         r{   _overlay_kwargsr      s_     005578M&,llnKnda8JTQTnMKc+]+J Ls   A$A$Txr0   padc                 J   U R                   u  p4pVUu  pxU(       a?  XW-  S:w  d  Xh-  S:w  a/  XuU-  -
  U-  n	XU-  -
  U-  n
[        R                  " U SU
SU	45      n XW-  Xh-  pU R                  X4XX5      R	                  SSSSSS5      R                  X;U-  Xx-  U-  5      nXU44$ )aH  Patchify a batch of images.

Args:
    x: Input tensor of shape [B, C, H, W].
    patch_size: Patch dimensions (patch_h, patch_w).
    pad: Whether to pad images to be divisible by patch size.

Returns:
    Tuple of (patches, grid_size) where patches has shape [B, N, P*P*C]
    and grid_size is (num_patches_h, num_patches_w).
r               r   )shapeFr   viewpermutereshape)r   r0   r   BCHWphpwpad_hpad_wnhnwpatchess                 r{   batch_patchifyr      s      JA!FB !qv{"f""f"EE!a5)*WagffQ22*221aAq!DLLQUWPWY[Y`cdYdeG Hrz   _coordc           	         U S S 2S S 2S4   R                  SS9S-   nU S S 2S S 2S4   R                  SS9S-   n[        X5       VVs/ s H7  u  p4[        UR                  5       5      [        UR                  5       5      4PM9     snn$ s  snnf )Nr   r   dim)amaxziprt   item)r   max_ymax_xhws        r{   calculate_naflex_grid_sizesr      s    1a7O  Q '!+E1a7O  Q '!+E7:57HI7HtqS]CM*7HIIIs   >B	c                       \ rS rSrSrS\\\\4   \\   4   S\\\\4      S\S\S\	R                  S\	R                  4S	 jrS
 r\S 5       rSrg)NaFlexRopeIterator   zXIterator for generating batched ROPE embeddings for mixed mode with multiple grid sizes.size_to_indicesunique_sizes
batch_sizeseq_lendevicedtypec                 F   Xl         X l        X0l        X@l        XPl        Xpl        X`l        UR                  U l        UR                  U l        SUR                  -  UR                  -  U l
        SU l        0 U l        U H   nUR                  US9n	XR                  U'   M"     g )Nr   r   r   )roper   r   r   r   r   r   r4   r5   r   head_dim
_depth_idx_embeddings_per_size	get_embed)
selfrope_moduler   r   r   r   r   r   	grid_size
rope_embeds
             r{   __init__NaFlexRopeIterator.__init__   s      	.($
 &&
$..KOO+{/D/DD %'!%I$..Y.?J3=%%i0 &rz   c                     SU l         U $ )Nr   )r   r   s    r{   __iter__NaFlexRopeIterator.__iter__   s    rz   c           	         U R                   U R                  :  a  [        e[        R                  " U R
                  U R                  U R                  U R                  U R                  U R                  S9nU R                   HY  nUu  p4X4-  nU R                  U   nU R                  U   U R                      nU H  nUS S 2S U2S S 24   XS S 2S U2S S 24'   M     M[     U =R                   S-  sl         U$ )Nr   r   r   )r   r4   StopIterationtorchzerosr   r5   r   r   r   r   r   r   r   )	r   batch_embedr   r   r   
actual_lenbatch_indicesembedbis	            r{   __next__NaFlexRopeIterator.__next__   s    ??djj( kkOOT^^T\\4==**T[[
 **IDAJ 00;M --i8IE $5:1kzk1;L5M;J;12 $ + 	1rz   )r   r   r   r4   r   r   r   r5   r   r   r   r   N)ro   rp   rq   rr   rs   r   r   rt   r	   r   r   r   r   r   r   r   ry   rn   rz   r{   r   r      s    b> eCHotCy89> 5c?+	>
 > > > {{>:  rz   r   c           
         U R                   S;   =(       d"    U R                  S;  =(       d    U R                  nU(       a  U R                   nUS:X  a  U R                  S;  a  SnU R                  (       a  SOSU R                  -   n[        [        UU R                  U R                  U R                  U R                  US9$ U R                  =(       d    [        n0 nU R                  (       d  U R                  (       a  U R                  US'   U R                  US	'   U R                  (       a  U R                  US
'   U(       a  [        U40 UD6nU$ )zGet appropriate block function based on configuration.

Returns a partially applied block constructor with EVA-specific
or conflicting parameters pre-configured if needed.
)evar   )rM   noneri   r   r   r   )rj   rk   	scale_mlpscale_attn_innerrl   num_prefix_tokensr7   scale_attn_normrh   )rj   rN   rk   rD   rE   r   r(   r7   r=   rl   rf   r)   rh   )r|   use_eva_featuresrj   r   rf   block_kwargss         r{   get_block_fnr     s    	( 	\)	  MM	
"s}}L'HI"%//QqCNNJ~~(( 66mm/
 	
 <<(5!:!:-0-?-?L)*.1.G.GL*+>>),L&x8<8Hrz   c            )         ^  \ rS rSrSr                     S+S\\\\\4   4   S\S\S\\	   S\
S	\
S
\S\
S\\\\\\4   4      S\	S\\\\4      S\	S\
S\
S\\\R                        S\\
\\\R                        4   S\\\R                        S\S\
SS4(U 4S jjjrS,S jrS\\	\4   4S jrS-S\
S\\\\\4   4   4S jjrS\\\4   S\\\4   4S jr\S\R0                  S \R0                  SS4S! j5       r\S\R0                  S \R0                  SS4S" j5       rS\R0                  S#\\   SS4S$ jr\S\R0                  S \R0                  SS4S% j5       r\S\R0                  S \R0                  SS4S& j5       rS\R0                  S#\\   SS4S' jr  S.S\R0                  S \\R0                     S(\\R0                     S\\R0                  \\\\4      4   4S) jjr S*r!U =r"$ )/NaFlexEmbedsi1  aq  NaFlex Embedding module for Vision Transformers.

This module encapsulates the complete embedding process for Vision Transformers,
supporting both standard and NaFlex (NaViT + FlexiViT) functionality:

1. Patch embedding (via Conv2d or Linear)
2. Class and register token preparation
3. Position embedding addition with interpolation support
4. Pre-normalization (if requested)
5. Dropout application

NaFlex capabilities include:
- Variable aspect ratio and resolution via patch coordinates
- Patch type indicators for handling padding tokens in attention
- Flexible position embedding interpolation for arbitrary grid sizes
- Support for factorized position embeddings

The patch embedding can be one of two types:
- Conv2d-based (default): For standard image inputs [B, C, H, W]
- Linear-based: For pre-patchified inputs [B, N, P*P*C]

Args:
    patch_size: Size of patches for patch embedding
    in_chans: Number of input image channels
    embed_dim: Dimensionality of patch embedding
    proj_type: Type of embedding projection layer ('conv' or 'linear')
    input_norm_layer: Normalization layer applied to input (linear mode only)
    proj_norm_layer: Normalization layer applied after projection
    pos_embed: Type of position embedding ('learned', 'factorized', 'none')
    pos_drop_rate: Dropout rate for position embeddings
    class_token: Whether to include a class token
    reg_tokens: Number of register tokens to include
    bias: Whether to use bias in projection layers
    dynamic_img_pad: Whether to enable dynamic padding for variable resolution
    pos_embed_grid_size: Grid size for position embedding initialization
    pos_embed_interp_mode: Interpolation mode for position embedding resizing
    pos_embed_ar_preserving: Whether to preserve aspect ratio during position embedding interpolation
    default_img_size: Default image size for position embedding grid calculation
Nr0   in_chansr2   	proj_typer:   rD   rE   rU   default_img_sizerG   rH   rJ   rK   rL   rb   proj_norm_layerrd   r@   rm   r}   c                 &  > UUS.n[         TU ]  5         X`l        Xpl        Xl        Xl        Xl        [        U5      U l        X l	        X0l
        Xl        UU l        U(       a  SOSU l        U =R                  U-  sl        U(       a-  [        R                  " [         R"                  " SSU40 UD65      OSU l        U(       a,  [        R                  " [         R"                  " SXs40 UD65      OSU l        SU l        SU l        Ub  Xl        OZU	bW  [        U	5      U l        [-        [/        U R(                  U R                  5       VVs/ s H  u  nnUU-  PM     snn5      U l        US:X  a  U R                  S   U R                  S   -  U-  nUSL a
  Uc   S5       eUSL a  UO
U=(       d    SnU(       a  U" U5      OSU l        [        R2                  " UU4SU0UD6U l        S	U l        SU l        O?U(       a   eSU l        [        R:                  " UU4UUUS
.UD6U l        SU l        S	U l        U R                  (       a  [=        U R                  UUUSS9U l        OSU l        USL a
  Uc   S5       eUSL a  UO
U=(       d    SnU(       a  U" U5      O[        R@                  " 5       U l!        U
S;   a  U R*                  c  [E        S5      eSU l#        SU l$        SU l%        U
(       a  U
S:X  a  SU l&        OU
S:X  a  U R*                  c   eU R*                  u  nnSU l&        [        R                  " [         R"                  " SUU40 UD65      U l$        [        R                  " [         R"                  " SUU40 UD65      U l%        OXU R*                  c   eU R*                  u  nn[        R                  " [         R"                  " SUUU40 UD65      U l#        SU l&        [        RN                  " US9U l(        U RS                  5         gs  snnf )a  Initialize NaFlexEmbeds module.

Args:
    patch_size: Size of patches for patch embedding.
    in_chans: Number of input image channels.
    embed_dim: Dimensionality of patch embedding.
    proj_type: Type of embedding projection layer ('conv' or 'linear').
    proj_bias: Whether to use bias in projection layers.
    class_token: Whether to include a class token.
    reg_tokens: Number of register tokens to include.
    dynamic_img_pad: Whether to enable dynamic padding for variable resolution.
    default_img_size: Default image size for position embedding grid calculation.
    pos_embed: Type of position embedding ('learned', 'factorized', 'none').
    pos_embed_grid_size: Grid size for position embedding initialization.
    pos_embed_interp_mode: Interpolation mode for position embedding resizing.
    pos_embed_ar_preserving: Whether to preserve aspect ratio during interpolation.
    input_norm_layer: Normalization layer applied to input (linear mode only).
    proj_norm_layer: Normalization layer applied after projection.
    norm_layer: Default normalization layer.
    pos_drop_rate: Dropout rate for position embeddings.
    enable_patch_interpolator: Enable dynamic patch size support.
r   r   r   r   Nr`   Tz5`norm_layer` must be given when input_norm_layer=TruebiasF)kernel_sizestrider   )base_patch_sizer   r2   interpolation	antialiasz4`norm_layer` must be given when proj_norm_layer=True)
factorizedrF   zgCannot initialize position embeddings without grid_size.Please provide img_size or pos_embed_grid_size.r   r   rF   )p)*superr   has_class_tokennum_reg_tokensrJ   rK   rL   r   r0   r   r2   rU   rm   r   nn	Parameterr   empty	cls_token	reg_tokenr   rH   tupler   
norm_inputLinearprojflatten	is_linearConv2dr   patch_interpolatorIdentitynorm
ValueErrorrG   pos_embed_ypos_embed_xpos_embed_typeDropoutpos_dropreset_parameters)r   r0   r   r2   r   r:   rD   rE   rU   r   rG   rH   rJ   rK   rL   rb   r   rd   r@   rm   r   r   ddsr   	patch_dimr   r   	__class__s                               r{   r   NaFlexEmbeds.__init__[  s   \ /*(%:"'>$)B&#J/ ".)B& '2q*, NYekk!Q	&HR&HI^bV`ekk!Z&Qb&QRfj <@>B *':$)$-.>$?D!',TEZEZ\`\k\kAl-mAlAa1fAl-m'nD$   *T__Q-??(JI(D0Z5G HGHH-=-EzL\Ld`d=M.y9SWDO		)YMYM"MDI DL!DN (''"DO		 '! DI  DL"DN ))&< $!#3'D# '+D# $t+
0B 	CB	CC(74(?*oF]Y]2AOI.r{{}	 11d6N6N6VBC C 263737I/"(D,&++777++DAq".D!||EKK1i,N2,NOD!||EKK1i,N2,NOD++777++DAq\\%++aAy*OB*OPDN"+D 

]3 	S .ns   	P
c                    U R                   b(  [        R                  R                  U R                   SS9  U R                  b(  [        R                  R                  U R                  SS9  U R
                  b(  [        R                  R                  U R
                  SS9  U R                  b(  [        R                  R                  U R                  SS9  U R                  b)  [        R                  R                  U R                  SS9  g g )Ngư>)stdg{Gz?)r   r   initnormal_r   rG   r   r  r   s    r{   r  NaFlexEmbeds.reset_parameters  s    >>%GGOODNNO5>>%GGOODNNO5>>%GGOODNNO4'GGOOD,,#O6'GGOOD,,#O6 (rz   c                 >    [        U R                  U R                  S9$ )zGet feature information for feature extraction.

Args:
    location: Feature extraction location identifier

Returns:
    Dictionary containing feature channel count and reduction factor
)num_chs	reduction)dictr2   r0   )r   locations     r{   feature_infoNaFlexEmbeds.feature_info  s     DNNdooFFrz   	as_scalarc                 R    U(       a  [        U R                  5      $ U R                  $ )zGet the feature reduction ratio (stride) of the patch embedding.

Args:
    as_scalar: Whether to return the maximum dimension as a scalar

Returns:
    Feature reduction ratio as scalar or tuple
)maxr0   )r   r  s     r{   
feat_ratioNaFlexEmbeds.feat_ratio	  s      t''??"rz   img_sizec                    U R                   (       aR  [        R                  " US   U R                  S   -  5      [        R                  " US   U R                  S   -  5      4$ US   U R                  S   -  US   U R                  S   -  4$ )zCalculate grid (feature) size for given image size.

Takes into account dynamic padding when enabled.

Args:
    img_size: Input image size as (height, width)

Returns:
    Grid size as (grid_height, grid_width)
r   r   )rU   mathceilr0   )r   r  s     r{   dynamic_feat_sizeNaFlexEmbeds.dynamic_feat_size  s     99Xa[4??1+==>		(ST+X\XgXghiXjJj@kkkA;$//!"44hqkT__UVEW6WWWrz   r   patch_coordc           
      |  ^ ^^^^ [        U5      nT R                  R                  SS u  mmT R                  R                  SSSS5      R	                  5       mUUUU U4S jn0 n[        U5       H&  u  pgUR                  U/ 5      R                  U5        M(     UR                  5        H  u  pxU" U5      n	[        TR                  S   U	R                  S   5      n
TSS2SU
24   R                  S[        R                  " UTR                  S9U	SS2SU
24   R                  [        U5      SS5      5        M     g)	a/  Apply learned position embeddings to NaFlex batch in-place.

Interpolates learned 2D position embeddings for each sample in the batch
based on their individual grid sizes.

Args:
    x: Input tensor to add position embeddings to [B, N, C]
    patch_coord: Patch coordinates [B, N, 2] with (y, x) values
r   r   r   r   c                   > U S   T:X  a*  U S   T:X  a!  TR                   R                  STT-  S5      nO}TR                  (       a  [        [	        U 5      5      OU n[
        R                  " TUTR                  SSS9SS2SS2SU S   2SU S   24   R                  S5      R                  SS5      nUR                  TR                  S	9$ )
z
Return a flattened positional-embedding grid at an arbitrary spatial resolution.

Converts the learned 2-D table stored in NCHW format (pos_embed_nchw) into
a (1, H*W, C) sequence that matches the requested size.
r   r   FTsizemodealign_cornersr   Nr   r   )rG   r   rK   r   r  r   interpolaterJ   r   	transposetor   )r'  pos_embed_flat_interp_sizeorig_horig_wpos_embed_nchwr   r   s      r{   	_interp2d?NaFlexEmbeds._apply_learned_naflex_pos_embed.<locals>._interp2d;  s     Q6!Q6(9!%!7!76F?B!O7;7S7SyT3Y]!""%33"'"" Qa(47(*", -4GAJyyA  "$$177$33rz   Nr   r%  )r   rG   r   r   rv   	enumerate
setdefaultappendr   min
index_add_r   	as_tensorr   expandlen)r   r   r"  naflex_grid_sizesr3  r   r   r   r   r.  r   r0  r1  r2  s   ``         @@@r{   _apply_learned_naflex_pos_embed,NaFlexEmbeds._apply_learned_naflex_pos_embed'  s      8D--a2//1a;AAC	4 	4* =?01EB&&q"-44R8 2 !0 5 5 7A 'q\N!''!*n&:&:1&=>Ga'kN%%ahh?q(7({+223}3Er2N !8rz   c                 |   UR                   nUR                  u  pEnUR                  SS9R                  S-   nU R                  (       a(  UR                  SS9nUR                  5       n	U	=pX-  =pO'UR                  SS9u  pXSS2S4   -  nXSS2S4   -  n[        R                  " USSU[        R                  S9nXSS2SS4'   XSS2SS4'   US-
  USS2SS4'   US-
  USS2SS4'   [        R                  " XXjU4SS	9n[        R                  " U R                  R                  SSSS5      R                  US
S
S
5      R                  5       UU R                   SSS9R#                  UR$                  S9n[        R&                  " XC[        R(                  S9R+                  S5      nUUUSS2US   US   4   -  ng)af  Apply learned position embeddings to NaFlex batch using grid_sample.

Uses F.grid_sample for efficient interpolation of learned 2D position embeddings
based on patch coordinates. Based on proposal by https://github.com/stas-sl

Args:
    x: Input tensor to add position embeddings to [B, N, C]
    patch_coord: Patch coordinates [B, N, 2] with (y, x) values
r   r   r   Nr   r   r   Fr)  r%  borderr(  r)  padding_moder*  .r   .r   )r   r   r  valuesrK   r   r   r   float32r   affine_gridgrid_samplerG   r   r<  rv   rJ   r-  r   arangelong	unsqueeze)r   r   r"  r   r   Nr   shapesL_iL_globalgrid_size_ygrid_size_xscale_xscale_ythetagridrG   r   s                     r{   +_apply_learned_naflex_pos_embed_grid_sample8NaFlexEmbeds._apply_learned_naflex_pos_embed_grid_sample`  s    ''aQ'..2''++!+$CxxzH(00K (.Gg'-{{q{'9$K!1a4L0G!1a4L0GAq!F%--H aAg aAg 1aAg 1aAg}}U$DTYZMMNN""1aA.55aRDJJL++!
 "177"
 	 \\!%**=GGJ	Yr1k&1;v3FFGGrz   r   c                 6   U R                   R                  SS u  p4US   U:X  a)  US   U:X  a   U R                   R                  SX4-  S5      nOU R                  (       a  [	        U5      nXf4nOUn[
        R                  " U R                   R                  SSSS5      R                  5       UU R                  SSS9S	S	2S	S	2S	US   2S	US   24   R                  S5      R                  SS5      nUR                  UR                  S
9nUR                  U5        g	)a  Apply learned position embeddings to standard 2D batch in-place.

Interpolates learned 2D position embeddings to match the specified grid size.

Args:
    x: Input tensor to add position embeddings to [B, H*W, C]
    grid_size: Target grid size as [height, width]
r   r   r   r%  r   FTr&  Nr*  )rG   r   r   rK   r  r   r+  r   rv   rJ   r   r,  r-  r   add_)r   r   r   r0  r1  r.  Lr/  s           r{   _apply_learned_pos_embed%NaFlexEmbeds._apply_learned_pos_embed  s    --a2Q<6!ilf&<!^^33AvKN ++	N t(]]&&q!Q288:!//# MYq\M=IaL=02 3:'!*YYq!_  (***9	~rz   c           
        ^ [        U5      n[        U5      TR                  S5      :X  d   eU R                  R                  S   U R
                  R                  S   pT0 n[        U5       H&  u  pxUR                  U/ 5      R                  U5        M(     S[        R                  S[        S[        S[        R                  4U4S jjn	UR                  5        GH  u  pUu  pU R                  (       a  [        X5      =pOXpU	" U R                  X5      SS2SU24   nU	" U R
                  X5      SS2SU24   nUR                  S	5      UR                  S5      -   nUR!                  SS	5      n[#        TR                  S   UR                  S   5      nTSS2SU24   R%                  S[        R&                  " U
TR(                  S
9USS2SU24   R+                  [        U
5      SS5      5        GM     g)a3  Apply factorized position embeddings to NaFlex batch in-place.

Uses separate Y and X position embedding tables that are interpolated
and combined for each sample's grid size.

Args:
    x: Input tensor to add position embeddings to [B, N, C]
    patch_coord: Patch coordinates [B, N, 2] with (y, x) values
r   r   table
new_lengthorig_lengthr}   c                    > X:X  a  U R                  TR                  S9$ [        R                  " U R	                  SSS5      R                  5       USSS9R	                  SSS5      R                  TR                  S9$ )zt
Resample a 1-D positional-embedding table to specified length
and return it in (1, L, C) layout, dtype matching x.
r*  r   r   r   r`   Fr'  r(  r)  r-  r   r   r+  r   rv   ra  rb  rc  r   s      r{   	_interp1dBNaFlexEmbeds._apply_factorized_naflex_pos_embed.<locals>._interp1d  sr    
 (xxaggx..==aA&,,.#	
 gaArrr01rz   Nr   r5  r%  )r   r=  r'  r   r   r  r6  r7  r8  r   Tensorrt   r   rK   r  rN  r   r9  r:  r;  r   r<  )r   r   r"  r>  r0  r1  r   r   r   rh  r   target_htarget_wlen_ylen_xpe_ype_xposr   s    `                 r{   "_apply_factorized_naflex_pos_embed/NaFlexEmbeds._apply_factorized_naflex_pos_embed  s     8D$%222 ))//2D4D4D4J4J14M =?01EB&&q"-44R8 2	1U\\ 	1s 	1 	1QVQ]Q] 	1 !0 5 5 7A!"H++ #H 77'uT--u=a(lKDT--u=a(lKD ..#dnnQ&77C++a#C!''!*ciil3Ga'kN%%ahh?AxxK ''M(:BC !8rz   c                   ^^^ TR                   nTR                  u  mnmUR                  SS9S-   nU R                  (       a(  UR                  SS9nUR                  5       nU=pXv-  =pO)UR                  S5      u  pXSS2S4   -  n
XSS2S4   -  nS[        R
                  S[        R
                  S[        R
                  S[        R
                  4UUU4S	 jjnU" U R                  XS
9nU" U R                  XS
9n[        R                  " TU[        R                  S9R                  S5      nTXSS2SUS   4   XSS2SUS   4   -   -  mg)au  Apply factorized position embeddings to NaFlex batch using grid_sample.

Uses F.grid_sample for efficient interpolation of separate Y and X position
embedding tables based on patch coordinates. Based on proposal by https://github.com/stas-sl

Args:
    x: Input tensor to add position embeddings to [B, N, C]
    patch_coord: Patch coordinates [B, N, 2] with (y, x) values
r   r   r   Nra  scale
out_lengthr}   c                   > U R                  SSS5      R                  S5      R                  TSSS5      R                  5       n[        R
                  " TSSTR                  S9nXS S 2SS4'   US-
  US S 2SS4'   SUS S 2SS4'   [        R                  " UTTSU4SS9n[        R                  " X5S	SS
S9nUR                  TR                  5      $ )Nr   r   r   r%  r   r5  FrB  bilinearrC  rD  )r   rN  r<  rv   r   r   r   r   rJ  rK  r-  r   )	ra  ru  rv  perW  rX  r   r   r   s	         r{   rh  NNaFlexEmbeds._apply_factorized_naflex_pos_embed_grid_sample.<locals>._interp1d  s    q!Q'11!4;;Ar2rJPPRBKK1a9E"!Q'N"QYE!Q'NE!Q'N==Aq*(=USDrj\deB55>!rz   )ru  rv  r   rG  rF  )r   r   r   rK   r   rj  r  r   rL  rM  rN  )r   r   r"  r   _rP  rQ  rR  rS  rT  rU  rV  rh  rp  ro  r   r   r   s    `              @@r{   ._apply_factorized_naflex_pos_embed_grid_sample;NaFlexEmbeds._apply_factorized_naflex_pos_embed_grid_sample  sP    ''1a!!a!(1,''++!+$CxxzH(00K (.Gg (.{{1~$K!1a4L0G!1a4L0G	"U\\ 	"%,, 	"ELL 	"]b]i]i 	" 	" ))Q))Q\\!F%**=GGJ	TaK//04Aq+fBU8U3VVVrz   c                   ^ U R                   R                  S   U R                  R                  S   pCUu  pVU R                  (       a  [	        XV5      =pxOXVpS[
        R                  S[        S[        S[
        R                  4U4S jjn	U	" U R                   Xs5      SS2SU24   n
U	" U R                  X5      SS2SU24   nU
R                  S5      UR                  S5      -   nUR                  SS5      nTR                  U5        g)	a-  Apply factorized position embeddings to standard 2D batch in-place.

Uses separate Y and X position embedding tables that are interpolated
and combined for the specified grid size.

Args:
    x: Input tensor to add position embeddings to [B, H*W, C]
    grid_size: Target grid size as [height, width]
r   ra  rb  rc  r}   c                    > X:X  a  U R                  TR                  S9$ [        R                  " U R	                  SSS5      R                  5       USSS9R	                  SSS5      R                  TR                  S9$ )Nr*  r   r   r   r`   Fre  rf  rg  s      r{   rh  ;NaFlexEmbeds._apply_factorized_pos_embed.<locals>._interp1d6  sp    (xxaggx..==aA&,,.#	
 gaArrr01rz   Nr   )r   r   r  rK   r  r   rj  rt   rN  r   r\  )r   r   r   r0  r1  rk  rl  rm  rn  rh  ro  rp  rG   r.  s    `            r{   _apply_factorized_pos_embed(NaFlexEmbeds._apply_factorized_pos_embed   s     ))//2D4D4D4J4J14M&''33EE#5	1U\\ 	1s 	1 	1QVQ]Q] 	1 ))59!YhY,G))59!YhY,G NN1%q(99	"**1a0	~rz   patch_validc           
         SnUR                   S   nU R                  (       Ga:  Uc:  [        UR                  S:H  S5        [	        XR
                  U R                  S9u  pO/[        UR                  S:H  =(       d    UR                  S:H  S5        U R                  (       a{  UR                  S:X  ak  [        U R                  SL S	5        U R                  UU R                  R                  U R                  R                  [        UR                   S
S 5      SS9nGOLUR                  S
5      nU R                  b  U R                  U5      nU R                  U5      nGO
[        UR                  S:H  S5        U R                  (       a  UR                   SS u  pgU R
                  S   X`R
                  S   -  -
  U R
                  S   -  nU R
                  S   XpR
                  S   -  -
  U R
                  S   -  n	[        R                   " USU	SU45      nU R                  U5      nUR                   SS nU R                  (       a!  UR                  S
5      R#                  SS
5      nU R%                  U5      nU R&                  S:X  aD  Ub  U R)                  XS9  OU R*                  (       a  U R-                  XS9  OcU R/                  XS9  OSU R&                  S:X  aC  Ub  U R1                  XS9  O0U R*                  (       a  U R3                  XS9  OU R5                  XS9  / n
U R6                  b,  U
R9                  U R6                  R;                  USS5      5        U R<                  b,  U
R9                  U R<                  R;                  USS5      5        U
(       a  [>        R@                  " X/-   SS9nU RC                  U5      nX4$ )aw  Forward pass for patch embedding with position encoding.

Args:
    x: Input tensor. Supported formats:
        - [B, C, H, W] for conv mode
        - [B, N, P*P*C] for pre-patchified linear mode (normal)
        - [B, N, Ph, Pw, C] for pre-patchified linear mode (variable patch size)
    patch_coord: Optional patch coordinates [B, N, 2] for NaFlex mode.
    patch_valid: Optional validity mask for patches [B, N] for NaFlex mode.

Returns:
    Tuple of (embedded_tensor, grid_size) where:
        - embedded_tensor: [B, num_prefix_tokens + N, embed_dim]
        - grid_size: (H, W) tuple for standard mode, None for NaFlex mode
Nr   r   z-Expecting 2D image input with input ndim == 4)r   r   r   z/Expecting patchified input with ndim == 3 or 5.z,input norm not supported with patch resizingr   T)r0   r   zConvolutional input must be 4Dr   rF   )r   )r"  r   r%  r   )"r   r   r   ndimr   r0   rU   rm   r   r   r   weightr   r   r   r   r   r,  r   r  r^  rL   rY  r?  r  r|  rr  r   r8  r<  r   r   catr  )r   r   r"  r  r   r   r   r   r   r   to_cats              r{   forwardNaFlexEmbeds.forwardJ  s2   * 04	GGAJ>>>"!%TU-adFZFZ[9 !2qvv{4ef --!&&A+4/1_` ++II$$IINN$QWWQq\2" ,  IIaL??.*AIIaLAFFaK!AB##wwrs|+a//!2D.DDXYHZZ+a//!2D.DDXYHZZEE!a512		!AI||IIaL**1a0 IIaL)+$--a-E 11DDQD`888T  L0$000H 11GGGc;;A;W >>%MM$..//2r:;>>%MM$..//2r:;		&3,A.A MM!|rz   )r   r   rU   r2   rm   r   r   r   r   r   r   r   r   r   r0   r  rG   rK   rH   rJ   r  rL   r  r   r   r   )r/   r   r1   NTTr   FNrF   )   r  rI   FFNNNr;   FNNr}   NTNN)#ro   rp   rq   rr   rs   r   rt   r   r
   rx   rw   r   r   Modulerv   r   r  r   r   r  r  r   r   r   rj  r?  rY  r	   r^  rr  r|  r  r  ry   __classcell__r	  s   @r{   r   r   1  s   &T 79 '+" $$)FJ&=E)2,1.3:>FJ48#%.3-U c5c?23U  U  	U 
  }U  U  U  U  "U  'uS%S/-A'BCU  U  "*%S/!:U  $'U  &*U  (,U   'tBII7!U " #4$ryy/)B#BC#U $ !bii1%U & !'U ( (,)U . 
/U  U n
7	GS#X 	G#D #E#uS#X:N4O #X%S/ XeCHo X  6||6 6 
	6 6p ,H||,H ,H 
	,H ,H\!||! Cy! 
	!F ;||; ; 
	; ;z -W||-W -W 
	-W -W^(||( Cy( 
	(Z 3726	g||g "%,,/g "%,,/	g
 
u||XeCHo66	7g grz   r   r  r   	symmetricq_lenr   c                     U c  gU R                  5       n U R                  u  pVUnUS:  a9  U R                  XQ4[        R                   S9n[        R                  " X/SS9n Xq-  nU(       a5  U R                  S5      U R                  S5      -  n	U	R                  S5      n	O)U=(       d    UnU SS2SSSS24   R                  USX75      n	[        R                  " XS9n
U
R                  U	) [        R                  " U5      R                  5        U
$ )a  Creates an attention mask from patch validity information.

Supports two modes controlled by `symmetric`:
1. `symmetric=True` (default): Creates a symmetric mask of shape
   [B, 1, seq_len, seq_len]. An attention pair (i, j) is allowed only if
   both token i and token j are valid. Suitable for standard self-attention.
2. `symmetric=False`: Creates a potentially non-square mask of shape
   [B, 1, q_len, kv_len]. An attention pair (q, k) is allowed only if
   the key/value token k is valid. Query token validity is not checked
   in the mask itself. Useful for cross-attention or specific self-attention
   implementations `q_len` can be specified.

Used for NaFlex mode to handle variable token counts and padding tokens.

Args:
    patch_valid: Tensor of shape [B, N] with True for valid patches, False for padding.
    num_prefix_tokens: Number of prefix tokens (class token, register tokens)
        to prepend, which are always considered valid.
    symmetric: If True, create a symmetric mask.
        If False, create an expanded mask based only on key/value validity.
    q_len: Query sequence length override. Only used when `symmetric` is False.
        Defaults to the key/value sequence length (`kv_len`) if None.
    dtype: Dtype of the output attention mask (e.g., torch.float32).

Returns:
    Attention mask tensor. Additive mask (-inf for masked, 0 for unmasked).
    Shape is [B, 1, seq_len, seq_len] if symmetric=True,
    or [B, 1, q_len, kv_len] if symmetric=False.
Nr   r*  r   r   r%  )rw   r   new_onesr   r  rN  r<  
zeros_likemasked_fill_finfor9  )r  r   r  r  r   r   rO  kv_lenprefix_valid	mask_bool
mask_floats              r{   create_attention_maskr    s   J ""$KDAF 1"++Q,B%**+Uii ;C#))"-0E0Ea0HH	''*	 4q 0188AuM	 !!)9JYJE(:(>(>?rz   	pool_typereduce_include_prefixc                 p   Ub  US;  a  [        U UUUS9n U $ US:  aH  U(       a6  UR                  U R                  S   U5      n[        R                  " XQ/SS9nOU SS2US24   n UR                  U R                  5      nUS:X  aC  XR                  S5      -  R                  SS9nUR                  SS	S
9R                  SS9nXx-  n	U	$ US:X  a  XR                  S5      -  R                  SS9nUR                  SS	S
9R                  SS9nXx-  n
U R                  5       n[        R                  " UR                  5      R                  X) '   UR                  SS9nSX-   -  $ US:X  aL  U R                  5       n[        R                  " UR                  5      R                  X) '   UR                  SS9$  e)a$  Global pooling with NaFlex support for masked tokens.

Applies global pooling while respecting patch validity masks to exclude
padding tokens from pooling operations.

Args:
    x: Input tensor with shape [B, N, C]
    patch_valid: Optional validity mask for patches [B, N-num_prefix_tokens]
    pool_type: Type of pooling ('token', 'avg', 'avgmax', 'max')
    num_prefix_tokens: Number of prefix tokens (class/register)
    reduce_include_prefix: Whether to include prefix tokens in pooling reduction

Returns:
    Pooled tensor with shape [B, C]
N)avgavgmaxr  r  r   r  r   r   r   r  r%  T)r   keepdim)r9  r  g      ?r  )r*   r  r   r   r  r-  r   rN  sumclampcloner  r9  r   )r   r  r  r   r  r  patch_valid_floatmasked_sumsvalid_countspooled
masked_avgmasked_x
masked_maxs                r{   global_pool_naflexr    s   . i/GG/"7	
  1  '//
<MNL))\$?QGK !&''(A#qww/E66r::??A?F(,,D,AGGAGN+	h	66r::??A?F(,,D,AGGAGN /
 779!&X^^!<!@!@]]q])
 j-..	e	779!&X^^!<!@!@}}}##urz   c                     ^  \ rS rSrSr      S/S\\   S\S\S\\\\	\\4   4      SS4
U 4S	 jjjr
S0S
 jrS1S\S\SS4S jjr\R                   R#                  5       S2S\S\SS4S jj5       r\R                   R"                  S\4S j5       r\R                   R"                  S3S\S\4S jj5       r\R                   R"                  S4S\SS4S jj5       r\R                   R"                  S\R2                  4S j5       r\S\R8                  S\R8                  S\\R8                  \\R8                     \4   4S j5       rS5S\S\\   SS4S jjr S\\\R8                  4   4S jr!          S6S\\R8                  \\\R8                  4   4   S\\\\\   4      S\S\S \S!\S"\S#\S\\R8                     S$\\R8                     S%\\R8                     S\\\R8                     \	\R8                  \\R8                     4   \\\4   4   4S& jjr"   S7S'\R8                  S\\R8                     S$\\R8                     S%\\R8                     S\\R8                  \\\R8                  4   4   4
S( jjr#  S8S\R8                  S)\\   S$\\R8                     S\R8                  4S* jjr$  S9S'\R8                  S+\S$\\R8                     S\R8                  4S, jjr%   S7S\\R8                  \\\R8                  4   4   S\\R8                     S$\\R8                     S%\\R8                     S\R8                  4
S- jjr&S.r'U =r($ ):r,   iB  aV  NaFlexVit: Vision Transformer with NaFlex support for flexible input handling.

A flexible implementation of Vision Transformer that supports:
- Standard image classification with various pooling strategies
- NaFlex functionality for variable aspect ratios and resolutions
- Linear patch embedding for pre-patchified inputs
- Multiple position embedding strategies (learned, factorized, rope)
- Comprehensive attention masking for efficient batch processing
- Encapsulated embedding and position encoding in FlexEmbeds module
- Compatible with standard ViT checkpoints through checkpoint filtering
Nr|   r   num_classesr  r}   c                 0  > [         TU ]  5         XVS.nU=(       d
    [        5       nU(       a  [        U40 UD6nUR                  S;   d   eUR
                  (       d  UR                  S:w  d   eUR                  S;   d   e[        UR                  5      =(       d    [        n	[        UR                  5      n
[        UR                  5      =(       d    [        R                  n[        U5      nUR                   =(       d    ["        nX0l        X l        UR                  U l        UR(                  =U l        =U l        U l        UR
                  (       a  SOSU l        U =R.                  UR0                  -  sl        UR0                  U l        UR
                  U l        UR6                  U l        SU l        [;        S+0 SUR<                  _S	U_S
UR(                  _SUR>                  _SUR@                  (       + _SUR
                  _SUR0                  _SU_SURB                  _SUR                  _SURD                  _SURF                  _SURH                  _SURJ                  _SU
_SURL                  _S[O        USS5      _UD6U l(        UR@                  (       a  U	" UR(                  40 UD6O[        RR                  " 5       U l*        SU l+        SU l,        URZ                  (       a  URZ                  S:w  a  SSK.J/nJ0n  URZ                  S:X  aN  U" UR(                  4URb                  URd                  URf                  SURh                  S.UD6U l+        SU l,        OURZ                  S:X  a\  U" UR(                  URd                  -  4URf                  SSURj                  URl                  URh                  S .UD6U l+        SU l,        O[o        S!URZ                   35      eURp                  S:  a$  [s        URp                  U R.                  S"9U l:        OSU l:        [w        URx                  URb                  5      n[        Rz                  " [}        URb                  5       Vs/ s Hw  nU" S+UR(                  URd                  UR~                  UR                  UR                  UR                  UR                  UR                  UR                  UU   U	UUUS#.UD6PMy     sn6 U lF        U RP                  R                  SS$9n[}        URb                  5       Vs/ s H  n[        S%U 3UR(                  US&9PM     snU lI        UR                  (       a$  UR                  (       d  U	" UR(                  40 UD6O[        RR                  " 5       U lL        UR                  S':X  a[  [        U R(                  4UR                  =(       d    URd                  UR                  =(       d    UR~                  U	US(.UD6U lP        OSU lP        UR                  nUc  UR                  S):H  nUR                  (       a  U(       a  U	" UR(                  40 UD6O[        RR                  " 5       U lK        [        R                  " UR                  5      U lS        US:  a"  [        R                  " U R(                  U40 UD6O[        RR                  " 5       U lU        UR                  U lW        UR                  U lX        U R                  UR                  SS*9  gs  snf s  snf ),aX  Initialize NaFlexVit model.

Args:
    cfg: Model configuration. If None, uses default NaFlexVitCfg.
    in_chans: Number of input image channels.
    num_classes: Number of classification classes.
    img_size: Input image size (for backwards compatibility with classic vit).
    **kwargs: Additional config parameters to override cfg values.
r   rM   r  r  r  tokenrY   r  )rM   r   rF   r   r   r   Fr0   r   r2   r   r:   rD   rE   r   rU   rG   rH   rJ   rK   rL   r   r@   rm   Nr   )RotaryEmbeddingCatRotaryEmbeddingMixedmixed)r4   r5   temperature
feat_shapegrid_indexingTaxial)r  	in_pixelsr  ref_feat_shapegrid_offsetr  zUnknown rope_type: )r   )r   r5   r6   r8   r9   r:   r>   	proj_drop	attn_drop	drop_pathrd   re   rg   r4   )r  zblocks.)moduler  r  rY   )r5   r6   rd   re   r  needs_resetrn   )Zr   r   r+   r   rZ   rD   rG   r   rd   r   rc   r   re   r   GELUr   rg   r   r  r   r2   num_featureshead_hidden_sizer   rE   r   r   r[   grad_checkpointingr   r0   ra   rV   rU   rH   rJ   rK   rL   r@   getattrembedsr   norm_prer   rope_is_mixedrN   timm.layers.pos_embed_sincosr  r  r4   r5   rP   rT   rQ   rR   r   rA   r   
patch_dropr   rC   
Sequentialranger6   r8   r9   r:   r>   rB   r<   blocksr  r  r  rW   rX   r   r   r\   r]   	attn_poolr  r?   	head_dropr   headr^   weight_init_moder_   init_weights)r   r|   r   r  r  r   r   r   r  rd   rc   re   rf   rg   r  r  dpripatch_reductionrX   r	  s                       r{   r   NaFlexVit.__init__O  s!   & 	/ #\^!#00C "NNNN#//W"<<<}} EEEE $CNN3@y
)#*>*>?!#--0;BGG	$MM(S	 ' ??EH]]RRD1DN&)oo1#..0!nn"#&#:#: "' # 
~~

 mm
 ))	

 ,,&
 
 ~~
 &
  //
 mm
 !$ 7 7
 #&";";
 %($?$?
 '*&C&C
 -
  ++!
" '.c3NPU&V%
( <?<<
3==7B7R[[] *.	"==S]]f4]}}'0MM))!mm # 4 4#"%"8"8 	 &*"').MMS]]2	 # 4 4###&#:#: # 4 4"%"8"8	 		 &+" #6s}}o!FGG "5##"&"8"8DO
 #DO ((:(:CIIFmm$ 399%%&
$ &#  MM------OO,,,,a&%## " &%&
 , ++0040@ 399%
% '!s}}X%

 8;~~ckkJs}}33_a_j_j_l	 ??e#011BS]]11BS]]%# DN "DN ++?oo.G:=..Wz#--626Z\ZeZeZgCMM2DORSOBIIdnnk@R@Y[YdYdYf	 ##//u=m&
.
s   %A>\"\c                 .   S[         R                  S[        SS4S jn[        U R                  5       H  u  p#[        US5      (       a*  U" UR                  R                  R                  US-   5        [        US5      (       a*  U" UR                  R                  R                  US-   5        [        US	5      (       a   U" UR                  R                  US-   5        [        US
5      (       d  M  U" UR                  R                  US-   5        M     g)z8Apply initialization weight fix with layer-wise scaling.param	_layer_idr}   Nc                     [         R                  " 5          U R                  [        R                  " SU-  5      5        S S S 5        g ! , (       d  f       g = f)Ng       @)r   no_graddiv_r  sqrt)r  r  s     r{   rescale*NaFlexVit.fix_init_weight.<locals>.rescale  s.    

499S9_56 !s   )A
Aattnr   mlpattn_out_projmlp_out_proj)r   rj  rt   r6  r  hasattrr  r   r  r  fc2r  r  )r   r  layer_idlayers       r{   fix_init_weightNaFlexVit.fix_init_weight  s    	75<< 	7C 	7D 	7  )5OHuf%%

..1=ue$$		,,hl;uo..++22HqLAun--**118a<@  6rz   r(  r  c                     U=(       d    U R                   nUS;   d   eSU;   a!  [        R                  " U R                  5      * OSn[	        [        XUS9U 5        U R                  (       a  U R                  5         gg)a>  Initialize model weights according to specified scheme.

Args:
    mode: Initialization mode ('jax', 'jax_nlhb', 'moco', or '')
    needs_reset: If True, call reset_parameters() on modules (default for after to_empty()).
        If False, skip reset_parameters() (for __init__ where modules already self-initialized).
)jaxjax_nlhbmocorM   nlhbr;   r  N)r  r  logr  r%   get_init_weights_vitr_   r  )r   r(  r  	head_biass       r{   r  NaFlexVit.init_weights  sm     ,t,,666639T>TXXd..//r	(kRTXY==  " rz   checkpoint_pathprefixc                 4   ^ SSK Jm  SU4S jjnU" XU5        g )Nr   )_load_weightsc                 >  > [         R                  " USS9n[        U[        5      (       a  SU;   a  US   n[	        UR                  5       5       H  nUR                  S5      (       a  UR                  U5      USU-   '   M2  UR                  S5      (       a  UR                  U5      USU-   '   Ma  UR                  S5      (       a  UR                  U5      USU-   '   M  UR                  S5      (       d  M  UR                  U5      USUS	S
 -   '   M     T" XU5      $ )z8Adapter function to handle the different model structurecpu)map_location
state_dictr   embeds.r   rG   patch_embedr3   N)r   load
isinstancer  listr   
startswithpop)modelr  r  r  r   _orig_load_weightss        r{   _load_weights_adapter8NaFlexVit.load_pretrained.<locals>._load_weights_adapter$  s    O%HJ*d++
0J'5
 *//+,<<,,0:q0AJy1}-\\+..0:q0AJy1}-\\+..0:q0AJy1}-\\-005?^^A5FJy1RS612 - &e@@rz   rM   )vision_transformerr  )r   r  r  r	  r  s       @r{   load_pretrainedNaFlexVit.load_pretrained  s     	L	A& 	dV<rz   c                     1 SknU R                   (       aD  [        U R                   S5      (       a)  UR                  U R                   R                  5       5        U$ )zGet set of parameter names that should not have weight decay applied.

Returns:
    Set of parameter names to skip during weight decay
>   embeds.cls_tokenembeds.pos_embedembeds.reg_tokenno_weight_decay)r   r  updater  )r   	skip_lists     r{   r  NaFlexVit.no_weight_decay9  sC     Q	99,=>>TYY6689rz   coarsec                     [        SSS/S9$ )zGet parameter group matcher for optimizer parameter grouping.

Args:
    coarse: Whether to use coarse-grained grouping

Returns:
    Dictionary mapping group names to regex patterns
z^embeds)z^blocks\.(\d+)N)z^norm)i )stemr  )r  )r   r  s     r{   group_matcherNaFlexVit.group_matcherE  s     -/CD
 	
rz   enablec                     Xl         [        U R                  S5      (       aL  [        U R                  R                  S5      (       a&  U R                  R                  R	                  U5        ggg)z|Enable or disable gradient checkpointing for memory efficiency.

Args:
    enable: Whether to enable gradient checkpointing
r  set_grad_checkpointingN)r  r  r  r  r  )r   r  s     r{   r   NaFlexVit.set_grad_checkpointingT  sS     #)4;;..74;;;R;RTl3m3mKK##::6B 4n.rz   c                     U R                   $ )zMGet the classification head module.

Returns:
    Classification head module
)r  r   s    r{   get_classifierNaFlexVit.get_classifier_  s     yyrz   r   r"  c           	         [        U5      n0 n/ n[        U5       H2  u  pgXt;  a  / XG'   UR                  U5        XG   R                  U5        M4     UR                  u  pn
XR                  -
  nU R
                  (       a/  [        U R                  UUUUUR                  UR                  5      $ [        R                  " XU R                  R                  S-  UR                  UR                  S9n[        U R                  S5      (       a^  U R                  R                  U5      n[        X]UR!                  5       5       H$  u  p~nUu  nnUU-  nU H  nUSU XSU24'   M     M&     OGUR#                  5        H3  u  pvU R                  R%                  US9nUu  nnUU-  nUSU XSU24'   M5     UR'                  S5      nU$ )a  Generate ROPE position embeddings for NaFlex batch with variable grid sizes.

Args:
    x: Input tensor [B, N, C]
    patch_coord: Patch coordinates [B, N, 2] with (y, x) values

Returns:
    ROPE embeddings:
    - Axial mode: Tensor of shape [B, 1, N, dim*2]
    - Mixed mode: List of tensors, each of shape [B, num_heads, N, dim], one per depth layer
    - Mixed mode with iterator: Iterator yielding tensors per depth
r   r   get_batch_embedsNr   r   )r   r6  r8  r   r   r  r   r   r   r   r   r   r   r  r$  r   rH  r   r   rN  )r   r   r"  r>  r   r   r   r   r   rO  r   r   rope_embedsunique_embedsr   r   r   r   r   r   s                       r{   _generate_rope_naflexNaFlexVit._generate_rope_naflexh  s   & 8D &'89MB/-/*##I.&--b1	 : ''a,,,%		  kk!diimma.?qwwWXW_W_`499011 II66|DM36|TcTjTjTl3m/	- 1U
'B38*3EKKZK0 ( 4n "1!6!6!8	!YY00y0A
 1U
/9+:/FO,	 "9 "++A.rz   rZ   c                 (   Xl         UbB  US;   d   eUS:X  a  U R                  c   S5       eUS:w  a  U R                  b  SU l        X l        US:  a'  [        R                  " U R
                  U5      U l        g[        R                  " 5       U l        g)zReset the classification head with new number of classes and pooling.

Args:
    num_classes: Number of classes for new classification head
    global_pool: Optional new global pooling type
Nr  rY   z=Cannot currently add attention pooling in reset_classifier().r   )r  r  rZ   r   r   r2   r   r  )r   r  rZ   s      r{   reset_classifierNaFlexVit.reset_classifier  s     '""NNNNe#(>]]]u%$..*D!%*>IAoBIIdnnk:	SUS^S^S`	rz   c                 2   USLnU R                  UUUS9u  pSnU R                  b9  Ub  U R                  X5      nO$Ub  U R                  R                  US9nO S5       eSnU R                  (       ak  U R
                  b^  U R                  U5      u  pUb  UR                  SU5      nUb3  U R                  (       d"  [        XXS9nU(       d  UR                  S5      nUc  [        UU R                  UR                  S9nU R                  U5      nUUUUUS.$ )	zQForward pass through patch / abs pos / rope pos embeds and patch dropout
        N)r"  r  r   z4Expected one of patch_coord or grid_size to be validr   pos_embed_has_batch)r   r   )r   r  r%  	attn_maskkeep_indices)r  r   r'  r   trainingr  gatherr  r   rN  r  r   r   r  )	r   r   r"  r  r/  naflex_moder   r%  r0  s	            r{   _forward_embedsNaFlexVit._forward_embeds  s9    "- {{## # 
 99 &"88H&"ii11	1BTTTu 04==T__8"ooa0OA&)00LA&t/A/A 5Q\s""-"7"7":K -"&"8"8ggI MM!&&"(
 	
rz   indicesreturn_prefix_tokensr   
stop_early
output_fmtintermediates_onlyoutput_dictr  r/  c           
         US;   d   S5       eUS:H  n/ n[        [        U R                  5      U5      u  p[        U[        5      (       a  US   n	US   n
US   n S5       eUnUR
                  S	S
 u  nnU R                  R                  UU45      u  nnU R                  UU	U
US9nUS   nUR                  SS
5      nUR                  SS
5      nUR                  SS
5      n[        R                  R                  5       (       d  U(       d  U R                  nOU R                  S
US-    nU R                  =(       a#    [        R                  R                  5       (       + nU R                  (       a  Ub  [        [!        U R                  U5      5       H  u  nu  nnU R"                  (       a,  U R$                  b  Ub  ['        UUUUR                  SS5      S9nU(       a  [)        UUUUS9nOU" UUUS9nUU;   d  Mj  UR+                  U(       a  U R-                  U5      OU5        M     O~[        U5       Ho  u  nnUb  U(       a  [)        UUUUS9nO!U" UUUS9nOU(       a  [)        UXS9nOU" XS9nUU;   d  MF  UR+                  U(       a  U R-                  U5      OU5        Mq     U R.                  (       aK  U Vs/ s H  nUS
S
2SU R.                  24   PM     nnU Vs/ s H  nUS
S
2U R.                  S
24   PM     nnOS
nU(       aQ  U Vs/ s HD  nUR1                  UR
                  S   UUS5      R3                  SSSS5      R5                  5       PMF     nnU(       a5  0 nUUS'   Ub  U(       a  UUS'   U(       d  U R-                  U5      n U US'   U$ [        R                  R                  5       (       d  U(       a  Ub  [7        [!        UU5      5      nU(       a  U$ U R-                  U5      nX4$ s  snf s  snf s  snf )a  Forward features that returns intermediates.

Args:
    x: Input image tensor
    indices: Take last n blocks if int, all if None, select matching indices if sequence
    return_prefix_tokens: Return both prefix and spatial intermediate tokens
    norm: Apply norm layer to all intermediates
    stop_early: Stop iterating over blocks when last desired intermediate hit
    output_fmt: Shape of intermediate feature outputs
    intermediates_only: Only return intermediate features
    output_dict: Return outputs as a dictionary with 'image_features' and 'image_intermediates' keys
    patch_coord: Optional patch coordinates [B, N, 2] for NaFlex mode
    patch_valid: Optional patch type indicators (1=patch, 0=padding) for NaFlex
    attn_mask: Optional attention mask for masked attention
Returns:
    A tuple with (final_features, intermediates), a list of intermediate features, or a dictionary containing
    'image_features' and 'image_intermediates' (and optionally 'image_intermediates_prefix')
)NCHWNLCz)Output format must be one of NCHW or NLC.r=  r"  r  r   FzWIP, patch mode needs more workr  Nr"  r  r/  r%  r0  r/  r   r3  r-  r   r/  r/  r   r%  r   r   image_intermediatesimage_intermediates_prefiximage_features)r!   r=  r  r  r   r   r  r   r4  getr   jitis_scriptingr  r  r6  r   r1  r  r   r$   r8  r   r   r   r   
contiguousr  )!r   r   r6  r7  r   r8  r9  r:  r;  r"  r  r/  r   intermediatestake_indices	max_indexr   heightwidthr   r   r  r%  r0  r  do_checkpointingr  blkr   yprefix_tokensresult_dictx_finals!                                    r{   forward_intermediatesNaFlexVit.forward_intermediates  s   F _,Y.YY,&"6s4;;7G"QaM*KM*K	lG;;;5GGGBCLMFE;;00&%ADAq %%##	 & 
 9jj5zz.$7JJ{D1	 99!!##:[[F[[)a-0F22S599;Q;Q;S7S+"9(1#dkk;2O(P$$C==T__%@\E]!7"$,2JJ}e,L	"J $"3
iPAAJ)DA$!((11E! )Q$ $F+3*'&sAK9UyI'&sAC7$!((11E ," !!ERS]Qq!D$:$:"::;]MSDQRMqQq$"8"8"99:MMRM M '&A 		!''!*aB/771aCNNP&   K1>K-.(-A<I89 &))A,07,- yy%%'',@]E^ ]M!BCM  IIaLO TRs   ?O;$P APr   c                 6   USLnU R                  UUUUS9nUS   nUR                  SS5      nUR                  SS5      n	UR                  SS5      nU R                  =(       a#    [        R                  R                  5       (       + n
U R                  (       av  Ubs  [        [        U R                  U5      5       HO  u  nu  pU R                  (       a  U R                  b  U	b  [        UUU	US9nU
(       a  [        XXS9nMH  U" X}US9nMQ     O]Ub.  U R                   H  nU
(       a  [        XXS9nM  U" XxUS9nM     O,U R                   H  nU
(       a  [        XUS	9nM  U" XtS	9nM     U R                  U5      nU(       a  UUR                  S
S5      S.$ U$ )z	
        Nr?  r   r%  r0  r/  r-  r@  rA  r  )r   r  )r4  rE  r  r   rF  rG  r  r6  r   r  r1  r  r   r$   r   )r   r   r"  r  r/  r3  r  r   r%  r0  rN  r  rO  r   s                 r{   forward_featuresNaFlexVit.forward_features  s    "- %%##	 & 
 9jj5zz.$7JJ{D1	  22S599;Q;Q;S7S+"9(1#dkk;2O(P$$C==T__%@\E]!7"$,7	"J $"3
PAA)DA )Q ${{#"3QAA9EA	 # {{#"3Y?AA3A	 # IIaL%zz-> 
 rz   r  c                 Z   U R                   bk  [        UU R                  (       a  U R                  OSSSUR                  S9nU R                  (       d  US S 2U R                  S 24   nU R                  XS9nU$ Uc  U R
                  OUn[        UUUU R                  U R                  S9nU$ )Nr   Fr   )r   r  r  r   rA  r  )r  r  r[   r   r   rZ   r  )r   r   r  r  r/  s        r{   _poolNaFlexVit._pool  s     >>%-<@<T<T$"8"8Z[ggI ++a//001q6AH(1(9D$$y	"44"&":":
 rz   
pre_logitsc                     U R                  XS9nU R                  U5      nU R                  U5      nU(       a  U$ U R                  U5      $ )N)r  )rZ  rX   r  r  )r   r   r\  r  r   s        r{   forward_headNaFlexVit.forward_head  sD     JJwJ8LLONN1q0DIIaL0rz   c                    [        U[        5      nU=(       d    USLnU(       a  U(       a<  US   nUR                  SU5      nUR                  SU5      nUR                  SU5      nOUn[        USLS5        [        USLS5        U R	                  UUUUS9nU R
                  " S	0 UD6nU$ U R	                  U5      nU R                  U5      nU$ )
a0  Forward pass with optional NaFlex support.

Args:
    x: Input tensor. Supported formats:
        - [B, C, H, W] standard image input
        - [B, N, P*P*C] pre-patchified tensor (flattened patches)
        - [B, N, Ph, Pw, C] pre-patchified tensor (variable patch size)
        - Dict from NaFlex collator
    patch_coord: Optional patch coordinates [B, N, 2] for NaFlex mode.
    patch_valid: Optional patch validity indicators for NaFlex.
    attn_mask: Optional attn mask to override defaults generated from patch_valid

Returns:
    Model output tensor.
Nr   r  r"  r/  z&patch_coord is required in naflex modez&patch_valid is required in naflex mode)r   r  r"  r/  rn   )r  r   rE  r   rW  r^  )	r   r   r"  r  r/  input_is_dictr3  r   featuress	            r{   r  NaFlexVit.forward  s    , #1d+#>{$'>I,eeM;?eeM;?EE+y9	Kt+-UVKt+-UV,,''#	 - H !!-H-A  %%a(A!!!$Arz   )r  r  r2   r  rX   r  r_   rZ   r  r   r  r  r  r   r   r  r  r  r   r   r  r[   r   r  r  )Nr     NNNr  )rM   Tr  Fr  )N)
NFFFr=  FFNNN)NNNr  )FN))ro   rp   rq   rr   rs   r
   r+   rt   r   r   r   r  rx   rw   r  r   rF  ignorer  r   r  r   r  r  r   r  r!  r   rj  r	   r   r'  r*  r4  rT  rW  rZ  r^  r  ry   r  r  s   @r{   r,   r,   B  s   
 +/#>Bl>,'l> l> 	l>
 uS%S/%9:;l> 
l> l>\A"# # # #" YY=s =C = = =2 YY	 	 	 YY
D 
T 
 
 YYCT CT C C YY		   B||B B 
u||T%,,/4	5	B BHaC ahsm aW[ a"<
 
c5<<	 <
B 8<).$$', %262604S U\\4U\\(9#::;S  eCcN34S  #'	S 
 S  S  S  !%S  S  "%,,/S  "%,,/S   -S  
tELL!5tELL7I)I#JDQTVYQYNZ	[S p 372604>\\> "%,,/> "%,,/	>
  -> 
u||T#u||"344	5>F (,26	||  } "%,,/	
 
B  %26		1\\	1 	1 "%,,/		1
 
	1 3726040U\\4U\\(9#::;0 "%,,/0 "%,,/	0
  -0 
0 0rz   c                    U S   nU S   nU S   n[        [        U5      5       H  nX4   X$      nXS S 2S4   R                  5       S-   R                  5       nXS S 2S4   R                  5       S-   R                  5       nUR	                  XgSSS5      R                  SSS	SS5      nUR	                  SUS-  US-  5      nSS
KJn  U" USU S3SS9  M     g )Nr"  r  r   r   r   r/   r   r   r   )
save_imagepatch_z.jpgT)	normalize)r  r=  r  r   r   r   torchvision.utilsrh  )	r   r"  r  r   r  patchr   r   rh  s	            r{   _debug_dump_patchesrm  -  s    M"KM"K	lG3w< 
;>*Aq!%%'!+113Aq!%%'!+113aBA.66q!Q1Ea2qt,05F1#T*d; !rz   r(  r  r  c                 j    SSK JnJnJnJn  SU ;   a
  [        X1US9$ SU ;   a	  [        XBS9$ [        XRS9$ )zFFunction imported from vision_transformer.py to maintain compatibilityr   )init_weights_vit_jaxinit_weights_vit_mocoinit_weights_vit_timminit_weights_reset_parametersr  )r  r  r  r  )r  ro  rp  rq  rr  r   )r(  r  r  ro  rp  rq  rr  s          r{   r  r  <  sB      }+kZZ	4,FF,FFrz   r  r  c                 2   0 nU R                  5        GH  u  p4US:X  Ga  [        UR                  S5      (       Ga  UR                  S:X  Ga  SnSnSU ;   a  U S   R                  S   nSU ;   a  U S   R                  S   nXV-   nUR                  S   nX-
  n	[
        R                  " U	5      n
[
        R                  " U5      nX:w  a  U
R                  5       (       a}  UR                  5       (       dh  U	nUSS2SU24   nUR                  5       (       a  U S==   U-  ss'   USS2XV24   nUR                  5       (       a  U S==   U-  ss'   USS2US24   nU
n[        U5      nX-  U:X  a!  UR                  SXUR                  S   5      nO[        UR                  R                  S	5      (       ai  UR                  R                  R                  u  pX-  U:X  a!  UR                  SXUR                  S   5      nO[        R                  S
U SX-   S35        XBS'   GM  US:X  a  XBS'   GM  US:X  a  XBS'   GM+  UR                  S5      (       a:  USS nUS:X  a#  UR!                  SSSS5      R#                  S5      nSU-   nXBU'   GM{  XBU'   GM     U$ )zZHandle state dict conversion from original ViT to the new version with combined embedding.rG   r   r   r   r   r   Nr   r   z-Position embedding size mismatch: checkpoint=z, model=z?. Using default initialization and will resize in forward pass.r  r  r  zpatch_embed.r3   zproj.weightr   )r   r  r  r  r   r  r  
is_integernumelrt   r   r  r   _loggerwarningr  r   r   )r  r  out_dictr   r   num_cls_tokennum_reg_tokenr   num_patchesnum_patches_no_prefixgrid_size_no_prefixr   cls_token_embreg_token_embr   r   suffixnew_keys                     r{   checkpoint_filter_fnr  M  s    H  "u||[11affk ! !*,$.{$;$A$A!$DM*,$.{$;$A$A!$DM$1$A!  ggaj(3(G%&*ii0E&F# IIk2	'40;;==iFZFZF\F\ #8K$%a=&8$9M$**,,";/=@/$%a)D&D$EM$**,,";/=@/!.//0A 3I	N	 (K7		!Y1771:FA u||77EE$||77AA5K/ !		!Q1771: >A $OO"OP[}\dfgfkdm n` !a ,-'(++,'(++,'(\\.))rsVF&IIaAq)11!4&(G !WQKC #F Orz   urlc                 2    U SSS SS[         [        SSSS.UE$ )	Nrd  )r     r  g      ?rI   zembeds.projr  z
apache-2.0)r  r  
input_size	pool_sizecrop_pctr   meanr  
first_conv
classifierlicenser   )r  r   s     r{   _cfgr    s7    #"'%#  rz   ztimm/)	hf_hub_id)r  r  )z)naflexvit_base_patch16_gap.e300_s576_in1kz-naflexvit_base_patch16_par_gap.e300_s576_in1kz0naflexvit_base_patch16_parfac_gap.e300_s576_in1kz$naflexvit_base_patch16_map.untrainedz,naflexvit_so150m2_patch16_reg1_gap.untrainedz,naflexvit_so150m2_patch16_reg1_map.untrainedz&naflexvit_base_patch16_siglip.v2_webliz(naflexvit_so400m_patch16_siglip.v2_weblivariant
pretrainedc           
         UR                  SS5      nUR                  S[        5       5      n[        [        5       Vs1 s H  oUR                  iM     nn[	        U5       Vs0 s H  owU;   d  M
  XrR                  U5      _M     nnU(       a  [        U40 UD6n[        [        X4[        U[        USS9S.UD6n	U	$ s  snf s  snf )Nout_indicesr   r|   getter)r  feature_cls)pretrained_filter_fnr|   feature_cfg)
r  r+   r   namer  r   r    r,   r  r  )
r  r  r   r  r|   fcfg_field_namesr   cfg_updatesr  s
             r{   _create_naflexvitr    s    **]A.K
**ULN
+C'-l';<';!vv';O<-1&\R\/=Q#1jjm#\KRc1[1 71[hG	
 E L =Rs   B<"	C/Cc           	      6   UR                  SS5        UR                  SS5        UR                  SS5      nUR                  SS5      nUc  US:X  a  SnSUR                  S	S5      UUUR                  S
S5      UR                  SS5      S.UEn[        X40 UD6$ )a  Create FlexVit model from classic VisionTransformer configuration.

This function handles the parameter mapping and configuration logic needed
to create FlexVit models that are compatible with classic VisionTransformer
configurations and pretrained weights.

Args:
    variant: Model variant name
    pretrained: Whether to load pretrained weights
    **kwargs: Classic VisionTransformer parameters

Returns:
    FlexVit model instance
no_embed_classNdynamic_img_sizerZ   r  rX   r  TrD   r7   Fr   )rH   rD   rZ   rX   r7   r=   r  rE  r  )r  r  r   gprX   flex_kwargss         r{   _create_naflexvit_from_classicr    s    ( JJ&
JJ!4( 
M7	+BjjD)G2;  $zz-6 **%5u=!',=u!E K W@K@@rz   c           	         UR                  SS5        UR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS	5      nUR                  S
S5      nU(       a  U(       a  SOSnOSnUR                  SS5      n	UR                  SS5      n
UR                  SS5      nUR                  SS5      nUc  U	S:H  n0 SS_SUR                  SS5      _SUR                  SUR                  SS5      5      _SU	_SU
_SU_SU_SUR                  SS5      (       a  SOS_S U_SU_SU_S
U_S!UR                  S"S5      _S#UR                  S#S$5      _S%UR                  S%S5      _S&UR                  S&S5      _S'UR                  S(S5      _S)UR                  S*S5      0EUEn[        X40 UD6$ )+a~  Create NaFlexVit model from EVA configuration.

This function handles the parameter mapping and configuration logic needed
to create NaFlexVit models that are compatible with EVA configurations
and pretrained weights.

Args:
    variant: Model variant name
    pretrained: Whether to load pretrained weights
    **kwargs: EVA model parameters

Returns:
    NaFlexVit model instance
r  Nuse_rot_pos_embFrope_mixed_moderP   rO   rR   r;   rT   rS   r  r  r   rZ   r  use_pre_transformer_normuse_post_transformer_normTuse_fc_normrH   rD   rE   r   r   rV   rW   rX   rG   use_abs_pos_embrF   rN   rQ   r  rj   r   rk   rl   r7   r   r=   r   r  )r  r  r   r  r  rP   rR   rT   rN   r  r  r  r  naflex_kwargss                 r{   _create_naflexvit_from_evar    sH   ( JJ& jj!2E:Ojj!2E:Ozz"4f=zz"4b9$8$?.GG		 
M5	)B%zz*DeL &

+F M**]D1KEktvzz-6 	vzz"2FJJ|Q4OP 	r	
 	, 	/ 	; 	&**->"E"EY6 	Y 	, 	, 	0 	vzz*:DA 	VZZU3 	fjju5  	VZZT2!" 	&**[%8#$ 	 ,>!F%& 'M, WBMBBrz   c                 @    [        SSSSSSSSS9n[        S
XS	.UD6nU$ )zCViT-Base with NaFlex functionality and global average pooling.
    r/   r1   r3   h㈵>r  r   T)r0   r2   r4   r5   r>   rZ   rE   rX   r  r|   )naflexvit_base_patch16_gapr+   r  r  r   r|   r  s       r{   r  r  B  s@     	C eze^deELrz   c                 B    [        SSSSSSSSSS9	n[        S
XS	.UD6nU$ )z]ViT-Base with NaFlex functionality, aspect preserving pos embed, global average pooling.
    r/   r1   r3   r  Tr  r   )	r0   r2   r4   r5   r>   rK   rZ   rE   rX   r  )naflexvit_base_patch16_par_gapr  r  s       r{   r  r  T  sC      $
C i:ibhiELrz   c                 D    [        SSSSSSSSSSS	9
n[        SXS
.UD6nU$ )zjViT-Base with NaFlex functionality, aspect preserving & factorized pos embed, global average pooling.
    r/   r1   r3   r  Tr   r  r   )
r0   r2   r4   r5   r>   rK   rG   rZ   rE   rX   r  )!naflexvit_base_patch16_parfac_gapr  r  s       r{   r  r  g  sF      $C ljleklELrz   c           
      >    [        SSSSSSSS9n[        S	XS.UD6nU$ )
zBViT-Base with NaFlex functionality and MAP attention pooling.
    r/   r1   r3   r  rY   r   )r0   r2   r4   r5   r>   rZ   rE   r  )naflexvit_base_patch16_mapr  r  s       r{   r  r  {  s=     C eze^deELrz   c                 D    [        SSSSSSSSS	S
S9
n[        SXS.UD6nU$ )  ViT-SO150M2 with NaFlex functionality for variable aspect ratios and resolutions.

This model supports:
1. Variable aspect ratios and resolutions via patch coordinates
2. Position embedding interpolation for arbitrary grid sizes
3. Explicit patch coordinates and valid token masking
r/   @        NN@r  Fr   r  T)
r0   r2   r4   r5   r6   r>   r8   rE   rZ   rX   r  )"naflexvit_so150m2_patch16_reg1_gapr  r  s       r{   r  r    sF     C mzmflmELrz   c                 B    [        SSSSSSSSS	S
9	n[        SXS.UD6nU$ )r  r/   r  r  r  r  r  Fr   rY   )	r0   r2   r4   r5   r6   r>   r8   rE   rZ   r  )"naflexvit_so150m2_patch16_reg1_mapr  r  s       r{   r  r    sC     
C mzmflmELrz   c           	      <    [        SSSSSSS9n[        SXS.UD6nU$ )	zGViT-Base with NaFlex functionality and SigLIP-style configuration.
    r/   r1   r3   	gelu_tanhrY   )r0   r2   r4   r5   re   rZ   r  )naflexvit_base_patch16_siglipr  r  s       r{   r  r    s:     C h*haghELrz   c           
      >    [        SSSSSSSS9n[        S	XS.UD6nU$ )
zUViT-SO400M with NaFlex functionality for variable aspect ratios and resolutions.
    r/   i     gZӼ@r  rY   )r0   r2   r4   r5   r6   re   rZ   r  )naflexvit_so400m_patch16_siglipr  r  s       r{   r  r    s=     C jJjcijELrz   r  )Nr  r   F)r  r;   Tr  re  )ars   loggingr  dataclassesr   r   r   	functoolsr   typingr   r   r	   r
   r   r   r   r   r   r   torch.nnr   torch.nn.functional
functionalr   	timm.datar   r   timm.layersr   r   r   r   r   r   r   r   r   r   r   r   _builderr    	_featuresr!   _features_fxr"   r#   _manipulater$   r%   	_registryr&   r'   r   r(   r  r)   r*   __all__	getLoggerro   rv  r+   r   rj  rt   rw   r   r   r   r   r  r   rI  r   r  rx   r  r,   rm  rv   r  r  r  default_cfgsr  r  r  r  r  r  r  r  r  r  r  rn   rz   r{   <module>r     s  $   2 2  O O O     E    + + L 0 <  6;
' 

H
% T, T, T,n	 	L 	 <<#s(O  5<<sCx()	@J J= =@)l )x )X 	299 	 	D  "##"]]A\\AA A }	A
 {{A ellA AH  /3 !"&+E<<Eell+E E 	E
  $E \\E EPh		 hV<Gs Gu GQU Gai G"HT#s(^ HI H$sTWx. HVc T#s(^ " %152 6:6 9=9 -1F48F48F /3/ 151%& 0s  9 * !(A(A(A 	(AZ !@C@C@C 	@CF 4 i  " t )  $ $ Y  & 4 i    4 i  0 4 i  . d     9  rz   