
    RЦiq                     <   S r SSKrSSKJr  SSKJrJrJrJrJ	r	J
r
  SSKrSSKJr  SSKJs  Jr  SSKJrJrJrJr  SSKJrJrJrJrJrJrJrJrJrJ r J!r!J"r"J#r#J$r$J%r%  SSK&J'r'  SS	K(J)r)  SS
K*J+r+  SSK,J-r-  SSK.J/r/J0r0  SSK1J2r2J3r3  S/r4\Rj                  " \65      r7S\8S\8S\Rr                  S\Rr                  4S jr:\-" \:5        S\Rr                  S\Rr                  S\Rr                  S\\8\84   S\\8\84   S\Rr                  4S jr; " S S\Rx                  5      r= " S S\Rx                  5      r>S\Rr                  S\8S\\Rr                  \\8\84   4   4S jr? S8S\Rr                  S\8S \\8\84   S!\\\8\84      S\Rr                  4
S" jjr@ " S# S\Rx                  5      rAS$ rBS9S% jrC\2" \C" S&S'S(\\SS)S*S+9\C" S,S'S(\\SS)S*S+9\C" S-S'S(\\SS)S*S+9\C" \\S.S/S0S19S2.5      rDS:S3 jrE\3S:S\A4S4 jj5       rF\3S:S\A4S5 jj5       rG\3S:S\A4S6 jj5       rH\3S:S\A4S7 jj5       rIg);a*  Vision Transformer (ViT) in PyTorch

A PyTorch implement of Vision Transformers as described in:

'Exploring Plain Vision Transformer Backbones for Object Detection'
    - https://arxiv.org/abs/2203.16527

'Segment Anything Model (SAM)'
    - https://github.com/facebookresearch/segment-anything/

    N)partial)CallableListOptionalTupleTypeUnion)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDIMAGENET_INCEPTION_MEANIMAGENET_INCEPTION_STD)
PatchEmbedMlpDropPathcalculate_drop_path_ratesPatchDropoutLayerNorm2d
LayerScaleClassifierHeadNormMlpClassifierHeadFormatresample_abs_pos_embed_nhwcRotaryEmbeddingCatapply_rot_embed_cat	to_2tupleuse_fused_attn)Final   )build_model_with_cfg)feature_take_indices)register_notrace_function)
checkpointcheckpoint_seq)generate_default_cfgsregister_modelVisionTransformerSAMq_sizek_sizerel_posreturnc                 j   [        S[        X5      -  S-
  5      nUR                  S   U:w  ah  [        R                  " UR                  SUR                  S   S5      R                  SSS5      USS9nUR                  SU5      R                  SS5      nOUn[        R                  " U [        R                  S9SS2S4   [        X-  S	5      -  n[        R                  " U[        R                  S9SSS24   [        X-  S	5      -  nXV-
  US-
  [        X-  S	5      -  -   nXGR                  5          $ )
a8  
Get relative positional embeddings according to the relative positions of
    query and key sizes.
Args:
    q_size (int): size of query q.
    k_size (int): size of key k.
    rel_pos (Tensor): relative position embeddings (L, C).

Returns:
    Extracted positional embeddings according to relative positions.
   r   r   linear)sizemode)dtypeN      ?)intmaxshapeFinterpolatereshapepermutetorcharangefloat32long)r'   r(   r)   max_rel_distrel_pos_resizedq_coordsk_coordsrelative_coordss           a/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/models/vision_transformer_sam.pyget_rel_posrD   4   s    q3v..23L}}Q<'--OOAw}}Q/4<<Q1E

 *11"lCKKAqQ! ||F%--8DACY\D]]H||F%--8qACY\D]]H*vzSRU=V.VVO//122    q	rel_pos_h	rel_pos_wc                 X   Uu  pVUu  px[        XWU5      n	[        XhU5      n
U R                  u  pnU R                  XXm5      n[        R                  " SX5      n[        R                  " SX5      nUSS2SS2SS2SS2S4   USS2SS2SS2SSS24   -   nUR                  SXV-  Xx-  5      $ )aw  
Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
Args:
    q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
    rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
    rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
    q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
    k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

Returns:
    bias (Tensor): attention bias to add to attention map
zbhwc,hkc->bhwkzbhwc,wkc->bhwkNr-   )rD   r5   r8   r:   einsum)rF   rG   rH   r'   r(   q_hq_wk_hk_wRhRwB_dimr_qrel_hrel_w	attn_biass                     rC   get_decomposed_rel_pos_biasrX   W   s    ( HCHC	Sy	)B	Sy	)BIA#
))AC
%CLL)33ELL)33EaAq$&'%1aq0@*AAIRCI66rE   c                      ^  \ rS rSr% \\   \S'   SSSSS\R                  SSSSS4S\	S	\	S
\S\S\
S\
S\\R                     S\S\\\	\	4      S\\R                     4U 4S jjjrS rSrU =r$ )	Attentiony   
fused_attn   TF        NrS   	num_headsqkv_biasqk_norm	attn_drop	proj_drop
norm_layeruse_rel_pos
input_sizeropec                   > XS.n[         TU ]  5         X-  S:X  d   S5       eX l        X-  U l        U R                  S-  U l        [        5       U l        [        R                  " XS-  4SU0UD6U l	        U(       a  U" U R                  40 UD6O[        R                  " 5       U l        U(       a  U" U R                  40 UD6O[        R                  " 5       U l        [        R                  " U5      U l        [        R                  " X40 UD6U l        [        R                  " U5      U l        Xl        U R"                  (       a  U
b   eU	c   S5       e[        R$                  " [&        R(                  " SU	S   -  S	-
  U R                  40 UD65      U l        [        R$                  " [&        R(                  " SU	S	   -  S	-
  U R                  40 UD65      U l        Xl        g )
Ndevicer1   r   z$dim should be divisible by num_headsg         biaszBInput size must be provided if using relative positional encoding.r,   r   )super__init__r_   head_dimscaler   r\   nnLinearqkvIdentityq_normk_normDropoutrb   projrc   re   	Parameterr:   zerosrG   rH   rg   )selfrS   r_   r`   ra   rb   rc   rd   re   rf   rg   rj   r1   dd	__class__s                 rC   rn   Attention.__init__|   s    /!#K%KK#"(]]d*
(*99S'??B?9@j5"5bkkm9@j5"5bkkmI.IIc-"-	I.&<<&TST&  \\%++a*Q-6G!6KT]]*a^`*abDN\\%++a*Q-6G!6KT]]*a^`*abDN	rE   c                    UR                   u  p#pEX4-  nUR                  X&S5      nU R                  U5      R                  X&SU R                  S5      R                  SSSSS5      nUR                  SX R                  -  US5      R                  S5      u  pn
U R                  U5      U R                  U	5      pU R                  (       a%  [        XR                  U R                  X44X445      nO]S nU R                  bN  U R                  R                  5       n[        X5      R!                  U
5      n[        X5      R!                  U
5      n	U R"                  (       aR  [$        R&                  R(                  R+                  XU
UU R,                  (       a  U R.                  R0                  OSS9nOMXR2                  -  nXR5                  S	S5      -  nUb  X-   nUR7                  SS
9nU R/                  U5      nX-  nUR                  X R                  US5      R5                  SS5      R                  X&S5      nU R9                  U5      nU R;                  U5      nUR                  X#US5      nU$ )Nr-   rk   r,   r   r      r^   )	attn_mask	dropout_p)rS   )r5   r8   rs   viewr_   r9   unbindru   rv   re   rX   rG   rH   rg   	get_embedr   type_asr\   r:   rq   
functionalscaled_dot_product_attentiontrainingrb   prp   	transposesoftmaxrx   rc   )r{   xrQ   HWrR   Nrs   rF   kvrW   rg   attns                 rC   forwardAttention.forward   s
   WW
aEIIaBhhqkqQ;CCAq!QPQR++a^^!3Q;BB1Ea{{1~t{{1~13A~~t~~XYW]`a_efIIyy$yy**,'088;'088;??##@@a#.2mm$..** A A JJA{{2r**D$'<<B<'D>>$'DAFF1nna,66q!<DDQ2NIIaLNN1FF1BrE   )rb   r\   ro   rv   r_   rx   rc   ru   rs   rG   rH   rg   rp   re   )__name__
__module____qualname____firstlineno__r   bool__annotations__rq   	LayerNormr3   floatr   Moduler   r   rn   r   __static_attributes____classcell__r}   s   @rC   rZ   rZ   y   s    d
 !!!!*,,, %48(,&& & 	&
 & & & RYY& & !sCx1& 299%& &P& &rE   rZ   c                   
  ^  \ rS rSrSSSSSSS\R
                  \R                  \SSSSSS4S\S	\S
\	S\
S\
S\	S\	S\\	   S\	S\\R                     S\\R                     S\\R                     S\
S\4U 4S jjjrS rSrU =r$ )Block         @TFr^   Nr   rS   r_   	mlp_ratior`   ra   rc   rb   init_values	drop_path	act_layerrd   	mlp_layerre   window_sizec                 f  > UUS.n[         TU ]  5         Xl        U" U40 UD6U l        [	        U4UUUUUUUUS:X  a  UOX4US.	UD6U l        U(       a  [        U4SU0UD6O[        R                  " 5       U l	        U	S:  a  [        U	5      O[        R                  " 5       U l        U" U40 UD6U l        U" SU[        X-  5      U
US.UD6U l        U(       a  [        U4SU0UD6O[        R                  " 5       U l        U	S:  a  [        U	5      U l        g [        R                  " 5       U l        g )Nri   r   )	r_   r`   ra   rb   rc   rd   re   rf   rg   r   r^   )in_featureshidden_featuresr   drop )rm   rn   r   norm1rZ   r   r   rq   rt   ls1r   
drop_path1norm2r3   mlpls2
drop_path2)r{   rS   r_   r   r`   ra   rc   rb   r   r   r   rd   r   re   r   rf   rg   rj   r1   r|   r}   s                       rC   rn   Block.__init__   s=   * /&*r*

!#%0A%5zK;U
 
	 FQ:cA{AbAVXVaVaVc1:R(9-R[[]*r*
 
0	

 
 FQ:cA{AbAVXVaVaVc1:R(9-R[[]rE   c           
      "   UR                   u  p#pEUnU R                  U5      nS nU R                  S:  a  [        XR                  5      u  pU R	                  U R                  U R                  U5      5      5      nU R                  S:  a  [        XR                  X44U5      nXa-   nUR                  X#U-  S5      nXR                  U R                  U R                  U R                  U5      5      5      5      -   nUR                  X#US5      nU$ )Nr   r-   )r5   r   r   window_partitionr   r   r   window_unpartitionr8   r   r   r   r   )r{   r   rQ   r   r   rR   shortcutpad_hws           rC   r   Block.forward  s    WW
aJJqM,0a(,<,<=IAOODHHTYYq\23 a"1&6&6GALIIaQ#$**Q-)@ ABBIIaAr"rE   )	r   r   r   r   r   r   r   r   r   )r   r   r   r   rq   GELUr   r   r3   r   r   r   r   r   rn   r   r   r   r   s   @rC   r   r      s     "!!!!+/!)+*,,,), % '2S2S 2S 	2S
 2S 2S 2S 2S "%2S 2S BII2S RYY2S BII2S 2S 2S 2Sh rE   r   r   r   c           	      6   U R                   u  p#pEXU-  -
  U-  nXU-  -
  U-  n[        R                  " U SSSUSU45      n X6-   XG-   pU R                  X(U-  XU-  X5      n U R	                  SSSSSS5      R                  5       R                  SXU5      n
XU	44$ )a5  
Partition into non-overlapping windows with padding if needed.
Args:
    x (tensor): input tokens with [B, H, W, C].
    window_size (int): window size.

Returns:
    windows: windows after partition with [B * num_windows, window_size, window_size, C].
    (Hp, Wp): padded height and width before partition
r   r   rk   r,   r      r-   )r5   r6   padr   r9   
contiguous)r   r   rQ   r   r   Cpad_hpad_wHpWpwindowss              rC   r   r     s     JA!{?*k9E{?*k9E	a!Q5!U+,AY		q#[2C[TAii1aAq)446;;BZ[\GHrE   r   hwr   c                 .   Ub  UOUu  pEUu  pgU R                   S   XE-  U-  U-  -  nU R                  XU-  XQ-  XS5      n	U	R                  SSSSSS5      R                  5       R                  XUS5      n	U	SS2SU2SU2SS24   R                  5       n	U	$ )	aw  
Window unpartition into original sequences and removing padding.
Args:
    windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
    window_size (int): window size.
    pad_hw (Tuple): padded height and width (Hp, Wp).
    hw (Tuple): original height and width (H, W) before padding.

Returns:
    x: unpartitioned sequences with [B, H, W, C].
Nr   r-   r   rk   r,   r   r   )r5   r   r9   r   )
r   r   r   r   r   r   r   r   rQ   r   s
             rC   r   r   3  s     )VrFBDAaRW3{BCAQk)2+<kXZ[A			!Q1a#..055aRDA	!RaR!Q,""$AHrE   c            H       <  ^  \ rS rSrSrSSSSSSSSS	S
SS
SSSSSSS\" \\R                  S
S9\	R                  \	R                  \\S	S
S
SSSSSSSS4#S\S\S\S\S\S\S\S\S\S\S\\   S\S\S \S!\S"\S#\S$\S%\S&\\	R(                     S'\\\	R(                        S(\\\	R(                        S)\\	R(                     S*\\	R(                     S+\S,\S-\S.\S/\\S04   S1\S2\S3\\   S4\\\\\4   \\\4   4      4BU 4S5 jjjr\R0                  R2                  S6 5       r\R0                  R2                  SKS7 j5       r\R0                  R2                  SLS8 j5       r\R0                  R2                  S9\	R(                  4S: j5       rSMS\S2\\   4S; jjr     SNS<\R>                  S=\\ \\!\   4      S>\S?\S@\SA\S9\ \!\R>                     \\R>                  \!\R>                     4   4   4SB jjr"   SOS=\\ \\!\   4      SC\SD\4SE jjr#SF r$SKSG\4SH jjr%SI r&SJr'U =r($ )Pr&   iJ  zVision Transformer for Segment-Anything Model(SAM)

A PyTorch impl of : `Exploring Plain Vision Transformer Backbones for Object Detection` or `Segment Anything Model (SAM)`
    - https://arxiv.org/abs/2010.11929
      rk         r   TFNr^    )
output_fmtstrict_img_size   r      avgimg_size
patch_sizein_chansnum_classes	embed_dimdepthr_   r   r`   ra   r   pre_norm	drop_ratepos_drop_ratepatch_drop_rateproj_drop_rateattn_drop_ratedrop_path_rateweight_initembed_layerrd   r   block_fnr   use_abs_posre   use_roper   global_attn_indexes.
neck_chansglobal_poolhead_hidden_sizeref_feat_shapec$                   > [         T+U ]  5         U"U#S.n$U=(       d    [        [        R                  SS9nU=(       d    [        R
                  nX@l        X0l        UU l        U=U l	        =U l
        U l        SU l        U" S&UUUUU(       + S.U$D6U l        U R                  R                  n%[        U R                  S5      (       a  U R                  R!                  5       OUn&U(       a:  [        R"                  " [$        R&                  " SU%S   U%S   U40 U$D65      U l        OS	U l        [        R*                  " US
9U l        US:  a  [/        USS9U l        O[        R2                  " 5       U l        U(       a	  U" U40 U$D6O[        R2                  " 5       U l        U(       as  U(       a   S5       eU!b.  [7        U!5      S:X  d   e[9        U!S   5      n'[9        U!S   5      n(OS	=n'n([;        XW-  SU%U'S9U l        [;        XW-  S[9        U5      U(S9U l        OS	U l        S	U l        [A        UU5      n)[        RB                  " [E        U5       V*s/ s Hb  n*U" S&0 SU_SU_SU_SU	_SU
_SU_SU_SU_SU)U*   _SU_SU_SU_SU_SU*U;  a  UOS_SU%_SU*U;  a  U R>                  OU R<                  _U$D6PMd     sn*6 U l#        [E        U5       V*s/ s H  n*[I        SU* 3UU&S 9PM     sn*U l%        U(       am  [        RB                  " [        RL                  " UU4SSS!.U$D6[O        U40 U$D6[        RL                  " UU4S"SSS#.U$D6[O        U40 U$D65      U l(        UU l	        O5U (       a  [        R2                  " 5       U l(        O[O        U40 U$D6U l(        UnU (       a  [S        UU4U UUS$.U$D6U l*        g	[W        UU4UUS%.U$D6U l*        g	s  sn*f s  sn*f )'a  
Args:
    img_size: Input image size.
    patch_size: Patch size.
    in_chans: Number of image input channels.
    num_classes: Number of classes for classification head.
    global_pool: Type of global pooling for final sequence (default: 'token').
    embed_dim: Transformer embedding dimension.
    depth: Depth of transformer.
    num_heads: Number of attention heads.
    mlp_ratio: Ratio of mlp hidden dim to embedding dim.
    qkv_bias: Enable bias for qkv projections if True.
    init_values: Layer-scale init values (layer-scale enabled if not None).
    drop_rate: Head dropout rate.
    pos_drop_rate: Position embedding dropout rate.
    attn_drop_rate: Attention dropout rate.
    drop_path_rate: Stochastic depth rate.
    weight_init: Weight initialization scheme.
    embed_layer: Patch embedding layer.
    norm_layer: Normalization layer.
    act_layer: MLP activation layer.
    block_fn: Transformer block layer.
    use_abs_pos: If True, use absolute positional embeddings.
    use_rel_pos: If True, add relative positional embeddings to the attention map.
    use_rope: If True, add rotary position embeddings to q/k in attention block.
    window_size: Window size for window attention blocks. If 0, not use window attention.
    global_attn_indexes: Indexes for blocks using global attention. Used when window_size > 0.
    global_pool: Global pooling type.
    head_hidden_size: If set, use NormMlpHead
    ref_feat_shape: Tuple of reference feature shapes for ROPE, (global, local)
ri   gư>)epsF)r   r   r   r   rl   
feat_ratior   r   N)r   )num_prefix_tokenszCROPE and relative pos embeddings should not be enabled at same timer,   )	in_pixels
feat_shaper   rS   r_   r   r`   ra   r   rc   rb   r   rd   r   r   re   r   rf   rg   zblocks.)modulenum_chs	reduction)kernel_sizerl   rk   )r   paddingrl   )hidden_size	pool_typer   )r   r   r   ),rm   rn   r   rq   r   r   r   r   r   num_featuresr   r   grad_checkpointingpatch_embed	grid_sizehasattrr   ry   r:   rz   	pos_embedrw   pos_dropr   
patch_droprt   norm_prelenr   r   rope_globalrope_windowr   
Sequentialrangeblocksdictfeature_infoConv2dr   neckr   headr   ),r{   r   r   r   r   r   r   r_   r   r`   ra   r   r   r   r   r   r   r   r   r   r   rd   r   r   r   r   re   r   r   r   r   r   r   r   rj   r1   r|   r   rref_feat_shape_globalref_feat_shape_windowdprir}   s,                                              rC   rn   VisionTransformerSAM.__init__Q  st   J 	/B72<<T#B
(	& &ENNND1DN"'& 
!
 
 $$..	-4T5E5E|-T-TD'')Zd\\%++a1yQR|U^*ebd*efDN!DN

]3Q*"#DO
 !kkmDO7?
933R[[]"i$ii?)>*a///(1.2C(D%(1.2C(D%@DD%(=1&$4	 D  2&$[14	 D  $D#D (>mm( 5\)&#( "'  # $ "	
   ( ) ) a& & $ $ ( ,-4G+GKQ %  *+2E)ET%%4K[K[#& ")&# $, QVV[P\^P\1D'!yAFP\^ 		 !"	
  J-"-		 !"  J-"-#DI& !+DKKM	 (	8R8	"J - -%# DI ' &#	
 DIA&#*^s   A)OOc                 
    SS1$ )Nr  
dist_tokenr   r{   s    rC   no_weight_decay$VisionTransformerSAM.no_weight_decay  s    \**rE   c                     [        SSS/S9$ )Nz^pos_embed|patch_embed)z^blocks\.(\d+)N)z^norm)i )stemr
  )r  )r{   coarses     rC   group_matcher"VisionTransformerSAM.group_matcher!  s    *-/CD
 	
rE   c                     Xl         g N)r   )r{   enables     rC   set_grad_checkpointing+VisionTransformerSAM.set_grad_checkpointing(  s    "(rE   r*   c                     U R                   $ r!  r  r  s    rC   get_classifier#VisionTransformerSAM.get_classifier,  s    yyrE   c                 F    Xl         U R                  R                  X5        g r!  )r   r  reset)r{   r   r   s      rC   reset_classifier%VisionTransformerSAM.reset_classifier0  s    &		1rE   r   indicesnorm
stop_earlyr   intermediates_onlyc                    US:X  d   S5       e/ n[        [        U R                  5      U5      u  pU R                  U5      nU R                  b&  U[        U R                  UR                  SS 5      -   nU R                  U5      nU R                  U5      nU R                  U5      n[        R                  R                  5       (       d  U(       d  U R                  n
OU R                  SU	S-    n
[        U
5       H  u  pU R                  (       a/  [        R                  R                  5       (       d  [        X5      nOU" U5      nX;   d  MT  U(       a4  UR!                  U R#                  UR%                  SSSS5      5      5        M  UR!                  UR%                  SSSS5      5        M     U(       a  U$ U R#                  UR%                  SSSS5      5      nX4$ )a  Forward features that returns intermediates.

Args:
    x: Input image tensor
    indices: Take last n blocks if int, all if None, select matching indices if sequence
    norm: Apply norm layer to all intermediates
    stop_early: Stop iterating over blocks when last desired intermediate hit
    output_fmt: Shape of intermediate feature outputs
    intermediates_only: Only return intermediate features
Returns:

NCHWz&Output shape for ViT-SAM must be NCHW.Nr   rk   r   r,   )r    r  r
  r   r  r   r5   r  r  r  r:   jitis_scripting	enumerater   r"   appendr  r9   )r{   r   r-  r.  r/  r   r0  intermediatestake_indices	max_indexr
  r  blks                rC   forward_intermediates*VisionTransformerSAM.forward_intermediates4  s   * V#M%MM#"6s4;;7G"Q Q>>%/!MMAMM!OOAMM!99!!##:[[F[[)a-0F'FA&&uyy/E/E/G/Gs&F  "((199Q1a3H)IJ!((1aA)>? (   IIaii1a+,rE   
prune_norm
prune_headc                     [        [        U R                  5      U5      u  pEU R                  SUS-    U l        U(       a  [        R                  " 5       U l        U(       a  U R                  SS5        U$ )z?Prune layers not required for specified intermediates.
        Nr   r   r   )r    r  r
  rq   rt   r  r+  )r{   r-  r=  r>  r8  r9  s         rC   prune_intermediate_layers.VisionTransformerSAM.prune_intermediate_layerso  s[     #7s4;;7G"Qkk.9q=1DI!!!R(rE   c                    U R                  U5      nU R                  b&  U[        U R                  UR                  SS 5      -   nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  (       a:  [        R                  R                  5       (       d  [        U R                  U5      nOU R                  U5      nU R                  UR                  SSSS5      5      nU$ )Nr   rk   r   r,   )r   r  r   r5   r  r  r  r   r:   r3  r4  r#   r
  r  r9   r{   r   s     rC   forward_features%VisionTransformerSAM.forward_features  s    Q>>%/!MMAMM!OOAMM!""599+A+A+C+Ct{{A.AAAIIaii1a+,rE   
pre_logitsc                 R    U(       a  U R                  USS9$ U R                  U5      $ )NT)rF  r&  )r{   r   rF  s      rC   forward_head!VisionTransformerSAM.forward_head  s$    0:tyyty,L		!LrE   c                 J    U R                  U5      nU R                  U5      nU$ r!  )rD  rH  rC  s     rC   r   VisionTransformerSAM.forward  s'    !!!$a rE   )r
  r   r  r   r   r  r   r   r  r  r   r   r  r   r  r  r  r  F)Tr!  )NFFr2  F)NFT))r   r   r   r   __doc__r   r   r   NHWCrq   r   r   r   r   r3   r   r   r   strr   r   r   rn   r:   r3  ignorer  r  r#  r'  r+  Tensorr	   r   r;  r@  rD  rH  r   r   r   r   s   @rC   r&   r&   J  s    ! " !!!+/"!#%%'$&$&$&!+2:&++gl+m46LL3577(-), $ %"!35!$.2PTIJJ J 	J
 J J J J J J J "%J J J !J  #!J" "#J$ "%J& "'J( )J* bii+J, !bii1-J.  RYY0/J0 299o1J2 BII3J4 5J6 7J8 9J: ;J< "'sCx=J> ?J@ AJB 'smCJD %U5c?E#s(O+K%LMEJ JX YY+ + YY
 
 YY) ) YY		  2C 2hsm 2 8<$$',9 ||9  eCcN349  	9 
 9  9  !%9  
tELL!5tELL7I)I#JJ	K9 z 8<$#	eCcN34  	"M$ M rE   c                     SU ;   n0 nU R                  5        H@  u  pEUR                  S5      (       a  USS nUR                  SS5      nO	U(       a  M<  XSU'   MB     U$ )zRemap SAM checkpoints -> timm z%image_encoder.patch_embed.proj.weightzimage_encoder.r   Nzmlp.linzmlp.fc)items
startswithreplace)
state_dictmodelsam_checkpointout_dictr   r   s         rC   checkpoint_filter_fnrZ    sg    
 =
JNH  "<<())"#A		)X.A # OrE   c                 2    U SSS SSS[         [        SSS.UE$ )	N  rk   r   r   ?bicubicTzpatch_embed.projzhead.fc)urlr   rf   	pool_sizecrop_pctinterpolationfixed_input_sizemeanstd
first_conv
classifier)r   r   )r`  kwargss     rC   _cfgrj    s2    ?'0F(	  rE   zDhttps://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pthztimm/z
apache-2.0r]  r2   )r`  	hf_hub_idlicensere  rf  r   rf   rb  zDhttps://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pthzDhttps://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pthr\  )rk      rm  r^  )re  rf  r   rf   rb  )zsamvit_base_patch16.sa1bzsamvit_large_patch16.sa1bzsamvit_huge_patch16.sa1bsamvit_base_patch16_224c           	      h    UR                  SS5      n[        [        U U4[        [	        USS9S.UD6$ )Nout_indicesrk   getter)rp  feature_cls)pretrained_filter_fnfeature_cfg)popr   r&   rZ  r  )variant
pretrainedri  rp  s       rC   _create_vision_transformerrx    sF    **]A.K 2[hG  rE   c                 Z    [        SSSS/ SQSSSS9n[         S
S	U 0[        U40 UD6D6nU$ )z"ViT-B/16 for Segment-Anything
    r   r   r   r,   r   r]      r   Tr   r   r   r   r_   r   r   re   r   rw  )samvit_base_patch16r  rx  rw  ri  
model_argsrW  s       rC   r}  r}    sR     B"R_D4J 'T*4T8<Z8R68RTELrE   c                 Z    [        SSSS/ SQSSSS9n[         S	SU 0[        U40 UD6D6nU$ )
z"ViT-L/16 for Segment-Anything
    r   r      )r   r{        r   Tr|  rw  )samvit_large_patch16r~  r  s       rC   r  r    sR     R2SbD4J 'U+5U9=j9SF9SUELrE   c                 Z    [        SSSS/ SQSSSS9n[         S
S	U 0[        U40 UD6D6nU$ )z"ViT-H/16 for Segment-Anything
    r   i       )      r     r   Tr   r|  rw  )samvit_huge_patch16r~  r  s       rC   r  r    sR     R2SbD4J 'T*4T8<Z8R68RTELrE   c                 ^    [        SSSS/ SQSSSSS	S
9
n[         SSU 0[        U40 UD6D6nU$ )z"ViT-B/16 based on samvit arch
    r   r   r   rz  r   TFrm  N)
r   r   r   r_   r   r   re   r   r   r   rw  )rn  r~  r  s       rC   rn  rn    sW     B"R_DecVZJ '!X.8X<@<Vv<VXELrE   r!  )r   rL  )JrM  logging	functoolsr   typingr   r   r   r   r   r	   r:   torch.nnrq   torch.nn.functionalr   r6   	timm.datar
   r   r   r   timm.layersr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	torch.jitr   _builderr   	_featuresr    _features_fxr!   _manipulater"   r#   	_registryr$   r%   __all__	getLoggerr   _loggerr3   rQ  rD   rX   r   rZ   r   r   r   r&   rZ  rj  default_cfgsrx  r}  r  r  rn  r   rE   rC   <module>r     s  
   ? ?     r r    "  * + 3 3 < "
" 

H
%3 3S 35<< 3ELL 3@ + &7<<7<<7 <<7 c3h	7
 c3h7 \\7DQ		 QhLBII L^ 3 5uUXZ]U]A^;_ 0 gk\\(+16sCxJRSXY\^aYaSbJc
\\.K299 K\
$ % !%R"(<!"S!2 "&R"(<!"S"2 !%R"(<!"S!2  $"(<$ 3 0-& 8	 	7K 	 	 	8L 	 	 	7K 	 	 	;O 	 	rE   