
    RЦi5                         S SK JrJrJrJr  S SKrS SKJr  S SKJr	  SSK
Jr  SSKJr  SSKJr  SS	KJr   " S
 S\R"                  5      r " S S\R"                  5      r " S S\R"                  5      rg)    )ListOptionalTypeUnionN)nn)
functional   )use_fused_attn)create_conv2d)	to_2tuple)create_pool2dc                      ^  \ rS rSrSr        SS\S\\   S\S\S\S\S	\4U 4S
 jjjrS r	S r
SS\\R                     4S jjrSrU =r$ )MultiQueryAttentionV2   a  Multi Query Attention.

Fast Transformer Decoding: One Write-Head is All You Need
https://arxiv.org/pdf/1911.02150.pdf

This is an acceletor optimized version - removing multiple unnecessary
tensor transpose by re-arranging indices according to the following rules: 1)
contracted indices are at the end, 2) other indices have the same order in the
input and output tensores.

Compared to V1, this gives 3x speed up.
dimdim_out	num_headskey_dim	value_dim	attn_drop	proj_dropc
                 
  > XS.n
[         TU ]  5         U=(       d    UnX0l        X@l        XPl        US-  U l        [        R                  " [        R                  " U R                  U R                  U440 U
D65      U l
        [        R                  " [        R                  " XR                  440 U
D65      U l        [        R                  " [        R                  " XR                  440 U
D65      U l        [        R                  " U5      U l        [        R                  " [        R                  " X R                  U R                  440 U
D65      U l        [        R                  " U5      U l        U R#                  5         g)zInitializer.devicedtype      N)super__init__r   r   r   scaler   	Parametertorchempty
query_projkey_proj
value_projDropoutr   out_projr   reset_parameters)selfr   r   r   r   r   r   r   r   r   dd	__class__s              V/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/layers/attention2d.pyr   MultiQueryAttentionV2.__init__   s    /.S""_
,,u{{DNNDLLRU3V']Z\']^U[[#||1D%K%KL,,u{{C3H'OB'OPI.U[['>>4>>1Z%a^`%abI.    c                    U R                   R                  S   S-  n[        R                  R	                  U R
                  US9  [        R                  R	                  U R                   US9  [        R                  R	                  U R                  US9  [        R                  R	                  U R                  U R                  R                  S   S-  S9  g )Nr   r   )std)r$   shaper   initnormal_r#   r%   r'   )r)   r   s     r,   r(   &MultiQueryAttentionV2.reset_parameters9   s    ##A&$.
U3
51
U3
4==+>+>q+AT+IJr.   c                 l    UR                   nUR                  US   US   S5      R                  SS5      $ )zBReshapes a tensor to three dimensions, keeping the first and last.r   r	      )r1   reshape	transposer)   tss      r,   _reshape_input$MultiQueryAttentionV2._reshape_input@   s5    GG yy1qtR(221a88r.   mc                 x   UR                   u  p4pVUb  UOUnU R                  U5      nU R                  U5      n[        R                  " SXpR                  5      n	[        R                  " SXR
                  5      n
[        R                  " SX5      U R                  -  nUR                  SS9nU R                  U5      n[        R                  " SXR                  5      n[        R                  " SX5      n[        R                  " SXR                  5      nU R                  U5      nUR                  USXV5      $ )	Run layer computation.zbnd,hkd->bnhkzbmd,dk->bmkzbnhk,bmk->bnhmr6   r   zbmd,dv->bmvzbnhm,bmv->bnhvzbnhv,dhv->bdn)r1   r=   r!   einsumr#   r$   r   softmaxr   r%   r'   r   r8   )r)   xr?   b_hw
reshaped_x
reshaped_mqkattnvoresults                  r,   forwardMultiQueryAttentionV2.forwardH   s    WW
aAA((+
((+
LL*ooFLL
MMB||,a3djj@|||#~~d#LL
OODLL)43oq--@'~~aQ**r.   )
r   r   r$   r   r'   r   r#   r   r   r%   )N   @   rU           rV   NNN)__name__
__module____qualname____firstlineno____doc__intr   floatr   r(   r=   r!   TensorrR   __static_attributes____classcell__r+   s   @r,   r   r      s      &*!!   c]  	 
          <K9+HU\\2 + +r.   r   c                     ^  \ rS rSr% Sr\R                  R                  \   \	S'   SSSSSSSSSS	S	\
R                  S
SS4S\S\\   S\S\\   S\\   S\S\S\S\S\\\\\   4   S\S\S\\
R&                     S\4U 4S jjjrS rS\R,                  4S jrS\R,                  S\S\4S jrS\R,                  S\S\S\4S  jrS$S!\\R,                     4S" jjrS#rU =r$ )%MultiQueryAttention2d^   a  Multi Query Attention with spatial downsampling.

 3 parameters are introduced for the spatial downsampling:
 1. kv_stride: downsampling factor on Key and Values only.
 2. query_strides: horizontal & vertical strides on Query only.

This is an optimized version.
1. Projections in Attention is explicit written out as 1x1 Conv2D.
2. Additional reshapes are introduced to bring a up to 3x speed up.

fused_attnNrT   r	       rV   Fr   r   r   r   r   query_strides	kv_stridedw_kernel_sizedilationpaddingr   r   
norm_layeruse_biasc                   > UUS.n[         TU ]  5         U=(       d    UnX0l        U=(       d    X-  U l        U=(       d    X-  U l        [        U5      U l        Xpl        [        U R                   Vs/ s H  nUS:  PM
     sn5      U l	        U R                  S-  U l
        [        5       U l        Xl        [        R                  " 5       U l        U R                  (       a  U
S:X  a0  U R                   R#                  S[%        SU R                  SS95        O.U R                   R#                  S[        R&                  " US95        U R                   R#                  S	U" U40 UD65        U R                   R#                  S
[)        UU R                  U R                  -  4SUS.UD65        [        R                  " 5       U l        US:  aP  U R*                  R#                  S[)        UU4UUU	U
SS.UD65        U R*                  R#                  S	U" U40 UD65        U R*                  R#                  S
[)        UU R                  4SU
US.UD65        [        R                  " 5       U l        US:  aP  U R,                  R#                  S[)        UU4UUU	U
SS.UD65        U R,                  R#                  S	U" U40 UD65        U R,                  R#                  S
[)        UU R                  4SUS.UD65        [        R.                  " U5      U l        [        R                  " 5       U l        U R                  (       a:  U R2                  R#                  S[        R4                  " U R                  SSS95        U R2                  R#                  S
[)        U R                  U R                  -  U4SUS.UD65        U R2                  R#                  S[        R.                  " U5      5        SU l        U R9                  5         gs  snf )a;  Initializer.

Args:
  num_heads: Number of attention heads.
  key_dim: Size of the attention key dimension.
  value_dim: Size of the attention value dimension.
  query_strides: Vertical stride size for query only.
  kv_stride: Key and value stride size.
  dw_kernel_size: Spatial dimension of the depthwise kernel.
r   r	   r   same	down_poolavg)kernel_sizerm   )rt   normproj)rt   bias	down_convT)rt   striderl   rm   	depthwise)rt   rm   rw   upsamplebilinearF)scale_factormodealign_cornersdropN)r   r   r   r   r   r   ri   rj   anyhas_query_stridesr   r
   rf   r   r   
Sequentialquery
add_moduler   	AvgPool2dr   keyvaluer&   r   outputUpsamplerC   init_weights)r)   r   r   r   r   r   ri   rj   rk   rl   rm   r   r   rn   ro   r   r   r*   r<   r+   s                      r,   r   MultiQueryAttention2d.__init__k   s   : /.S"2#"2"6c&6&}5"!$T5G5G%H5Ga!e5G%H!I\\T)
(*	]]_
!!& 

%%k= $ 2 2"4  

%%k2<<M3Z[JJ!!&*S*?B*?@

fmNNT\\)'
 	'

 '
 	 ==?q=HH]	. + !	. 	. 	 HH
3(="(=>FMLL%
 %
 %
 	 ]]_
q=JJ!!+}	0 + !	0 	0 	 JJ!!&*S*?B*?@

fmNN'
 	'

 '
 	 I.mmo!!KK"":r{{!//#0 
 	v}NNT^^+(
 	(

 (
 	 	vrzz)'<=q &Is   3O<c                     [         R                  R                  U R                  R                  R
                  5        [         R                  R                  U R                  R                  R
                  5        [         R                  R                  U R                  R                  R
                  5        U R                  S:  az  [         R                  R                  U R                  R                  R
                  5        [         R                  R                  U R                  R                  R
                  5        [         R                  R                  U R                  R                  R
                  5        g )Nr	   )r   r2   xavier_uniform_r   rv   weightr   r   rj   rx   r   )r)   s    r,   r   "MultiQueryAttention2d.init_weights   s    


 6 67
 4 45


 6 67>>AGG##DHH$6$6$=$=>GG##DJJ$8$8$?$?@
 0 0 7 78r.   r;   c                     UR                   nUR                  US   US   S5      R                  SS5      nU R                  (       a  U$ UR	                  S5      R                  5       $ )zFReshapes a tensor to three dimensions, keeping the batch and channels.r   r	   r6   r7   )r1   r8   r9   rC   	unsqueeze
contiguousr:   s      r,   r=   $MultiQueryAttention2d._reshape_input   sU    GGIIadAaD"%//15;;H;;q>,,..r.   c                     UR                   nUR                  US   X#S5      nU R                  (       a"  UR                  SSSS5      R	                  5       $ UR                  SS5      R	                  5       $ )z?Reshapes projected query: [b, n, n, h x k] -> [b, n x n, h, k].r   r6   rg   r	   r7   )r1   r8   rC   permuter   r9   )r)   r;   r   r   r<   s        r,   _reshape_projected_query.MultiQueryAttention2d._reshape_projected_query   s`    GGIIadI3;;99Q1a(3355;;r2&1133r.   h_pxw_pxc                     UR                   nUS   U-  nU R                  (       d  UR                  SS5      nUR                  US   X4U5      R	                  SSSS5      R                  5       $ )z2Reshape output:[b, n x n x h, k] -> [b, n, n, hk].r6   r	   r7   r   rg   )r1   rC   r9   r8   r   r   )r)   r;   r   r   r   r<   feat_dims          r,   _reshape_output%MultiQueryAttention2d._reshape_output  sa    GGR59${{Aq!Ayy1t84<<Q1aHSSUUr.   	attn_maskc                    UR                   =u  p4pVnU R                  U5      nU R                  XR                  U R                  5      nU R                  U5      n	U R                  U	5      n	U R                  U5      n
U R                  U
5      n
U R                  (       ac  [        R                  " SX5      U R                  -  nUb  X-   nUR                  SS9nU R                  U5      n[        R                  " SX5      nOU R                  (       a?  [        R                  " XU
UU R                   (       a  U R                  R"                  OSS9nOMXR                  -  nXR%                  SS5      -  nUb  X-   nUR                  SS9nU R                  U5      nX-  nU R'                  XR                  XPR(                  S   -  X`R(                  S	   -  5      nU R+                  U5      nU$ )
rA   zblhk,bpk->blhpr6   rB   zblhp,bpk->blhkrV   r   	dropout_pr   r   r	   )r1   r   r   r   r   r   r=   r   rC   r!   r   rD   r   rf   Fscaled_dot_product_attentiontrainingpr9   r   ri   r   )r)   rE   r   BCHWr<   rL   rM   rO   rN   rP   s                r,   rR   MultiQueryAttention2d.forward  s    
aQJJqM))!^^T\\JHHQK"JJqM"
 ;;<< 0!7$**DD$'<<B<'D>>$'D-t7A22!'26--dnn..R 

N;;r2..(+D|||+~~d+H   NNA9K9KA9N4NPQUgUghiUjPjkKKNr.   )r   r   rC   rf   r   r   r   rj   r   r   r   ri   r   r   r   rW   )rX   rY   rZ   r[   r\   r!   jitFinalbool__annotations__r   BatchNorm2dr]   r   r   strr   r^   r   Moduler   r   r_   r=   r   r   rR   r`   ra   rb   s   @r,   rd   rd   ^   s   	 		%%
 &*%)'+!""#24!!*,.."#}} c]} 	}
 c]}  }} } }  } } 3T#Y./} } } RYY} } }~9/ /4%,, 43 4QT 4V V# VS VPS V/HU\\$: / /r.   rd   c                      ^  \ rS rSr% \R
                  R                  \   \S'             SS\	S\
\	   S\	S\S\S\S	\S
\4U 4S jjjrSS\
\R                     4S jjrSrU =r$ )Attention2di@  rf   r   r   r   rw   expand_first
head_firstr   r   c                   > XS.n[         TU ]  5         U=(       d    UnU(       a  UOUnX0l        X-  U l        X`l        [        5       U l        [        R                  " XS-  S4SU0UD6U l	        [        R                  " U5      U l        [        R                  " XS4SU0UD6U l        [        R                  " U5      U l        g )Nr   rg   r	   rw   )r   r   r   dim_headr   r
   rf   r   Conv2dqkvr&   r   rv   r   )r)   r   r   r   rw   r   r   r   r   r   r   r*   dim_attnr+   s                r,   r   Attention2d.__init__D  s     /.S*7" -$(*99SQ,CCCI.IIhDDD	I.r.   r   c                    UR                   u  p4pVU R                  (       aK  U R                  U5      R                  X0R                  U R
                  S-  S5      R                  SSS9u  pxn	OJU R                  U5      R                  USU R                  U R
                  S5      R                  S5      u  pxn	U R                  (       a  [        R                  R                  R                  UR                  SS5      R                  5       UR                  SS5      R                  5       U	R                  SS5      R                  5       UU R                   (       a  U R"                  R$                  OSS9R                  SS5      R                  USXV5      nOUR                  SS5      nU	R                  SS5      n	Xx-  UR'                  S5      S	-  -  n
Ub  X-   n
U
R)                  SS9n
U R#                  U
5      n
X-  R                  SS5      R                  USXV5      nU R+                  U5      nU R-                  U5      nU$ )
Nrg   r6   r7   rB   r	   r   rV   r   r   )r1   r   r   viewr   r   chunkr8   unbindrf   r!   r   r   r   r9   r   r   r   r   sizerD   rv   r   )r)   rE   r   r   r   r   r   rL   rM   rO   rN   s              r,   rR   Attention2d.forward_  s   WW
a??hhqk&&q..$--!:KRPVVWX^_V`GA!hhqk))!QrRYYZ[\GA!??##@@B#..0B#..0B#..0#.2mm$..** A  iB2q 4  B#AB#A5166":--D$'<<B<'D>>$'D$$R,44QAAAIIaLNN1r.   )r   r   rf   r   r   rv   r   r   )	N    TFFrV   rV   NNrW   )rX   rY   rZ   r[   r!   r   r   r   r   r]   r   r^   r   r_   rR   r`   ra   rb   s   @r,   r   r   @  s    		%%3 &*!&$!!// c]/ 	/
 / / / / / /6HU\\$:  r.   r   )typingr   r   r   r   r!   r   torch.nnr   r   configr
   r   helpersr   pool2d_samer   r   r   rd   r    r.   r,   <module>r      sV    . .   $ " (  &N+BII N+b_BII _D<")) <r.   