
    RЦi                         S r SSKrSSKJrJr  SSKrSSKJr  SSKJs  J	r
  SSKJr  SSKJr  SSKJr   " S S	\R"                  5      rg)
zDifferential Attention

Paper: 'Differential Transformer' - https://arxiv.org/abs/2410.05258

Reference impl: https://github.com/microsoft/unilm/tree/master/Diff-Transformer

Hacked together by / Copyright 2024, Ross Wightman
    N)OptionalType   )maybe_add_mask)use_fused_attn)RmsNormc                   j  ^  \ rS rSr% Sr\R                  R                  \   \	S'               SS\
S\
S\S\S	\S
\S\S\S\\\R                        S\
S\SS4U 4S jjjrS\
4S jrS rS\R&                  4S jr SS\R&                  S\\R&                     S\R&                  4S jjrSrU =r$ )DiffAttention   a  Differential Attention module.

Computes attention as the difference between two softmax attention maps, which helps
cancel out noise and promotes sparse attention patterns. The module splits Q and K
into two groups, computes separate attention maps, and subtracts one from the other
scaled by a learnable lambda parameter.

The attention output is computed as:
    Attn = softmax(Q1 @ K1^T) - lambda * softmax(Q2 @ K2^T)
    Output = Attn @ V

Supports both fused (scaled_dot_product_attention) and manual implementations.

fused_attnNdim	num_headsqkv_biasqk_norm
scale_norm	proj_bias	attn_drop	proj_drop
norm_layerdepthdual_lambdareturnc                   > [         TU ]  5         XS.nX-  S:X  d   S5       eU	c  [        n	X l        X-  S-  U l        U R                  S-  U l        [        5       U l        [        R                  " XS-  4SU0UD6U l
        U(       a  U	" U R                  40 UD6O[        R                  " 5       U l        U(       a  U	" U R                  40 UD6O[        R                  " 5       U l        [        R                  " U5      U l        Xpl        U(       a	  U	" U40 UD6O[        R                  " 5       U l        [        R                  " X4SU0UD6U l        [        R                  " U5      U l        Xl        U(       a  [        R*                  " [,        R.                  " S	[,        R0                  US
95      U l        [        R*                  " [,        R.                  " S	[,        R0                  US
95      U l        S=U l        =U l        =U l        U l        GO*S=U l        U l        [        R*                  " [,        R.                  " U R                  [,        R0                  US
95      U l        [        R*                  " [,        R.                  " U R                  [,        R0                  US
95      U l        [        R*                  " [,        R.                  " U R                  [,        R0                  US
95      U l        [        R*                  " [,        R.                  " U R                  [,        R0                  US
95      U l        [        SU R                  -  4SS0UD6U l        SU l         U RC                  U
5        U RE                  5         g)ax  Initialize the DiffAttention module.

Args:
    dim: Input dimension of the token embeddings.
    num_heads: Number of attention heads.
    qkv_bias: Whether to use bias in the query, key, value projections.
    qk_norm: Whether to apply normalization to query and key vectors.
    scale_norm: Whether to apply normalization before the output projection.
    proj_bias: Whether to use bias in the output projection.
    attn_drop: Dropout rate applied to the attention weights.
    proj_drop: Dropout rate applied after the output projection.
    norm_layer: Normalization layer constructor (defaults to RmsNorm).
    depth: Block depth index, used to compute depth-dependent lambda_init.
    dual_lambda: If True, use simplified dual scalar lambda parameterization
        (2 params). If False, use the paper's original formulation with
        lambda_q/k vectors (4 * head_dim params).
)devicedtyper   z$dim should be divisible by num_headsN   g         bias )r   r   epsgh㈵>皙?)#super__init__r   r   head_dimscaler   r   nnLinearqkvIdentityq_normk_normDropoutr   attn_drop_pnormprojr   r   	Parametertorchemptyfloat32lambda_alambda_b	lambda_q1	lambda_k1	lambda_q2	lambda_k2sub_normlambda_initset_lambda_initreset_parameters)selfr   r   r   r   r   r   r   r   r   r   r   r   r   dd	__class__s                  Y/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/layers/diff_attention.pyr#   DiffAttention.__init__%   so   B 	/!#K%KK# J"(A-]]d*
(*99S'??B?9@j5"5bkkm9@j5"5bkkmI.$-7Js)b)R[[]	IIc=Y="=	I.&LLRu}}U[)\]DMLLRu}}U[)\]DMPTTDNTT^Tdnt~,00DMDM\\%++dmm5==ag*hiDN\\%++dmm5==ag*hiDN\\%++dmm5==ag*hiDN\\%++dmm5==ag*hiDNDMM 1BtBrBU#    c                 L    SS[         R                  " SU-  5      -  -
  U l        g )Nr!   g333333?g333333ӿ)mathexpr;   )r>   r   s     rA   r<   DiffAttention.set_lambda_initk   s!    txxu'=!==rC   c                    U R                   (       aS  [        R                  R                  U R                  5        [        R                  R                  U R
                  5        g [        R                  R                  U R                  SSS9  [        R                  R                  U R                  SSS9  [        R                  R                  U R                  SSS9  [        R                  R                  U R                  SSS9  g )Nr   g?)meanstd)r   r&   initzeros_r4   r5   normal_r6   r7   r8   r9   )r>   s    rA   r=   DiffAttention.reset_parametersn   s    GGNN4==)GGNN4==)GGOODNNO<GGOODNNO<GGOODNNO<GGOODNNO<rC   c                    U R                   bA  [        R                  " U R                   5      n[        R                  " U R                  5      nO[        R                  " [        R                  " U R
                  U R                  -  SS9R                  5       5      n[        R                  " [        R                  " U R                  U R                  -  SS9R                  5       5      nX-
  U R                  -   $ )Nr   )r4   r1   rF   r5   sumr6   r7   floatr8   r9   r;   )r>   lambda_1lambda_2s      rA   _compute_lambdaDiffAttention._compute_lambdax   s    ==$yy/Hyy/Hyy4>>DNN+JPR!S!Y!Y![\Hyy4>>DNN+JPR!S!Y!Y![\H"T%5%555rC   x	attn_maskc                    UR                   u  p4nU R                  U5      R                  SSS9u  pgnUR                  X4SU R                  -  U R
                  5      R                  SS5      nUR                  X4SU R                  -  U R
                  5      R                  SS5      nUR                  X4U R                  SU R
                  -  5      R                  SS5      nU R                  U5      U R                  U5      pvU R                  5       R                  U5      n	U R                  (       a  UR                  X0R                  SX@R
                  5      nUR                  X0R                  SX@R
                  5      nUR                  S5      u  pUR                  S5      u  pU R                  (       a  U R                  OSn[        R                   " XXUS9n[        R                   " XXUS9nXU-  -
  nOX`R"                  -  nXgR                  SS5      -  n[%        UU5      nUR'                  SS9nU R)                  U5      nUR+                  X0R                  SXD5      nUS S 2S S 2S	4   U	US S 2S S 2S4   -  -
  nUU-  nU R-                  U5      nUSU R.                  -
  -  nUR                  SS5      R                  X4U5      nU R1                  U5      nU R3                  U5      nU R5                  U5      nU$ )
Nr   r   rQ   r           )rY   	dropout_prP   r   )shaper(   chunkreshaper   r$   	transposer*   r+   rV   type_asr   unbindtrainingr-   Fscaled_dot_product_attentionr%   r   softmaxr   viewr:   r;   r.   r/   r   )r>   rX   rY   BNCqkvlambda_fullq1q2k1k2r\   attn1attn2attns                     rA   forwardDiffAttention.forward   s   
 ''a((1+##A1#-aIIaA.>HHANIIaA.>HHANIIaDNNA,=>HHAN{{1~t{{1~1**,44Q7??		!^^Q==AA		!^^Q==AAXXa[FBXXa[FB,0MM((sI2221]fgE2221]fgEe++AJJA{{2r**D!$	2D<<B<'D>>$'D99Q18D1a=;aAg#>>DqAMM!T%%%&KK1%%aA.IIaLIIaLNN1rC   )r   r-   r   r   r$   r+   r4   r5   r;   r7   r9   r6   r8   r.   r   r/   r   r*   r(   r%   r:   )   FFFTr[   r[   Nr   FNN)N)__name__
__module____qualname____firstlineno____doc__r1   jitFinalbool__annotations__intrS   r   r   r&   Moduler#   r<   r=   TensorrV   rw   __static_attributes____classcell__)r@   s   @rA   r
   r
      s<    		%%
 "!$"!!48 %D D  D  	D 
 D  D  D  D  D  !bii1D  D  D  
D  D L>S >=6 6 15.||.  -. 
	. .rC   r
   )r~   rE   typingr   r   r1   torch.nnr&   torch.nn.functional
functionalre   	attentionr   configr   r.   r   r   r
   r   rC   rA   <module>r      s9     !     % " ZBII ZrC   