
    RЦi(                     0   S SK JrJrJr  S SKrS SKJr  S SKJr  SSK	J
r
  SSKJr  SSKJr  \R                  R                   \
SS	\R"                  S
\\R"                     4S jj5       5       r " S S\R&                  5      r " S S\R&                  5      rg)    )FinalOptionalTypeN)nn)
functional   )register_notrace_function)use_fused_attn)apply_rot_embed_catscores	attn_maskc                     Uc  U $ X-   $ N )r   r   s     T/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/layers/attention.pymaybe_add_maskr      s     &6>F,>>    c                     ^  \ rS rSr% Sr\\   \S'               SS\S\S\	\   S\	\   S	\S
\S\S\S\
S\
S\	\\R                        SS4U 4S jjjr SS\R                   S\	\R                      S\R                   4S jjrSrU =r$ )	Attention   a  Standard Multi-head Self Attention module with QKV projection.

This module implements the standard multi-head attention mechanism used in transformers.
It supports both the fused attention implementation (scaled_dot_product_attention) for
efficiency when available, and a manual implementation otherwise. The module includes
options for QK normalization, attention dropout, and projection dropout.

fused_attnNdim	num_headsattn_head_dimdim_outqkv_biasqk_norm
scale_norm	proj_bias	attn_drop	proj_drop
norm_layerreturnc                    > [         TU ]  5         XS.nU=(       d    UnUnUc  X-  S:X  d   S5       eX-  nU(       d  U(       a
  Uc   S5       eX l        Xl        X/-  U l        US-  U l        [        5       U l        [        R                  " XR                  S-  4SU0UD6U l
        U(       a	  U" U40 UD6O[        R                  " 5       U l        U(       a	  U" U40 UD6O[        R                  " 5       U l        [        R                  " U	5      U l        U(       a  U" U R                  40 UD6O[        R                  " 5       U l        [        R                  " U R                  U4SU0UD6U l        [        R                  " U
5      U l        g)	a  Initialize the Attention module.

Args:
    dim: Input dimension of the token embeddings.
    num_heads: Number of attention heads.
    attn_head_dim: Dimension of each attention head. If None, computed as dim // num_heads.
    dim_out: Output dimension. If None, same as dim.
    qkv_bias: Whether to use bias in the query, key, value projections.
    qk_norm: Whether to apply normalization to query and key vectors.
    scale_norm: Whether to apply normalization to attention output before projection.
    proj_bias: Whether to use bias in the output projection.
    attn_drop: Dropout rate applied to the attention weights.
    proj_drop: Dropout rate applied after the output projection.
    norm_layer: Normalization layer constructor for QK normalization if enabled.
devicedtypeNr   $dim should be divisible by num_heads<norm_layer must be provided if qk_norm or scale_norm is True         bias)super__init__r   head_dimattn_dimscaler
   r   r   LinearqkvIdentityq_normk_normDropoutr    normprojr!   )selfr   r   r   r   r   r   r   r   r    r!   r"   r&   r'   ddr/   	__class__s                   r   r.   Attention.__init__   sC   > 	/.S ?a'O)OO''Hj)i+ii)" !,%
(*99S--!"3I(IbI4;j0R04;j0R0I.7AJt}}33r{{}	IIdmmWK9KK	I.r   xr   c                 "   UR                   u  p4nU R                  U5      R                  X4SU R                  U R                  5      R                  SSSSS5      nUR                  S5      u  pxn	U R                  U5      U R                  U5      pU R                  (       a?  [        R                  " XxU	UU R                  (       a  U R                  R                  OSS9nOQXpR                  -  nXxR!                  SS	5      -  n
[#        X5      n
U
R%                  S	S
9n
U R                  U
5      n
X-  nUR!                  SS5      R                  X4U R&                  5      nU R)                  U5      nU R+                  U5      nU R-                  U5      nU$ )Nr+      r   r              r   	dropout_pr   )shaper3   reshaper   r/   permuteunbindr5   r6   r   Fscaled_dot_product_attentiontrainingr    pr1   	transposer   softmaxr0   r8   r9   r!   )r:   r>   r   BNCr3   qkvattns              r   forwardAttention.forwardS   sH   
 ''ahhqk!!!4>>4==IQQRSUVXY[\^_`**Q-a{{1~t{{1~1??..a#.2mm$..**A JJA{{2r**D!$2D<<B<'D>>$'DAKK1%%aDMM:IIaLIIaLNN1r   )r0   r    r   r/   r6   r8   r   r9   r!   r5   r3   r1   )   NNFFFTrB   rB   NNNr   )__name__
__module____qualname____firstlineno____doc__r   bool__annotations__intr   floatr   r   Moduler.   torchTensorrY   __static_attributes____classcell__r<   s   @r   r   r      s    d
 +/%)"!$"!!485/5/ 5/ $C=	5/
 c]5/ 5/ 5/ 5/ 5/ 5/ 5/ !bii15/ 
5/ 5/t 15||  - 
	 r   r   c                   8  ^  \ rS rSr% Sr\R                  R                  \   \	S'                  SS\
S\
S\\
   S\S\S	\
S
\S\S\\
   S\\R                     S\S\S\S\4U 4S jjjr  SS\\R"                     S\\R"                     4S jjrSrU =r$ )AttentionRoper   zA Self Attention module with ROPE support.

Includes options for:
 * QK normalization option
 * Attention output (scale) normalization
 * Fused or unfused QKV projection support
r   r   r   r   r   	qkv_fusednum_prefix_tokensr    r!   r   r"   r   r   r   rotate_halfc                 v  > [         TU ]  5         UUS.nU=(       d    UnU	nUc  X-  S:X  d   S5       eX-  nU(       d  U(       a
  U
c   S5       eX l        UU l        UU-  U l        US-  U l        X`l        [        5       U l        Xl	        U(       aA  [        R                  " XR                  S-  4SU0UD6U l        S=U l        =U l        U l        OSU l        [        R                  " XR                  4SU0UD6U l        [        R                  " XR                  4SU0UD6U l        [        R                  " XR                  4SU0UD6U l        U(       a	  U
" U40 UD6O[        R                   " 5       U l        U(       a	  U
" U40 UD6O[        R                   " 5       U l        [        R&                  " U5      U l        U(       a  U
" U R                  40 UD6O[        R                   " 5       U l        [        R                  " U R                  U4SU0UD6U l        [        R&                  " U5      U l        g)	a3  Initialize the Attention module.

Args:
    dim: Input dimension of the token embeddings
    num_heads: Number of attention heads
    dim_out: Output dimension. If None, same as dim.
    qkv_bias: Whether to add a bias term to the query, key, and value projections
    qkv_fused: Whether to use fused QKV projection (single linear) or separate projections
    num_prefix_tokens: Number of reg/cls tokens at the beginning of the sequence that
        should not have position embeddings applied
    attn_drop: Dropout rate for attention weights
    proj_drop: Dropout rate for the output projection
    attn_head_dim: Dimension of each attention head. If None, computed as dim // num_heads.
    norm_layer: Normalization layer constructor to use for QK and scale normalization
    qk_norm: Enable normalization of query (Q) and key (K) vectors with norm_layer
    scale_norm: Enable normalization (scaling) of attention output with norm_layer
    proj_bias: Whether to use bias in the output projection
    rotate_half: Use 'half' ROPE layout instead of default 'interleaved'
r%   Nr   r(   r)   r*   r+   r,   )r-   r.   r   r/   r0   r1   ro   r
   r   rp   r   r2   r3   q_projk_projv_projr4   r5   r6   r7   r    r8   r9   r!   )r:   r   r   r   r   rn   ro   r    r!   r   r"   r   r   r   rp   r&   r'   r;   r/   r<   s                      r   r.   AttentionRope.__init__|   s   L 	/.S ?a'O)OO''H)i+ii)"  9,%
!2(*&yymma&7MhM"MDH6::DK:$+DH))CLXLLDK))CLXLLDK))CLXLLDK4;j0R04;j0R0I.7AJt}}33r{{}	IIdmmWK9KK	I.r   roper   c                 N   UR                   u  pEnU R                  ba  U R                  U5      nUR                  XESU R                  U R                  5      R                  SSSSS5      nUR                  S5      u  pn
OU R                  U5      R                  XEU R                  U R                  5      R                  SS5      nU R                  U5      R                  XEU R                  U R                  5      R                  SS5      n	U R                  U5      R                  XEU R                  U R                  5      R                  SS5      n
U R                  U5      U R                  U	5      pUb  U R                  n[        U SS5      n[        R                   " USS2SS2SU2SS24   [#        USS2SS2US2SS24   X,S	9/SS
9R%                  U
5      n[        R                   " U	SS2SS2SU2SS24   [#        U	SS2SS2US2SS24   X,S	9/SS
9R%                  U
5      n	U R&                  (       a?  [(        R*                  " XU
UU R,                  (       a  U R.                  R0                  OSS9nOQXR2                  -  nXR                  SS5      -  n[5        X5      nUR7                  SS
9nU R/                  U5      nX-  nUR                  SS5      R                  XEU R8                  5      nU R;                  U5      nU R=                  U5      nU R?                  U5      nU$ )aS  Forward pass for the attention module.

Args:
    x: Input tensor of shape (batch_size, sequence_length, embedding_dim)
    rope: Rotary position embeddings tensor for position-aware attention
    attn_mask: Optional attention mask to apply during attention computation

Returns:
    Tensor of shape (batch_size, sequence_length, dim_out)
Nr+   r@   r   r   rA   rp   F)halfrG   rB   rC   rE   rF   ) rH   r3   rI   r   r/   rJ   rK   rr   rP   rs   rt   r5   r6   ro   getattrrf   catr   type_asr   rL   rM   rN   r    rO   r1   r   rQ   r0   r8   r9   r!   )r:   r>   rv   r   rR   rS   rT   r3   rU   rV   rW   nptrx   rX   s                 r   rY   AttentionRope.forward   s     ''a88((1+C++aAt~~t}}EMMaQRTUWXZ[\CjjmGA!A&&qT^^T]]KUUVWYZ[AA&&qT^^T]]KUUVWYZ[AA&&qT^^T]]KUUVWYZ[A{{1~t{{1~1((C46D		1Q4C4]+-@1aq=AQSW-cdjklttuvwA		1Q4C4]+-@1aq=AQSW-cdjklttuvwA??..a#.2mm$..**A JJAB++D!$2D<<B<'D>>$'DAKK1%%aDMM:IIaLIIaLNN1r   )r0   r    r   r/   r6   rs   r8   r   ro   r9   r!   r5   rr   r3   rp   r1   rt   )r[   NTTr   rB   rB   NNFFTFNN)NN)r\   r]   r^   r_   r`   rf   jitr   ra   rb   rc   r   rd   r   r   re   r.   rg   rY   rh   ri   rj   s   @r   rl   rl   r   s9    		%%
 %)!"%&!!+/*.!$" %#F/F/ F/ c]	F/
 F/ F/  #F/ F/ F/ $C=F/ RYYF/ F/ F/ F/ F/ F/V ,004	6 5<<(6  -	6 6r   rl   r   )typingr   r   r   rf   r   torch.nnr   rL   _fxr	   configr
   pos_embed_sincosr   fxwraprg   r   re   r   rl   r   r   r   <module>r      s    ( (   $ * " 1 ?5<< ?HU\\4J ?  ?]		 ]@HBII Hr   
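

# --- Usage sketch (not part of the original timm module) --------------------
# A minimal smoke test for the two modules above, assuming execution via
# `python -m timm.layers.attention` so the relative imports resolve. The ROPE
# tensor shape used here, (num_tokens - prefix, head_dim * 2), assumes the
# concatenated sin/cos layout consumed by apply_rot_embed_cat; adjust it to
# match whatever rope generator you actually use.
if __name__ == '__main__':
    torch.manual_seed(0)
    B, N, D, H = 2, 197, 384, 6  # batch, tokens (1 cls + 196 patches), embed dim, heads

    x = torch.randn(B, N, D)

    # Standard attention with QK normalization (requires a norm_layer).
    attn = Attention(D, num_heads=H, qk_norm=True, norm_layer=nn.LayerNorm)
    print(attn(x).shape)  # torch.Size([2, 197, 384])

    # ROPE attention: the single prefix (cls) token is excluded from rotation.
    rope_attn = AttentionRope(D, num_heads=H, num_prefix_tokens=1)
    rope = torch.randn(N - 1, (D // H) * 2)  # assumed concatenated sin/cos per position
    print(rope_attn(x, rope=rope).shape)  # torch.Size([2, 197, 384])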