
    RЦi                         S r SSKJrJrJr  SSKrSSKJr  SSKJs  J	r
  SSKJrJr  SSKJr  SSKJr  S\\   4S	 jr " S
 S\R(                  5      r " S S\R(                  5      rg)aO  Bottleneck Self Attention (Bottleneck Transformers)

Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605

@misc{2101.11605,
Author = {Aravind Srinivas and Tsung-Yi Lin and Niki Parmar and Jonathon Shlens and Pieter Abbeel and Ashish Vaswani},
Title = {Bottleneck Transformers for Visual Recognition},
Year = {2021},
}

Based on ref gist at: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2

This impl is a WIP but given that it is based on the ref gist likely not too far off.

Hacked together by / Copyright 2021 Ross Wightman
    )ListOptionalTupleN   )	to_2tuplemake_divisible)trunc_normal_)_assertpermute_maskc                    U R                   u  p4pVXR                  SS5      -  nUR                  SUSU-  S-
  5      n[        R                  " USS/5      R                  S5      n[        R                  " USUS-
  /5      nUR                  SUS-   SU-  S-
  5      nUSS2SU2US-
  S24   nUR                  X4SXU5      R                  SSUSS5      nUR                  U5      $ )ag  Compute relative logits along one dimension

As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925

Args:
    q: (batch, heads, height, width, dim)
    rel_k: (2 * width - 1, dim)
    permute_mask: permute output dim according to this
   r   r   N)shape	transposereshapeFpadflattenexpandpermute)	qrel_kr   BHWdimxx_pads	            Z/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/layers/bottleneck_attn.pyrel_logits_1dr!      s     77LA!	
__R$	$A			"aQ"A EE!aV$$Q'EEE%!QU$E MM"a!eQUQY/Ea!QUVmA 	
		!1 ''B2r:A99\""    c                   Z   ^  \ rS rSrSr  S
S\\\4   S\S\4U 4S jjjrS r	S r
S	rU =r$ )PosEmbedRel8   zRelative Position Embedding
As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925
	feat_sizedim_headscalec                   > XES.n[         TU ]  5         [        U5      u  U l        U l        X l        X0l        [        R                  " [        R                  " U R                  S-  S-
  U40 UD65      U l        [        R                  " [        R                  " U R                  S-  S-
  U40 UD65      U l        U R                  5         g )Ndevicedtyper   r   )super__init__r   heightwidthr'   r(   nn	Parametertorchempty
height_rel	width_relreset_parameters)selfr&   r'   r(   r+   r,   dd	__class__s          r    r.   PosEmbedRel.__init__=   s     /"+I"6TZ 
,,u{{4;;?Q3F'WTV'WXekk$**q.12Dh&URT&UVr"   c                     [         R                  R                  R                  U R                  U R
                  S9  [         R                  R                  R                  U R                  U R
                  S9  g )Nstd)r3   r1   initnormal_r5   r(   r6   r8   s    r    r7   PosEmbedRel.reset_parametersP   sH    doo4::>dnn$**=r"   c                    UR                   u  p#nUR                  X R                  U R                  S5      n[	        XR
                  SS9nUR                  SS5      n[	        XR                  SS9nXe-   nUR                  X#U5      nU$ )Nr   )r   r      r      )r   r   r   )r   rD   r   rE   r   )r   r   r/   r0   r!   r6   r   r5   )r8   r   r   HW_rel_logits_wrel_logits_h
rel_logitss           r    forwardPosEmbedRel.forwardT   s}    77q IIadjj"5$Q_U KK1$QoV!0
''r2
r"   )r'   r/   r5   r(   r0   r6   )NN)__name__
__module____qualname____firstlineno____doc__r   intfloatr.   r7   rK   __static_attributes____classcell__r:   s   @r    r$   r$   8   sM      S#X    	   &> r"   r$   c                      ^  \ rS rSrSr          SS\S\\   S\\\\4      S\S\S\\   S	\S
\	S\	4U 4S jjjr
S rS rSrU =r$ )BottleneckAttnd   a=  Bottleneck Attention
Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605

The internal dimensions of the attention module are controlled by the interaction of several arguments.
  * the output dimension of the module is specified by dim_out, which falls back to input dim if not set
  * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim
  * the query and key (qk) dimensions are determined by
    * num_heads * dim_head if dim_head is not None
    * num_heads * (dim_out * attn_ratio // num_heads) if dim_head is None
  * as seen above, attn_ratio determines the ratio of q and k relative to the output if dim_head not used

Args:
    dim (int): input dimension to the module
    dim_out (int): output dimension of the module, same as dim if not set
    stride (int): output stride of the module, avg pool used if stride == 2 (default: 1).
    num_heads (int): parallel attention heads (default: 4)
    dim_head (int): dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set
    qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0)
    qkv_bias (bool): add bias to q, k, and v projections
    scale_pos_embed (bool): scale the position embedding as well as Q @ K
r   dim_outr&   stride	num_headsr'   qk_ratioqkv_biasscale_pos_embedc                   > XS.n[         TU ]  5         Uc   S5       eU=(       d    UnX%-  S:X  d   eXPl        U=(       d    [        X'-  SS9U-  U l        X R                  -  U l        XPR                  -  U l        XPR
                  -  U l        U R                  S-  U l        Xl	        [        R                  " XR                  S-  U R                  -   S4S	U0UD6U l        [        U4U R                  U R                  S
.UD6U l        US:X  a  [        R                  " SS5      O[        R                   " 5       U l        U R%                  5         g )Nr*   zBA concrete feature size matching expected input (H, W) is requiredr      )divisor      r   r   bias)r'   r(   )r-   r.   r\   r   dim_head_qk
dim_head_v
dim_out_qk	dim_out_vr(   r_   r1   Conv2dqkvr$   	pos_embed	AvgPool2dIdentitypoolr7   )r8   r   rZ   r&   r[   r\   r'   r]   r^   r_   r+   r,   r9   r:   s                r    r.   BottleneckAttn.__init__z   s/    /$j&jj$.S"a'''"#a~g6HRS'TXa'a!^^3#&6&66"__4%%-
.99S//A"5"F_PX_\^_ %Yb9I9IQUQ[Q[b_ab*0A+BLLA&2;;=	r"   c                 (   [        U R                  R                  U R                  R                  R                  S   S-  S9  [        U R                  R
                  U R                  S9  [        U R                  R                  U R                  S9  g )Nr   rc   r=   )r	   rj   weightr   rk   r5   r(   r6   rA   s    r    r7   BottleneckAttn.reset_parameters   s\    dhhoo488??+@+@+Ct+KLdnn//TZZ@dnn..DJJ?r"   c                    UR                   u  p#pE[        X@R                  R                  :H  S5        [        XPR                  R                  :H  S5        U R                  U5      n[        R                  " XR                  U R                  U R                  /SS9u  pgnUR                  X R                  -  U R                  S5      R                  SS5      nUR                  X R                  -  U R                  S5      nUR                  X R                  -  U R                  S5      R                  SS5      nU R                  (       a$  Xg-  U R                  U5      -   U R                   -  n	O#Xg-  U R                   -  U R                  U5      -   n	U	R#                  SS9n	X-  R                  SS5      R                  X R                  XE5      n
U R%                  U
5      n
U
$ )N r   )r   r   r   )r   r
   rk   r/   r0   rj   r3   splitrg   rh   r   r\   re   r   rf   r_   r(   softmaxrn   )r8   r   r   Cr   r   r   kvattnouts              r    rK   BottleneckAttn.forward   su   WW
a^^***B/^^)))2.HHQK ++a//4??DNN!SYZ[aIIa..($*:*:B?II"bQIIa..($*:*:B?IIa..($//2>HHRPEDNN1--;DETZZ'$..*;;D|||#x""2r*221nnaKiin
r"   )
re   rf   rg   rh   r\   rn   rk   rj   r(   r_   )
NNr   rE   Ng      ?FFNN)rM   rN   rO   rP   rQ   rR   r   r   rS   boolr.   r7   rK   rT   rU   rV   s   @r    rX   rX   d   s    0 &*37&*!"$)" "  c]"   c3h0	" 
 "  "  sm"  "  "  ""  " H@
 r"   rX   )rQ   typingr   r   r   r3   torch.nnr1   torch.nn.functional
functionalr   helpersr   r   weight_initr	   trace_utilsr
   rR   r!   Moduler$   rX    r"   r    <module>r      sZ     ) (     . &  #$s) #8)")) )XURYY Ur"   