
    RЦi,                         S r SSKJrJrJrJr  SSKrSSKJr  SSKJs  J	r
  SSKJr  SSKJr  SSKJr  S	\\   4S
 jr " S S\R&                  5      r " S S\R&                  5      rg)a\  Halo Self Attention

Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones`
    - https://arxiv.org/abs/2103.12731

@misc{2103.12731,
Author = {Ashish Vaswani and Prajit Ramachandran and Aravind Srinivas and Niki Parmar and Blake Hechtman and
    Jonathon Shlens},
Title = {Scaling Local Self-Attention for Parameter Efficient Visual Backbones},
Year = {2021},
}

Status:
This impl is a WIP, there is no official ref impl and some details in paper weren't clear to me.
The attention mechanism works but it's slow as implemented.

Hacked together by / Copyright 2021 Ross Wightman
    )ListOptionalTupleUnionN)nn   )make_divisible)trunc_normal_)_assertpermute_maskc                    U R                   u  p4pVUR                   S   nUS-   S-  nXR                  SS5      -  n	U	R                  SXW5      n	[        R                  " U	SS/5      R                  S5      n
[        R                  " U
SXu-
  /5      n
U
R                  SUS-   U5      n
U
SS2SU2US-
  S24   n	U	R                  X4SXX5      R                  SSUSS5      n	U	R                  U5      $ )aa  Compute relative logits along one dimension

As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925

Args:
    q: (batch, height, width, dim)
    rel_k: (2 * window - 1, dim)
    permute_mask: permute output dim according to this
r   r      N)shape	transposereshapeFpadflattenexpandpermute)qrel_kr   BHWdimrel_sizewin_sizexx_pads              T/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/layers/halo_attn.pyrel_logits_1dr$      s     77LA!{{1~H1"H	
__R$	$A			"a"A EE!aV$$Q'EEE%!X\*+E MM"a!eX.Ea!X\]"#A 	
		!1'..r2xRHA99\""    c            	       T   ^  \ rS rSrSr  SS\S\S\S\4U 4S jjjrS rS	 r	S
r
U =r$ )PosEmbedRel=   zRelative Position Embedding
As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925


block_sizer    dim_headscalec                 J  > XVS.n[         TU ]  5         Xl        X0l        X@l        [
        R                  " [        R                  " US-  S-
  U40 UD65      U l	        [
        R                  " [        R                  " US-  S-
  U40 UD65      U l
        U R                  5         g)z
Args:
    block_size: block size
    win_size: neighbourhood window size
    dim_head: attention head dim
    scale: scale factor (for init)
devicedtyper   r   N)super__init__r)   r*   r+   r   	Parametertorchempty
height_rel	width_relreset_parameters)	selfr)   r    r*   r+   r.   r/   dd	__class__s	           r#   r1   PosEmbedRel.__init__C   s      /$ 
,,u{{8a<!3CX'TQS'TUekk(Q,2BH&SPR&STr%   c                     [         R                  R                  R                  U R                  U R
                  S9  [         R                  R                  R                  U R                  U R
                  S9  g )Nstd)r3   r   initnormal_r5   r+   r6   )r8   s    r#   r7   PosEmbedRel.reset_parameters^   sH    doo4::>dnn$**=r%   c                 *   UR                   u  p#pEUR                  SU R                  U R                  U R                  5      n[	        XR
                  SS9nUR                  SS5      n[	        XR                  SS9nXv-   nUR                  X#US5      nU$ )Nr   )r   r      r      )r   r   r   )r   rC   r   rD   r   )r   r   r)   r*   r$   r6   r   r5   )	r8   r   r   BBHW_rel_logits_wrel_logits_h
rel_logitss	            r#   forwardPosEmbedRel.forwardb   s    wwr IIb$//4??DMMJ$Q_U KK1$QoV!0
''r26
r%   )r)   r*   r5   r+   r6   )NN)__name__
__module____qualname____firstlineno____doc__intfloatr1   r7   rK   __static_attributes____classcell__r:   s   @r#   r'   r'   =   sO          	 
    6> r%   r'   c                      ^  \ rS rSrSr             SS\S\\   S\\\\4      S\S\S\\   S	\S
\S\S\	S\	S\	4U 4S jjjr
S rS rSrU =r$ )HaloAttnr   ac  Halo Attention

Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones`
    - https://arxiv.org/abs/2103.12731

The internal dimensions of the attention module are controlled by the interaction of several arguments.
  * the output dimension of the module is specified by dim_out, which falls back to input dim if not set
  * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim
  * the query and key (qk) dimensions are determined by
    * num_heads * dim_head if dim_head is not None
    * num_heads * (dim_out * attn_ratio // num_heads) if dim_head is None
  * as seen above, attn_ratio determines the ratio of q and k relative to the output if dim_head not used

Args:
    dim (int): input dimension to the module
    dim_out (int): output dimension of the module, same as dim if not set
    feat_size (Tuple[int, int]): size of input feature_map (not used, for arg compat with bottle/lambda)
    stride: output stride of the module, query downscaled if > 1 (default: 1).
    num_heads: parallel attention heads (default: 8).
    dim_head: dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set
    block_size (int): size of blocks. (default: 8)
    halo_size (int): size of halo overlap. (default: 3)
    qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0)
    qkv_bias (bool) : add bias to q, k, and v projections
    avg_down (bool): use average pool downsample instead of strided query blocks
    scale_pos_embed (bool): scale the position embedding as well as Q @ K
r   dim_out	feat_sizestride	num_headsr*   r)   	halo_sizeqk_ratioqkv_biasavg_downscale_pos_embedc                   > XS.n[         TU ]  5         U=(       d    UnX%-  S:X  d   eUS;   d   eXPl        U=(       d    [        X)-  SS9U-  U l        X R                  -  U l        XPR                  -  U l        XPR
                  -  U l        U R                  S-  U l        Xl	        U=U l
        U l        Xl        XxS-  -   U l        SU l        S	nUS:  a>  U=(       d    Xt-  S:g  nU(       a  SOUU l        U R                  U R                  -  U l        [        R                   " XR                  S4U R                  U
S
.UD6U l        [        R                   " XR                  U R                  -   S4SU
0UD6U l        ['        SU R                  U R                  U R                  U R                  S.UD6U l        U(       a  [        R*                  " SS5      O[        R,                  " 5       U l        U R1                  5         g )Nr-   r   )r   r      )divisor      r   r   F)r\   biasrg   )r)   r    r*   r+    )r0   r1   r]   r	   dim_head_qk
dim_head_v
dim_out_qk	dim_out_vr+   rb   r)   block_size_dsr^   r    block_strider   Conv2dr   kvr'   	pos_embed	AvgPool2dIdentitypoolr7   )r8   r   rZ   r[   r\   r]   r*   r)   r^   r_   r`   ra   rb   r.   r/   r9   use_avg_poolr:   s                    r#   r1   HaloAttn.__init__   s   " /.S"a'''"#a~g6HRS'TXa'a!^^3#&6&66"__4%%-
./99$,""]2A:#?z':a'?L%1vD!%D4E4E!ED
 3b4;L;LS[b_ab))C4>>!A1Z8ZWYZ$ 
))]]%%**	

 
 +7BLLA&BKKM	r%   c                 h   U R                   R                  R                  S   S-  n[        U R                   R                  US9  [        U R                  R                  US9  [        U R
                  R                  U R                  S9  [        U R
                  R                  U R                  S9  g )Nr   rf   r=   )	r   weightr   r
   rp   rq   r5   r+   r6   )r8   r>   s     r#   r7   HaloAttn.reset_parameters   ss    ffmm!!!$,dffmm-dggnn#.dnn//TZZ@dnn..DJJ?r%   c                 l   UR                   u  p#pE[        X@R                  -  S:H  S5        [        XPR                  -  S:H  S5        X@R                  -  nXPR                  -  nXg-  nU R                  U5      n	U	R	                  SU R
                  X`R                  XpR                  5      R                  SSSSSS5      n	U	R	                  X R                  -  U R
                  SU5      R                  SS5      n	U R                  U5      n
[        R                  " XR                  U R                  U R                  U R                  /5      n
U
R                  SU R                  U R                  5      R                  SU R                  U R                  5      R	                  X R                  -  U R
                  U R                   -   US5      R                  SSSS5      n
["        R$                  " XR
                  U R                   /SS	9u  pU R&                  (       a4  XR                  SS
5      -  U R)                  U	5      -   U R*                  -  nO3XR                  SS
5      -  U R*                  -  U R)                  U	5      -   nUR-                  SS	9nX-  R                  SS5      nUR	                  SU R                  U R                  Xg5      nUR                  SSSSS5      R/                  5       R1                  X R2                  X@R4                  -  XPR4                  -  5      nU R7                  U5      nU$ )Nr    r   r   rC      r   rD   )r   r   )r   r   r)   r   r   ri   rm   r   r]   r   rp   r   r   r^   unfoldr    rj   r3   splitrb   rq   r+   softmax
contiguousviewrl   rn   rt   )r8   r!   r   Cr   r   num_h_blocksnum_w_blocks
num_blocksr   rp   kvattnouts                  r#   rK   HaloAttn.forward   s   WW
aOO#q("-OO#q("-OO+OO+!0
FF1III  ,,l<N<NPPWPWXY[\^_abdeghPi 	
 IIa..($*:*:B
KUUVWYZ[ WWQZ UU2WXYYq$--9@@DMMSWSbSbckk 0 04?? BJPRTT[T[\]_`bcefTg 	{{2 0 0$//BK B++dnnQ.??4::MDB++tzz9DNN1<MMD|||#x""1a(kk"d00$2D2Dlakk!Q1a(335::~~q$5$55q<M<M7MO iin
r%   )r)   rm   rn   ri   rj   rk   rl   r^   rp   r]   rt   rq   r   r+   rb   r    )NNr   rd   Nrd   rC   g      ?FFFNN)rM   rN   rO   rP   rQ   rR   r   r   rS   boolr1   r7   rK   rT   rU   rV   s   @r#   rX   rX   r   s    < &*37&*!""$)7 7  c]7   c3h0	7 
 7  7  sm7  7  7  7  7  7  "7  7 r@) )r%   rX   )rQ   typingr   r   r   r   r3   r   torch.nn.functional
functionalr   helpersr	   weight_initr
   trace_utilsr   rR   r$   Moduler'   rX   rh   r%   r#   <module>r      s`   $ 0 /     # &  #$s) #>2")) 2jEryy EPr%   