
    |-jB9                        d Z ddlmZmZ ddlZddlmZ ddlmZm	Z	 ddl
mZmZmZmZ  ed          Z e            rdd	lmZ dd
lmZmZmZ erddlmZ ndZ e	j        e          Z G d d          Zdedeeeed         z  f         fdZ	 d'dej        dej        dej        dej        e ej        ej        f         z  fdZ!ej        e"z  Z#	 	 	 	 	 d(dej        de"dz  de e#e#f         dz  dedz  ddf
dZ$dej        de"dej        fd Z%	 	 	 d)d!ej&        j'        dej        dej        dej        d"eej        df         d#e(dz  d$e(dz  d%ej        dz  de ej        ej        dz  f         fd&Z)dS )*a7  
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https//github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
    )OptionalUnionN)version   )is_torch_flex_attn_availablelogging)get_torch_versionis_torch_greater_or_equalis_torch_less_or_equalis_torchdynamo_compilingz2.9.0)_DEFAULT_SPARSE_BLOCK_SIZE)	BlockMaskcreate_block_maskflex_attention)
AuxRequestc                   |     e Zd ZdZdZdZdZ fdZej	        
                    d          d             Zd Z xZS )WrappedFlexAttentionzh
    We are doing a singleton class so that flex attention is compiled once when it's first called.
    NFc                 l    | j         &t                                          |           | _         | j         S N)	_instancesuper__new__)clsargskwargs	__class__s      h/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/integrations/flex_attention.pyr   zWrappedFlexAttention.__new__D   s*    = !GGOOC00CM}    )	recursivec                    | j         r|| j        k    r|| _        t          d          r!t          j        t
          d          | _        nkt          j        t                                j
        dk    r$|r"t          j        t
          dd          | _        nt          j        t
                    | _        d| _         dS dS )	z>
        Initialize or update the singleton instance.
        2.5.1F)dynamicz2.6.0zmax-autotune-no-cudagraphs)r"   modeTN)_is_flex_compiledtrainingr   torchcompiler   _compiled_flex_attentionr   parser	   base_version)selfr%   s     r   __init__zWrappedFlexAttention.__init__J   s    
 % 	*T])B)B$DM%g.. N05nV[0\0\0\-- 02233@GKKPXK05"E8T1 1 1--
 16n0M0M-%)D""" *C)Br   c                     | j         S r   )r(   )r+   s    r   __call__zWrappedFlexAttention.__call__`   s    ,,r   )__name__
__module____qualname____doc__r   r$   r(   r   r&   compilerdisabler,   r.   __classcell__)r   s   @r   r   r   ;   s          I#     ^e,,* * -,**- - - - - - -r   r   
return_lsereturnr   c                 D    t           rd| rt          d          ndiS d| iS )aU  
    Requests the LSE from flex_attention in a version-agnostic fashion.

    Before torch 2.9, the LSE was requested via the boolean return_lse field. However, starting with
    torch 2.9, an AuxRequest object must be passed via the aux_request field. This method conditionally
    returns the correct form based on the python version.
    
return_auxT)lseNr6   )_TORCH_FLEX_USE_AUXr   )r6   s    r   get_flex_attention_lse_kwargsr<   d   s8      LjJjT2222dKK*%%r   Fquerykeyvaluec                 p    t                      s t          |                      nt          } || ||fi |S r   )r   r   r   )r=   r>   r?   r%   r   flex_attention_compileds         r   compile_friendly_flex_attentionrB   r   s\     G_F`F`t<28<<>>>ft""  	  r   Tattention_mask_2dattention_chunk_sizeoffsets	is_causalr   c                 v     j         \  }}|s|}|s|}|t          z  dz   t          z  }t          j        j                             dd||z
  f            j        }	                                 |@                                                    d          	                    d          dz
  |z   fdfd}
 fd}|s|n|n|
|>|d         
                    |	          |d         
                    |	          fd	}n}t          ||d|||	t          d
                     S )aG  
    IMPORTANT NOTICE: This function is deprecated in favor of using the mask primitives in `masking_utils.py`,
    and will be removed in a future version without warnings. New code should not use it. It is only kept here
    for BC for now, while models using it are being patched accordingly.

    Create a block (causal) document mask for a batch of sequences, both packed and unpacked.
    Create Block (causal) logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
    The resultant BlockMask is a compressed representation of the full (causal) block
    mask. BlockMask is essential for performant computation of flex attention.
    See: https://pytorch.org/blog/flexattention/

    Args:
        attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
        of shape (batch_size, total_seq_len). e.g.

        For unpacked sequence:
        [[1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0]]

        For packed sequence:
        [[1, 1, 1, 2, 2, 2, 0],
         [1, 1, 2, 2, 2, 3, 3]]

    Returns:
        BlockMask
       r   )r?   padNc                 l    ||k    }	| |f         	| |f         k    }| |f         dk    }||z  |z  }|S )z
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.
        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        r    )
	batch_idxhead_idxq_idxkv_idxcausal_maskdocument_maskpadding_mask
final_maskrC   document_idss
           r   causal_mask_modz4make_flex_block_causal_mask.<locals>.causal_mask_mod   sV     vo$Y%56,yRXGX:YY(E)9:Q> </-?
r   c                 V    | |f         | |f         k    } | |||          }||z  S )zU
        Combines the chunk mask with the causal mask for chunked attention.
        rL   )rM   rN   rO   rP   
chunk_maskcausal_doc_maskrV   
chunk_idxss         r   chunk_causal_mask_modz:make_flex_block_causal_mask.<locals>.chunk_causal_mask_mod   sC      	5 01Z	6@Q5RR
)/)XufMMO++r   c                 Z    | |f         | |f         k    }| |f         dk    }||z  }|S )zp
        Utilizes default attention mask to enable encoder and encoder-decoder
        attention masks.
        r   rL   )	rM   rN   rO   rP   rR   rS   rT   rC   rU   s	          r   default_mask_modz5make_flex_block_causal_mask.<locals>.default_mask_mod   sH    
 %Y%56,yRXGX:YY(F):;a?!M1
r   c                 4    |z   }|z   } | |||          S r   rL   )	rM   rN   rO   rP   offset_q	offset_kv	kv_offsetmask_mod_maybe_combinedq_offsets	         r   mask_modz-make_flex_block_causal_mask.<locals>.mask_mod   s.    x'H*I**9h)TTTr   r!   )rd   BHQ_LENKV_LENdevice_compile)shapeflex_default_block_sizer&   nn
functionalrI   ri   clonefill_cumsumtor   r   )rC   rD   query_length
key_lengthrE   rF   
batch_sizetotal_seq_lenpad_lenri   r[   r]   rd   rV   rZ   rU   ra   rb   rc   s   `            @@@@@@r   make_flex_block_causal_maskrx      s   D !2 7J #"
 %$55:>UUG+//0AQRT[^hThPi/jj%F$**,,L'"((**0033::2>>BH\]
     , , , , , ,	 	 	 	 	 	  m"25I5Q//Wl1:==((AJMM&))		U 	U 	U 	U 	U 	U 	U 	U
 +

+G444	 	 	 	r   hidden_statesn_repc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    rH   N)rk   expandreshape)ry   rz   batchnum_key_value_headsslenhead_dims         r   	repeat_kvr      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr   moduleattention_maskscalingsoftcaps_auxc           
         |                     dd          dk    rt          d          d }	d t          |t                    r|}	n|d d d d d d d |j        d         f         fd}
d}|j        d         }||dz
  z  dk    rTt          ||j        d         |j        d         z            }t          ||j        d         |j        d         z            }d	}|                     d
          }|j        j        dk    }|s|t          d          t          |||f|
|	|||| j	        dt          |          }|rt          r|\  }}|j        }n|\  }}|                    |j                  }||j        \  }}}}|                    dddd                              |||d          }|                    d          }t%          j        t%          j        ||gd          dd          }t%          j        ||z
            }||z  }|                    |j                  }n|}d }|                    dd                                          }||fS )Ndropoutg        r   z`flex_attention` does not support `dropout`. Please use it with inference only (`model.eval()`) or turn off the attention dropout in the respective config.c                 ~    t          j        | z            z  } | |         d         |         |         z   } | S )Nr   )r&   tanh)scorerM   rN   rO   rP   
score_maskr   s        r   	score_modz)flex_attention_forward.<locals>.score_mod!  sL    ej999E!Jy1!4U;FCCE r   TrH   Fkernel_optionscpuzhAttention sinks cannot be run on CPU with flex attention. Please switch to a different device, e.g. CUDA)r   
block_mask
enable_gqascaler   r%   rJ   )dim)r   keepdimr   )get
ValueError
isinstancer   rk   r   ri   typerB   r%   r<   r;   r:   rr   dtypeviewr|   	unsqueezer&   	logsumexpcatexp	transpose
contiguous)r   r=   r>   r?   r   r   r   r   r   r   r   r   num_local_query_headsr   r6   flex_attention_outputattention_outputauxr:   ru   	num_heads	seq_len_q_sinkslse_expandedcombined_lserenorm_factorr   s         `                    @r   flex_attention_forwardr     s    zz)S!!A%%a
 
 	

 JJ.),, $#

#
111aaa39R= 89
      J!KN 	!6!:;AAU[^sy|;<<%Q5;q>!ABB
ZZ 011N"e+J 
%+v
 
 	
 < %   (

3
3      	:$9!c'CC$9!c ffU[!!2B2H/J	9aJJq"a++22:y)UVWWE
 ==,,L ?59lE5JPR+S+S+SY[eijjjL "Il\&ABBM/-?/225;??0'11!Q77BBDDS  r   )F)NNNNT)NNN)*r2   typingr   r   r&   	packagingr   utilsr   r   utils.import_utilsr	   r
   r   r   r;   !torch.nn.attention.flex_attentionr   rl   r   r   r   r   
get_loggerr/   loggerr   booldictstrr<   TensortuplerB   intOffsetrx   r   rm   Modulefloatr   rL   r   r   <module>r      sj   8 # " " " " " " "        9 9 9 9 9 9 9 9            0/88   !! gggggg^^^^^^^^^^ @@@@@@@
 
	H	%	%&- &- &- &- &- &- &- &-R&d &tCQ]H^A^<^7_ & & & &$ 	 <	 < \E%,455   $ 
	 (,,0!o o|o*o
 66>"T)o d{o o o o od	UU\ 	U# 	U%, 	U 	U 	U 	U$ ! !%g! g!HOg!<g! 
g! <	g!
 %,34g! T\g! T\g! <$g! 5<,,-g! g! g! g! g! g!r   