
    |-jy                     d   d dl Z ddlmZmZ ddlmZ  ej        e          Z e            Z	de j
        de j        j        de j        fdZ	 	 	 	 	 	 dde j        j        de j
        d
e j
        de j
        de j
        dz  dededz  dedz  dedz  dedz  de j
        dz  dee j
        df         fdZdS )    N   )_flash_attention_forward!flash_attn_supports_top_left_mask)loggingquerymodulereturnc                 ,   | j         t          j        k    r~t          j        d          rt          j        d          S t          |j        d          r|j        j         S t          d |                                D                       j	        j         S dS )ziIf the query is in float32, return a target dtype compatible with flash attention. Return None otherwise.cuda_is_quantizedc              3   X   K   | ]%}t          |t          j        j                  !|V  &d S )N)
isinstancetorchnnLinear).0layers     i/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/integrations/flash_attention.py	<genexpr>z#get_target_dtype.<locals>.<genexpr>   s8      bb%z%QVQYQ`?a?abbbbbbb    N)
dtyper   float32is_autocast_enabledget_autocast_dtypehasattrconfignextmodulesweight)r   r   s     r   get_target_dtyper       s    {em##$V,, 	p+F333V]O44 	p=&&bb6>>+;+;bbbbbioo4r           keyvalueattention_maskdropoutscalingsliding_windowsoftcap	is_causals_auxc                 T   |                     dd          rt                              d           |j        d         }t	          d |j        D                       rt          d          |                    dd          }|                    dd          }|                    dd          }t          ||           }|	|	n| j        }	t          ||||f||	||||t          || j        j        t          | d          r| j        nd |
|
                    |j                  nd d	|}|d fS )
Noutput_attentionsFzFlash Attention does not support `output_attentions=True`. Please set your attention to `eager` if you want any of these features.r   c              3   "   K   | ]
}|d k    V  dS )r   N )r   dims     r   r   z*flash_attention_forward.<locals>.<genexpr>0   s&      
+
+3!8
+
+
+
+
+
+r   zTensor query has shape  with a zero dimension.
FlashAttention does not support inputs with dim=0.
Please check your input shapes or use SDPA instead.   	layer_idx)query_lengthr)   r%   softmax_scaler'   r(   use_top_left_masktarget_dtypeattn_implementationr1   r*   )getloggerwarning_onceshapeany
ValueError	transposer    r)   r   _use_top_left_maskr   _attn_implementationr   r1   tor   )r   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   kwargsseq_lenr5   attn_outputs                  r   flash_attention_forwardrD      sw    zz%u-- 
W	
 	
 	
 k!nG

+
+u{
+
+
+++ 
B
 
 	
 OOAq!!E
--1

COOAq!!E $E622L '2		8HI*	
 %,!"M>&-fk&B&BL&""   HHU[!!!% ( ) K. r   )r!   NNNNN)r   modeling_flash_attention_utilsr   r   utilsr   
get_logger__name__r8   r>   Tensorr   Moduler   r    floatintbooltuplerD   r.   r   r   <module>rO      s~    h h h h h h h h       
	H	%	%6688 
EL 
%(/ 
ek 
 
 
 
&  !% !!%C CHOC<C 
C <	C
 L4'C C T\C $JC T\C d{C <$C 5<C C C C C Cr   