
    f,jR                    >   U d Z ddlmZ ddlZddlZddlmZ erddlmZ ddl	m
Z
 ddlmZ ddlmZmZ ddlZdd	lmZ d
dlmZ dgZdaded<   daded<   e
 G d d                      Zedhd            Z	 didjdZdkdZdldZdmd+Zdnd2Zdod9Z  ed:          Z!dpd=Z"dqd?Z#	 	 	 	 	 	 drdsdJZ$	 dtdudPZ%	 	 	 dvddQdQdddddRddS	dwdVZ&ddQdQdddddWdxdXZ'ddQdQddddddYdydZZ(dddd[dzd^Z)	 	 	 	 	 	 d{dd`d|daZ*	 	 	 d}dd`d~dbZ+dd`ddeZ, ej-        dfeg           dS )z
PROTOTYPE!
Flash Attention 3 implementation.
For fp8: only supports forward pass right now.
For fp16/bf16: supports forward and backward pass.
    )annotationsN)TYPE_CHECKING)Callable)	dataclass)cache)TypeVarTupleUnpack)Library   )	_registryregister_flash_attention_fa3zCallable | None_FA3_CUDA_FWD_FA3_CUDA_BWDc                  "    e Zd ZU ded<   ddZdS )
_FA3HandlezLibrary | NonelibraryreturnNonec                R    d | _         t          j                            d           d S )NF)r   torch_C_set_sdp_use_fa3)selfs    W/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/torch/nn/attention/_fa3.pyremovez_FA3Handle.remove*   s%    !!%(((((    N)r   r   )__name__
__module____qualname____annotations__r    r   r   r   r   &   s6         ) ) ) ) ) )r   r   devicetorch.devicer   intc                J    t           j                            |           \  }}|S N)r   cudaget_device_capability)r"   major_s      r   _get_device_majorr+   0   s     z//77HE1Lr   flash_attn_interfacemodule_pathstrc                    t          |            t          j                            d           t	          t                                S )z
    Register FA3 flash attention kernels with the PyTorch dispatcher.

    Args:
        module_path: Python module path to the FA3 implementation.
    T)_fa3_import_moduler   r   r   r   _fa3_register_kernelsr-   s    r   r   r   6   s?     {### 
Hd###+--...r   r   c                   t          j        |            t          t          j        d          st          d|  d          t          t          j        j        d          st          d|  d          t          t          j        j        d          st          d|  d          t          j        j        j        at          j        j        j	        a
d S )Nflash_attn_3zModule 'z' does not expose FA3 kernelsfwdz%' does not expose FA3 forward kernelsbwdz&' does not expose FA3 backward kernels)	importlibimport_modulehasattrr   opsRuntimeErrorr4   r5   r   r6   r   r2   s    r   r0   r0   G   s    K(((59n-- RPkPPPQQQ59)511 
I{III
 
 	
 59)511 
J{JJJ
 
 	
 I*.MI*.MMMr   r
   c                    t          ddd          } |                     dt          d           |                     dt          d           |                     dt          d           |                     dt
          d           |                     dt          d           |                     d	t          d           |                     d
t          d           | S )NatenIMPLCUDAz"_flash_attention_forward.quantizedz-_scaled_dot_product_flash_attention.quantized_flash_attention_forward+_flash_attention_forward_no_dropout_inplace#_scaled_dot_product_flash_attention_flash_attention_backward,_scaled_dot_product_flash_attention_backward)	r
   impl!_fa3_flash_attention_forward_impl4_fa3_scaled_dot_product_flash_attention_forward_impl)_fa3_flash_attention_forward_impl_default4_fa3_flash_attention_forward_no_dropout_inplace_impl<_fa3_scaled_dot_product_flash_attention_forward_impl_default"_fa3_flash_attention_backward_impl5_fa3_scaled_dot_product_flash_attention_backward_impl)libs    r   r1   r1   X   s    
&&&
)
)CHH,.OQW   HH7<  
 HH"$Mv   HH5<  
 HH-D   HH(*LfUUUHH6=  
 Jr   querytorch.Tensortensorstuple[torch.Tensor, ...]	dropout_pfloat	cum_seq_qtorch.Tensor | None	q_descale	k_descale	v_descale
str | Nonec                   |dk    rdS t          d |D                       sdS t          d |D                       dk    rdS | j        t          j        k    r |||t          j        dt                     ||                                 d	k    rd
S ||                                 dk    rdS t          j	        
                                sdS t          | j                  dk    rdS d S )N        zdropout_p must be 0c              3  $   K   | ]}|j         V  d S r&   )is_cuda.0ts     r   	<genexpr>z,_fa3_common_support_error.<locals>.<genexpr>   s$      **Qqy******r   zinputs must be CUDA tensorsc                    h | ]	}|j         
S r!   )r"   r^   s     r   	<setcomp>z,_fa3_common_support_error.<locals>.<setcomp>   s    &&&AH&&&r   r   inputs must share devicezWhen using SDPA with fp8, descale tensor should always be used for accurate dequantization. Please use _scaled_dot_product_attention_quantized and provide the descale tensors.   zdense query must be 4D   zragged query must be 3DzCUDA not available	   z#FA3 requires compute capability 9.0)alllendtyper   float8_e4m3fnwarningswarnUserWarningdimr'   is_availabler+   r"   )rN   rP   rR   rT   rV   rW   rX   s          r   _fa3_common_support_errorrq   y   s    C$$**'***** -,,
&&g&&&''1,,)){e)))Y.)2C+ 	
 	
 	
 UYY[[A--''!1!1((:""$$ $##&&!++444r   keyvaluereturn_debug_maskboolalibi_slopes	seqused_kc           	     t   |rdS |dS | |j         t          j        k    rdS |j        sdS t          j        t          j        t          j        ft          fd| ||hD                       sd S t          d | ||hD                       dk    rd	S t          | | ||f||||	|
          }|
|d
k    rdS |S d S )Nzreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDAc              3  *   K   | ]}|j         v V  d S r&   rj   r_   r`   supported_dtypess     r   ra   z-_fa3_forward_support_error.<locals>.<genexpr>   s+      HHqqw**HHHHHHr   inputs must be one of c                    h | ]	}|j         
S r!   rz   r^   s     r   rc   z-_fa3_forward_support_error.<locals>.<setcomp>   s    111AG111r   r   #all inputs must have the same dtyperd   z(query, key, value must be on same device)
rj   r   int32r]   rk   float16bfloat16rh   ri   rq   )rN   rr   rs   rR   rt   rv   rw   rT   rV   rW   rX   errorr|   s               @r   _fa3_forward_support_errorr      s     100++?ek)),,  	,+++U]ENKHHHHUC4GHHHHH ;:(8:::
11eS%011122a7744%	U E ...==4r   grad_outout	logsumexpwindow_size_left
int | Nonewindow_size_rightc
           	     f   |j         t          j        k    r	 dS |j         t          j        k    rdS t          j        t          j        ft          fd| ||||hD                       sd S t          d | ||||hD                       dk    rdS t          || |||||f||d d d           }
|
|
S d S )NzHFA3 backward does not support fp8 - use inference only (torch.no_grad())zlogsumexp dtype must be float32c              3  *   K   | ]}|j         v V  d S r&   rz   r{   s     r   ra   z._fa3_backward_support_error.<locals>.<genexpr>   s+      WWqqw**WWWWWWr   r}   c                    h | ]	}|j         
S r!   rz   r^   s     r   rc   z._fa3_backward_support_error.<locals>.<setcomp>   s    @@@AG@@@r   r   r   )	rj   r   rk   float32r   r   rh   ri   rq   )r   rN   rr   rs   r   r   rR   rT   r   r   r   r|   s              @r   _fa3_backward_support_errorr      s     {e)))V	
 	
 %-''00u~6WWWWXuc5RU4VWWWWW ;:(8:::
@@hsE3?@@@AAQFF44%	5#uc95 E 4r   Ts
Unpack[Ts]tuple[Unpack[Ts]]c                 4    t          d | D                       S )Nc              3  B   K   | ]}|                     d d          V  dS )r      N)	transposer^   s     r   ra   z#_transpose_dense.<locals>.<genexpr>   s0      44qQ""444444r   )tuple)rP   s    r   _transpose_denser      s    44G444444r   xc                d    | -|                      d          dk    r|                                 n| S )z2Ensure tensor is contiguous in the last dimension.Nr   )stride
contiguous)r   s    r   _maybe_contiguousr      s,    ]qxx||q/@/@1<<>>>aGr   cu_seq_qcu_seq_kmax_qmax_kscalefloat | None	is_causalblock_table
num_splits!tuple[torch.Tensor, torch.Tensor]c                   t           t          d          t          |           }t          |          }|j        t          j        k    rF|                    d          dk    r-|                    d          dk    r|                                nt          |          }t          |          }t          |          }t          |          }t          |          }t          g |||ddd|||dd||||ddddd||||||	|	nd|
|
nddddd|pt	          j                    rdnddt          j	        
                                pdR  \  }}}}||                                fS )	zF
    Run the FA3 forward pass by calling the C++ kernel directly.
    NFA3 not registeredr   r   r   r[   T)r   r;   r   rj   r   rk   r   r   $are_deterministic_algorithms_enabledr   _get_sm_carveout_experimental)rN   rr   rs   r   r   r   r   r   r   r   r   rw   r   rV   rW   rX   r   r   qkvcu_seqlens_qcu_seqlens_ksoftmax_lse	out_accumsoftmax_lse_accums                             r   _fa3_run_forwardr      s   . /000%  A#A ;%---LL!!LL!! 	 u%%  %X..L$X..L!),,I#K00K5B $6	$6	$6 	
$6 		$6
 	$6 	$6 	$6 	$6 	$6 	$6 	$6 	$6 	$6 	$6 	$6  	!$6" 	#$6$ 	%$6& 	'$6( 	)$6* 	+$6, 	-$6. 	/$60 	1$62 	3$64 -8b5$66 /:7$68 	
9$6: 	;$6< 	=$6> 	?$6@ 	 	F;==DAA1C$6D 	E$6F 	..005AG$6 $6 $62Ci!2J &&((((r   Fmax_seqlen_qmax_seqlen_kdeterministic/tuple[torch.Tensor, torch.Tensor, torch.Tensor]c                   t           t          d          t          |           }|                    d          dk    r|                                n|}|                    d          dk    r|                                n|}|                    d          dk    r|                                n|}t          |          }t          |          }t          j        |          }t          j        |          }t          j        |          }t          |||||||||||d d ||	|
|||d|t
          j                                        pd           |||fS )Nr   r   r   r[   r   )	r   r;   r   r   r   r   
empty_liker   r   )r   rN   rr   rs   r   r   r   r   r   r   r   r   r   r   r   doutr   r   r   olsedqdkdvs                           r   _fa3_run_backwardr   L  si   " /000 X&&D#ll2..!33AJJrNNa//SA#ll2..!33A#A
I
&
&C 
	!		B		!		B		!		B				


..005A-  0 r2:r   r   T	r   r   r   rw   rv   r   r   compute_auxiliaryr   	cum_seq_kr   c       	           t          | ||||	||||
||          }|t          d|           t          | |||||||||||||
||||          \  }}|rnt          j        dt          j        | j                  }t          j        dt          j        | j                  }t          j        d| j        | j                  }nd }d }d }|||||fS )Nz)FA3 flash_attention forward unsupported: )r   )rj   r"   r!   r   )	r   r;   r   r   zerosuint64r"   emptyrj   )rN   rr   rs   rT   r   r   r   rR   r   rt   rV   rW   rX   r   r   r   rw   rv   r   r   r   r   r   r   	rng_statephilox_offset
debug_masks                              r   rF   rF     s   2 ' E NuNNOOO% HC(  KELNNN	Bel5<PPP[%+elKKK

	
Yz99r   )r   r   r   rw   rv   r   r   c               T    t          |||||||||	|
d d d f|||||| |d|d	\  }}}}}|S )NFr   rF   )r   rN   rr   rs   rT   r   r   r   rR   r   rt   r   r   r   rw   rv   r   r   r*   r   s                       r   rI   rI     sr    * 8 )+!-  OAsAq!0 Jr   )r   r   r   rw   rv   r   r   r   c
               J    t          | |||||||||	d d d |
|||||||          S )N)r   r   r   rw   rv   r   r   r   r   )rN   rr   rs   rT   r   r   r   rR   r   rt   r   r   r   rw   rv   r   r   r   s                     r   rH   rH     sZ    * -)+!+   r   )r   r   r   r   unusedc                   t          | ||||||
|||
  
        }|t          d|           t          j                    }t	          | |||||||||	||||nd||nd|          \  }}}|||fS )z0FA3 implementation of _flash_attention_backward.Nz*FA3 flash_attention backward unsupported: r   )r   r;   r   r   r   )r   rN   rr   rs   r   r   rT   r   r   r   rR   r   r   r   r   r   r   r   r   r   r   r   s                         r   rK   rK   ,  s    * ( E OOOPPP>@@M",8b.: JBB" r2:r   r[   r   c	               4   t          | ||||d d d |||          }
|
t          d|
           t          | ||          \  }}}| j        t          j        k    rt          j        n| j        }t	          j        | |          }|                    dd          }|	                    d          }|	                    d          }t          |||d d ||||||	||||          \  }}}}}| 	                    d          }|	                    d          }||d d |||||f	S )NzFA3 SDPA forward unsupported: rz   r   r   )r   r   rV   rW   rX   )r   r;   r   rj   r   rk   r   r   r   sizerF   )rN   rr   rs   rV   rW   rX   rR   r   rt   r   r   r   r   r   	out_dtypeout_bhsdout_bshdmax_q_flashmax_k_flashr*   r   r   r   r   r   r   s                             r   rG   rG   g  sa    ' E CECCDDDuc511GAq!
 #(+1D"D"D%+IY777H!!!Q''H&&))K&&))K3T			4 4 40AsI}j" JJqMMEHHQKKE
 
r   c               4    t          | ||d d d ||||
  
        S )Nr   )rG   )rN   rr   rs   rR   r   rt   r   s          r   rJ   rJ     s:     @   r   philox_seedr   c                  t          | ||||||
ddd
  
        }|t          d|           t          | ||||          \  }}}}}t          ||||||dd||	|
||||          \  }}}t          |||          \  }}}|||fS )zCFA3 implementation of _scaled_dot_product_flash_attention_backward.NzFA3 SDPA backward unsupported: r   )r   r;   r   rK   )r   rN   rr   rs   r   r   rT   r   r   r   rR   r   r   r   r   r   
grad_out_tq_tk_tv_tout_tr   r   r   dq_outdk_outdv_outs                              r   rL   rL     s    & (%eS)YdD E DUDDEEE (8%eS( ($JS#u 4  JBB& .b"b99FFF66!!r   FA3)register_fn)r"   r#   r   r$   )r,   )r-   r.   r   r   )r-   r.   r   r   )r   r
   )rN   rO   rP   rQ   rR   rS   rT   rU   rV   rU   rW   rU   rX   rU   r   rY   )rN   rO   rr   rO   rs   rO   rR   rS   rt   ru   rv   rU   rw   rU   rT   rU   rV   rU   rW   rU   rX   rU   r   rY   )r   rO   rN   rO   rr   rO   rs   rO   r   rO   r   rO   rR   rS   rT   rU   r   r   r   r   r   rY   )rP   r   r   r   )r   rU   r   rU   )NNNNNN)&rN   rO   rr   rO   rs   rO   r   rU   r   rU   r   r$   r   r$   r   r   r   ru   r   r   r   r   rw   rU   r   rU   rV   rU   rW   rU   rX   rU   r   rU   r   r   r   r   )F) r   rO   rN   rO   rr   rO   rs   rO   r   rO   r   rO   r   rU   r   rU   r   r   r   r   r   r   r   ru   r   r$   r   r$   r   ru   r   r   )NNN),rN   rO   rr   rO   rs   rO   rT   rU   r   rU   r   r$   r   r$   rR   rS   r   ru   rt   ru   rV   rU   rW   rU   rX   rU   r   r   r   r$   r   r$   rw   rU   rv   rU   r   rU   r   rU   r   ru   r   r   )$r   rO   rN   rO   rr   rO   rs   rO   rT   rU   r   rU   r   r$   r   r$   rR   rS   r   ru   rt   ru   r   r   r   r$   r   r$   rw   rU   rv   rU   r   rU   r   r   )$rN   rO   rr   rO   rs   rO   rT   rU   r   rU   r   r$   r   r$   rR   rS   r   ru   rt   ru   r   r   r   r$   r   r$   rw   rU   rv   rU   r   rU   r   rU   r   r   )"r   rO   rN   rO   rr   rO   rs   rO   r   rO   r   rO   rT   rU   r   rU   r   r$   r   r$   rR   rS   r   ru   r   rO   r   rO   r   r   r   r   r   r   )NNNr[   FF)rN   rO   rr   rO   rs   rO   rV   rU   rW   rU   rX   rU   rR   rS   r   ru   rt   ru   r   r   )r[   FF)rN   rO   rr   rO   rs   rO   rR   rS   r   ru   rt   ru   r   r   )r   rO   rN   rO   rr   rO   rs   rO   r   rO   r   rO   rT   rU   r   rU   r   r$   r   r$   rR   rS   r   ru   r   rO   r   rO   r   r   ).__doc__
__future__r   r7   rl   typingr   collections.abcr   dataclassesr   	functoolsr   typing_extensionsr   r	   r   torch.libraryr
    r   __all__r   r    r   r   r+   r   r0   r1   rq   r   r   r   r   r   r   r   rF   rI   rH   rK   rG   rJ   rL   register_flash_attention_implr!   r   r   <module>r      s     # " " " " "                   )(((((( ! ! ! ! ! !       2 2 2 2 2 2 2 2  ! ! ! ! ! !       #
 "& % % % %!% % % % % ) ) ) ) ) ) ) )     ./ / / / /"/ / / /"   B" " " "J( ( ( (V# # # #L \$5 5 5 5H H H H$  $%)%)%)'+!%N) N) N) N) N)@  8 8 8 8 8L &*%)%)D: %)(,#'+"!/D: D: D: D: D: D:h %)(,'+!'- - - - - -x %)(,'+#!'+ + + + + +| #'$(%8 8 8 8 8 8~ &*%)%)#D D D D D D DV #      P !2" 2" 2" 2" 2" 2"j (	 ';W X X X X X Xr   