
    GjA9              
          U d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ dd	lmZ e
rdd
lmZ ddlZddlmZ dgZdaded<   e G d d                      Zed`d            Z	 dadbdZedcd            ZdddZ	 dedfd'Z	 	 dgdhd3Zdid7Zdjd9Z  ed:          Z!dkd=Z"	 	 	 dldmdHZ#	 dndodLZ$ddddddddMddN	dpdQZ%ddddddddRdqdSZ&ddddTdrdWZ'	 	 	 dsddYdtdZZ(ddYdud]Z) ej*        d^e_           dS )vzUBER PROTOTYPE!!!    )annotationsN)	dataclass)cache)AnyTYPE_CHECKING)TypeVarTupleUnpack   )	_registry)
ModuleType)Libraryregister_flash_attention_fa4
str | None_FA4_MODULE_PATHc                  "    e Zd ZU ded<   ddZdS )
_FA4HandlezLibrary | NonelibraryreturnNonec                    d | _         d S N)r   )selfs    Z/var/www/html/Carbon-Document/venv/lib/python3.11/site-packages/torch/nn/attention/_fa4.pyremovez_FA4Handle.remove"   s        N)r   r   )__name__
__module____qualname____annotations__r    r   r   r   r      s6              r   r   devicetorch.devicer   intc                J    t           j                            |           \  }}|S r   )torchcudaget_device_capability)r!   major_s      r   _get_device_majorr*   &   s     z//77HE1Lr   flash_attn.cute.interfacemodule_pathstrc                Z    t          |           }| at          t                                S )z
    Register FA4 flash attention kernels with the PyTorch dispatcher.

    Args:
        module_path: Python module path to the FA4 implementation.
    )_fa4_import_moduler   r   _fa4_register_kernels)r,   r)   s     r   r   r   ,   s,     	;''A"+--...r   r   c                    t          j        |           }t          |d          rt          |d          st          d|  d          |S )N_flash_attn_fwd_flash_attn_bwdzModule 'z' does not expose FA4 kernels)	importlibimport_modulehasattrRuntimeError)r,   modules     r   r/   r/   ;   sX    $[11F6,-- RWVEV5W5W RPkPPPQQQMr   r   c                 @   t          ddd          } |                     dt          d           |                     dt          d           |                     dt          d           |                     dt
          d           |                     dt          d           | S )	NatenIMPLCUDA_flash_attention_forward+_flash_attention_forward_no_dropout_inplace_flash_attention_backward#_scaled_dot_product_flash_attention,_scaled_dot_product_flash_attention_backward)r   impl!_fa4_flash_attention_forward_impl4_fa4_flash_attention_forward_no_dropout_inplace_impl"_fa4_flash_attention_backward_impl4_fa4_scaled_dot_product_flash_attention_forward_impl5_fa4_scaled_dot_product_flash_attention_backward_impl)libs    r   r0   r0   C   s    
&&&
)
)CHH')JFSSSHH5<  
 HH(*LfUUUHH-<  
 HH6=  
 Jr   r    querytorch.Tensortensorstuple[torch.Tensor, ...]	cum_seq_qtorch.Tensor | Nonerequire_fp32$tuple[tuple[str, torch.Tensor], ...]c                   t          d |D                       sdS t          d |D                       dk    rdS | j        t          j        t          j        fvrdS |D ]!\  }}|j        t          j        k    r| dc S "||                                 dk    rd	S ||                                 d
k    rdS t          j        	                                sdS t          | j                  dvrdS d S )Nc              3  $   K   | ]}|j         V  d S r   )is_cuda.0ts     r   	<genexpr>z,_fa4_common_support_error.<locals>.<genexpr>_   s$      **Qqy******r   zinputs must be CUDA tensorsc                    h | ]	}|j         
S r    )r!   rT   s     r   	<setcomp>z,_fa4_common_support_error.<locals>.<setcomp>a   s    &&&AH&&&r   r
   inputs must share devicez'query dtype must be float16 or bfloat16z dtype must be float32   zdense query must be 4D   zragged query must be 3DzCUDA not available)	   
   z+FA4 requires compute capability 9.0 or 10.0)alllendtyper%   float16bfloat16float32dimr&   is_availabler*   r!   )rI   rK   rM   rO   nametensors         r   _fa4_common_support_errorri   Y   s!    **'***** -,,
&&g&&&''1,,)){5=%.99988$ 3 3f<5=((222222 )UYY[[A--''!1!1((:""$$ $##&&g55<<4r   keyvalue	dropout_pfloatreturn_debug_maskboolalibi_slopes	seqused_kblock_table
num_splits
int | Nonec
                   |dk    rdS |rdS |dS | |j         t          j        k    rdS |j        sdS t	          | j                  }
||
dk    rd|
 d	S |	|	d
k    r|
dk    rd|
 d	S t          | | ||f|          }|
|dk    rdS |S d S )N        dropout_p must be 0zreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDAr^   z+paged KV (block_table) not supported on SM 0r
   z-SplitKV (num_splits > 1) not supported on SM rZ   z(query, key, value must be on same device)ra   r%   int32rS   r*   r!   ri   )rI   rj   rk   rl   rn   rp   rq   rM   rr   rs   r(   errors               r   _fa4_forward_support_errorr{   s   s     C$$ 100++?ek)),,  	,++el++E5B;;EUEEEE*q..Ub[[GuGGGG%	U E
 ...==4r   grad_outout	logsumexpc           	     V    |dk    rdS t          || |||||f|d|ff          }||S d S )Nrv   rw   r~   )rO   )ri   )	r|   rI   rj   rk   r}   r~   rl   rM   rz   s	            r   _fa4_backward_support_errorr      s\     C$$%	5#uc95"I.0	  E 4r   valc                    | dk    rdn| S )z"need to convert -1 to None for FA4Nr    )r   s    r   _aten_to_fa4_window_sizer      s    "9944#%r   Ts
Unpack[Ts]tuple[Unpack[Ts]]c                 4    t          d | D                       S )Nc              3  B   K   | ]}|                     d d          V  dS )r
      N)	transposerT   s     r   rW   z#_transpose_dense.<locals>.<genexpr>   s0      44qQ""444444r   )tuple)rK   s    r   _transpose_denser      s    44G444444r   cu_seq_qcu_seq_kmax_qmax_kscalefloat | None	is_causalwindow_size_leftwindow_size_right!tuple[torch.Tensor, torch.Tensor]c                ,   t           t          d          t          t                     }||t          |	          t          |
          d||||||                                nd ||pd|d} |j        | ||fi |\  }}||                                fS )NFA4 not registeredTr
   )softmax_scalecausalr   r   
return_lsecu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_krq   
page_tablers   r}   )r   r7   r/   r   
contiguousr2   )rI   rj   rk   r   r   r   r   r   r   r   r   rq   r}   rr   rs   r8   kwargslses                     r   _fa4_run_forwardr      s    " /000 011F 45EFF56GHH  /8/DY))+++$! oA F &v%eS%BB6BBHC    r   Fdeterministic/tuple[torch.Tensor, torch.Tensor, torch.Tensor]c                   t           t          d          t          t                     }|                    ||||| |                                ||	t          |
          t          |          |||          \  }}}|||fS )Nr   )r   r   r   r   r   r   r   )r   r7   r/   r3   r   r   )r|   rI   rj   rk   r}   r~   r   r   r   r   r   r   r   r8   dqdkdvs                    r   _fa4_run_backwardr      s     /000 011F''12BCC23DEE# (  JBB r2:r   T	r   r   r   rq   rp   r}   rr   compute_auxiliaryrs   	cum_seq_kr   c
       	           t          | ||||	|||||
  
        }|t          d|           t          | |||||||
|||||||          \  }}|rnt          j        dt          j        | j                  }t          j        dt          j        | j                  }t          j        d| j        | j                  }nd }d }d }|||||fS )Nz)FA4 flash_attention forward unsupported: )r   )ra   r!   r    r   )	r{   r7   r   r%   zerosuint64r!   emptyra   )rI   rj   rk   rM   r   r   r   rl   r   rn   r   r   r   rq   rp   r}   rr   r   rs   rz   r   	rng_statephilox_offset
debug_masks                           r   rC   rC   
  s
   , ' E NuNNOOO HC"  KELNNN	Bel5<PPP[%+elKKK

	
Yz99r   )r   r   r   rq   rp   rr   rs   c               V    t          |||||||||	|
|||||| |d|          \  }}}}}|S )NFr   )rC   )r}   rI   rj   rk   rM   r   r   r   rl   r   rn   r   r   r   rq   rp   rr   rs   r)   r   s                       r   rD   rD   J  sb    * 8)+!'  OAsAq!* Jr   )r   r   r   r   unusedc                   t          | ||||||
|          }|t          d|           t          j                    }t	          | ||||||||||||          \  }}}|||fS )Nz*FA4 flash_attention backward unsupported: )r   r7   r%   $are_deterministic_algorithms_enabledr   )r|   rI   rj   rk   r}   r~   rM   r   r   r   rl   r   r   r   r   r   r   rz   r   r   r   r   s                         r   rE   rE   w  s    ( (	 	E OOOPPP>@@M" JBB r2:r   rv   r   c                  t          | ||||d d d           }|t          d|           t          | ||          \  }}	}
t          j        |           }|                    dd          }|                    d          }|	                    d          }t          ||	|
d d |||||||          \  }}}}}|                     d          }|                    d          }||d d |||||f	S )NzFA4 SDPA forward unsupported: r
   r   )r   r}   )r{   r7   r   r%   
empty_liker   sizerC   )rI   rj   rk   rl   r   rn   r   rz   qkvout_bhsdout_bshdmax_q_flashmax_k_flashr)   r   r   r   r   r   r   s                         r   rF   rF     s.    '	 	E CECCDDDuc511GAq!
 &&H!!!Q''H&&))K&&))K3T			4 4 40AsI}j JJqMMEHHQKKE
 
r   philox_seedr   c               T   t          | ||||||
d           }|t          d|           t          |||||           \  }}}}}|                    d          }|                    d          }	t	          ||||||d d ||	|
||||          \  }}}t          |||          \  }}}|||fS )NzFA4 SDPA backward unsupported: r   r   )r   r7   r   r   rE   )r|   rI   rj   rk   r}   r~   rM   r   r   r   rl   r   r   r   r   rz   r   r   r   ogor   r   r   s                           r   rG   rG     s    $ (	 	E DUDDEEE%eS%hGGNAq!QJJqMMEHHQKKE3
				  JBB" ""b"--JBBr2:r   FA4)register_fn)r!   r"   r   r#   )r+   )r,   r-   r   r   )r,   r-   r   r   )r   r   )r    )
rI   rJ   rK   rL   rM   rN   rO   rP   r   r   )NN)rI   rJ   rj   rJ   rk   rJ   rl   rm   rn   ro   rp   rN   rq   rN   rM   rN   rr   rN   rs   rt   r   r   )r|   rJ   rI   rJ   rj   rJ   rk   rJ   r}   rJ   r~   rJ   rl   rm   rM   rN   r   r   )r   rt   r   rt   )rK   r   r   r   )NNN) rI   rJ   rj   rJ   rk   rJ   r   rN   r   rN   r   rt   r   rt   r   r   r   ro   r   rt   r   rt   rq   rN   r}   rN   rr   rN   rs   rt   r   r   )F)r|   rJ   rI   rJ   rj   rJ   rk   rJ   r}   rJ   r~   rJ   r   rN   r   rN   r   r   r   ro   r   rt   r   rt   r   ro   r   r   )&rI   rJ   rj   rJ   rk   rJ   rM   rN   r   rN   r   r#   r   r#   rl   rm   r   ro   rn   ro   r   r   r   rt   r   rt   rq   rN   rp   rN   r}   rN   rr   rN   r   ro   rs   rt   )$r}   rJ   rI   rJ   rj   rJ   rk   rJ   rM   rN   r   rN   r   r#   r   r#   rl   rm   r   ro   rn   ro   r   r   r   rt   r   rt   rq   rN   rp   rN   rr   rN   rs   rt   )"r|   rJ   rI   rJ   rj   rJ   rk   rJ   r}   rJ   r~   rJ   rM   rN   r   rN   r   r#   r   r#   rl   rm   r   ro   r   rJ   r   rJ   r   r   r   rt   r   rt   )rv   FF)rI   rJ   rj   rJ   rk   rJ   rl   rm   r   ro   rn   ro   r   r   )r|   rJ   rI   rJ   rj   rJ   rk   rJ   r}   rJ   r~   rJ   rM   rN   r   rN   r   r#   r   r#   rl   rm   r   ro   r   rJ   r   rJ   r   r   )+__doc__
__future__r   r4   dataclassesr   	functoolsr   typingr   r   typing_extensionsr   r	    r   typesr   r%   torch.libraryr   __all__r   r   r   r*   r   r/   r0   ri   r{   r   r   r   r   r   r   rC   rD   rE   rF   rG   register_flash_attention_implr    r   r   <module>r      sx      # " " " " "     ! ! ! ! ! !       % % % % % % % % 2 2 2 2 2 2 2 2        !        ! ! ! ! ! ! #
  $  # # # #             3/ / / / /       4 :<	    F (,!% % % % %P   .& & & &
 \$5 5 5 5"  $'+!%! %! %! %! %!j  ! ! ! ! !` #'$(%)(,#'+"!)=: =: =: =: =: =:Z #'$(%)(,'+!'* * * * * *z #'$(%0 0 0 0 0 0n #: : : : : : :Z !3 3 3 3 3 3l (	 ';W X X X X X Xr   