
    ёiJ                   h   S SK Jr  S SKJrJrJr  S SKrS SKJs  J	r
  S SKJr  S SKJr  S SKJr  S SKJr  S SKJrJrJr  \(       a  S S	KJr  S S
KJr  \   S/       S0S jj5       rS1S jr\     S2                   S3S jj5       r\     S2                   S4S jj5       r\     S2                   S5S jj5       r      S6S jrS7S jrS7S jr\   S8SSSSSS.                       S9S jjj5       r\   S8SSSSSS.                       S:S jjj5       r\   S8SSSSSS.                       S;S jjj5       r   S<SSSSSS.S jjr\   S8SSSSS.                 S=S jjj5       r\   S8SSSSS.                 S>S jjj5       r\   S8SSSSS.                 S?S  jjj5       r   S<SSSSS.S! jjr\       S@                               SAS" jj5       r \       S@                               SBS# jj5       r \       S@                               SCS$ jj5       r        SDS% jr              SES& jr!             SES' jr"\        SF                             SGS( jj5       r#\        SF                             SHS) jj5       r#\        SF                             SIS* jj5       r#        SJS+ jr# SKSSSSSSSSSSSS,.                             SLS- jjjr$        SMS. jr%g)N    )annotations)TYPE_CHECKINGLiteraloverloadN)_C_ops)in_dynamic_or_pir_mode)LayerHelper)signature_safe_contextmanager)
SDPBackend_get_enabled_backendssdpa_kernel)	Generator)TensorFTc              #  x  #    / nU(       a  UR                  [        R                  5        U(       a  UR                  [        R                  5        U (       a  UR                  [        R                  5        U(       d  [        S5      e[        U5       n Uv    SSS5        g! f = f! , (       d  f       g= f7f)z|
With the sdp_kernel context manager, different algorithm implementations can
be selected for scaled_dot_product_attention.
z$At least one backend must be enabledN)appendr   FLASH_ATTENTIONEFFICIENT_ATTENTIONMATH
ValueErrorr   )enable_mathenable_flashenable_mem_efficientbackend_listcontexts        d/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/nn/functional/flash_attention.py
sdp_kernelr   %   s      LJ667J::;JOO,?@@	\	"g	M	 
#	" 	 
#	"s0   BB:B)B$	B:$B&&B))
B73B:c                z    [         R                  " U S5      nSUl        [         R                  " USS9nSUl        U$ )Ng     T   )diagonal)paddle	full_likestop_gradienttriu)xmasks     r   get_triangle_upper_maskr&   B   s9    At$DD;;ta(DDK    .c	                    g N 	querykeyvaluer%   dropout_ratecausalreturn_softmaxtrainingscales	            r   _math_attentionr4   J        r'   c	                    g r)   r*   r+   s	            r   r4   r4   X         r'   c	                    g r)   r*   r+   s	            r   r4   r4   f        $'r'           c	                d   U R                   S   n	[        R                  " U / SQ5      n [        R                  " U/ SQ5      n[        R                  " U/ SQ5      nU=(       d    U	S:w  a  U	S-  OSn[        R                  " X-  USS9n
U(       d  Ub  X-   n
[        R
                  " U
5      nO[        R                  " 5       nS	U;   dL  S
U;   dF  U
R                   S   S:  d3  U
R                   S   S:  d   U
R                   S   U
R                   S   :w  a&  [        U
5      nX-   n
[        R
                  " U
5      nO[        R                  R                  U
5      nUS:  a  [        R                  " XUSS9n[        R                  " X5      n[        R                  " U/ SQ5      nX(       a  U4$ S4$ )zt
This is a basic implementation of scaled dot product attention composed of
combinations of fundamental components.
)r      r      r         g      ?T)r$   ytranspose_yNxpucpu    i @  r:   upscale_in_train)r2   mode)shaper    	transposematmulFsoftmax
get_devicer&   incubate softmax_mask_fuse_upper_triangledropout)r,   r-   r.   r%   r/   r0   r1   r2   r3   head_dimproductweightsplaceouts                 r   r4   r4   t   ss     {{2HUL1E


3
-CUL1E?Ahn3EmmemsEGnG))G$ !!#UN~}}R 2%}}R 5(}}R GMM"$55 +73DnGii(GooFFwOGc))H;M
 --
'C


3
-C>33t33r'   c                    U S::  a  gg)N   
flash_attnmem_efficientr*   )rQ   s    r   _select_sdp_cudarZ      s    3r'   c                J   [         R                  " 5       nSU;   a  g[        5       nU(       d  [        S5      e[        R
                  U;   n[        R                  U;   n[        R                  U;   nUSL a  USL a  USL a  gSU;  a  gUSL a  USL a  [        U 5      $ USL a  gg)	z
There are currently three different implementation options available for
scaled dot product attention, and the chosen approach depends on whether it
is determined by the sdp_kernel configuration or specified through input values.
rB   rX   z@No available backend for scaled_dot_product_attention was found.TFmathgpurY   )	r    rM   r   AssertionErrorr   r   r   r   rZ   )rQ   rT   enabled_backendsr   r   r   s         r   _select_sdpr`      s     E~,.N
 	
 //%55K--1AAL%99=MMd5 %9U%Bt 4 <))tr'   )fixed_seed_offsetrng_namer2   namesoftmax_scalec                   g r)   r*   r,   r-   r.   rP   r0   r1   ra   rb   r2   rc   rd   s              r   flash_attentionrg      s     r'   c                   g r)   r*   rf   s              r   rg   rg      s      r'   c                   g r)   r*   rf   s              r   rg   rg      s     $'r'    c                  U R                   S   n[        U5      nUS:X  Ga|  S[        R                  " 5       ;   a  SnOkS[        R                  " 5       ;   a  SnOO[        R                  " S/5      S   (       a  SnO-[        R
                  R                  R	                  S/5      S   n[        5       (       d  US:X  d   S5       eUS	:X  d  US:X  d   S
5       eU(       a  US:X  d   S5       eUb  US:X  d   S5       eUS:X  d  US:X  d   S5       eU(       d  US:X  d   S5       eU	b  US:X  d   S5       eU
b  US:X  d   S5       e[        5       (       a  US:X  a6  [        R                  " U UUUSUUUU(       + U5
      u  p  nUU(       a  U4$ S4$ US:X  aA  U
c  U R                   S   S-  n
[        R                  " U UUSSSSU
USSS	SSSS5      u  nnUS4$ [        SU 35      e[        S#0 [        5       D6nUR                  SS9nUR                  U5      nUR                  U5      nUR                  [        R                   5      nUR                  [        R"                  5      nU UUUS.nUUUUS.nUR%                  SUUUUUU(       + US.S9  UU(       a  U4$ S4$ US:X  a  SS KJn  U" U UUSUSUS!9nUS4$ [+        U UUUUUUS"9$ )$aj
  
The equation is:

.. math::

    result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
The dimensions of the three parameters are the same.
``d`` represents the size of the last dimension of the three parameters.

Warning:
    This API is only support inputs with dtype float16 and bfloat16.

Args:
    query(Tensor): The query tensor in the Attention module.
                    4-D tensor with shape:
                    [batch_size, seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    key(Tensor): The key tensor in the Attention module.
                    4-D tensor with shape:
                    [batch_size, seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    value(Tensor): The value tensor in the Attention module.
                    4-D tensor with shape:
                    [batch_size, seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    dropout(float): The dropout ratio.
    causal(bool): Whether enable causal mode.
    return_softmax(bool): Whether to return softmax.
    fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
    training(bool): Whether it is in the training phase.
    rng_name(str): The name to select Generator.
    name(str|None, optional): The default value is None. Normally there is no need for user
                    to set this property. For more information, please refer to
                    :ref:`api_guide_Name`.

Returns:
    out(Tensor): The attention tensor.
                4-D tensor with shape: [batch_size, seq_len, num_heads, head_dim].
                The dtype can be float16 or bfloat16.
    softmax(Tensor): The softmax tensor. None if return_softmax is False.

Examples:
    .. code-block:: python

        >>> import paddle

        >>> paddle.seed(2023)
        >>> q = paddle.rand((1, 128, 2, 16))

        >>> output = paddle.nn.functional.flash_attention.flash_attention(q, q, q, 0.9, False, False)
        >>> print(output)
        (Tensor(shape=[1, 128, 2, 16], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[[[0.34992966, 0.34456208, 0.45826620, ..., 0.39883569,
            0.42132431, 0.39157745],
           [0.76687670, 0.65837246, 0.69117945, ..., 0.82817286,
            0.76690865, 0.71485823]],
          ...,
          [[0.71662450, 0.57275224, 0.57053083, ..., 0.48108247,
            0.53336465, 0.54540104],
           [0.59137970, 0.51350880, 0.50449550, ..., 0.38860250,
            0.40526697, 0.60541755]]]]), None)

r>   rX   rB   r=   iluvatar_gpuFLAGS_cudnn_deterministicFLAGS_flash_attn_versionz2flash attention 3 only support dynamic or pir moder:   z*flash attention 3 does not support dropoutz1flash attention 3 does not support return softmaxNz6flash attention 3 does not support setting seed_offsetrj   z3flash attention 3 does not support setting rng_namez3flash attention 3 does not support setting trainingz/flash attention 3 does not support setting namez8flash attention 2 does not support setting softmax_scaler<   r?   r   Fr   !Invalid flash attention version: qinput_param_name)rp   kvra   rU   rL   softmax_lseseed_offsetrP   r0   r1   is_testrb   typeinputsoutputsattrsrY   memory_efficient_attention	attn_biaspr3   r2   r/   r0   r1   r2   )rX   )rH   r`   r    rM   	get_flagsbase	frameworkr   r   rX   flash_attn_v3r   r	   localsinput_dtype"create_variable_for_type_inferencefloat32int64	append_op-paddle.incubate.nn.memory_efficient_attentionr   r4   )r,   r-   r.   rP   r0   r1   ra   rb   r2   rc   rd   rQ   sdp_func_name
fa_versionresult_attentionresult_softmax_rU   rv   helperdtyperL   rw   r|   r}   r   outputs                              r   rg   rg     s   ^ {{1~H)M$F%%''Jv0022J:;<'
 J..88+,(*J &'':? 	
@	
: #~q 	
8	
0 "Z1_ 	
?	
4 !(J!O 	
D	
; 2~q 	
A	
0 :? 	
A	
* |zQ 	
=	
. $
a 	
F	
7 "##Q;A;L;L%" L<8!1a (&4N :>  q ($)KKO$=M#)#7#7!!$ [$ Dy  7
|D  6VX6""C"877>;;EB??O??M!2	
 &&	
 	" "0'<$	 	 	
 ~G77477O+ 0!F 4<"$-! r'   )ra   rb   r2   rc   c                   g r)   r*   qkvrP   r0   r1   ra   rb   r2   rc   s           r   flash_attn_qkvpackedr     r5   r'   c                   g r)   r*   r   s           r   r   r     r7   r'   c                   g r)   r*   r   s           r   r   r      r9   r'   c                  U R                   S   n[        U5      n	U	S:X  a  [        5       (       a4  [        R                  " U USUUUU(       + U5      u  n
nnnX(       a  U4$ S4$ [        S0 [        5       D6nUR                  SS9nUR                  U5      nUR                  U5      nUR                  [        R                  5      nUR                  [        R                  5      nU US.nUUUUS.nUR                  SUUUUUU(       + US	.S
9  X(       a  U4$ S4$ U SS2SS2SS24   R                  SSSU R                   S   /5      nU SS2SS2S4   nU SS2SS2S4   nU	S:X  a  SSKJn  U" UUUSUSUS9nUS4$ [!        UUUUUUUS9$ )ak	  
The equation is:

.. math::

    result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
The dimensions of the three parameters are the same.
``d`` represents the size of the last dimension of the three parameters.

Warning:
    This API only supports inputs with dtype float16 and bfloat16.
    Don't call this API if flash_attn is not supported.

Args:
    qkv(Tensor): The query/key/value packed tensor in the Attention module.
                    5-D tensor with shape:
                    [batchsize, seqlen , num_heads/num_heads_k + 2, num_heads_k, head_dim].
                    The dtype can be float16 or bfloat16.
    dropout(float): The dropout ratio.
    causal(bool): Whether enable causal mode.
    return_softmax(bool): Whether to return softmax.
    fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
    training(bool): Whether it is in the training phase.
    rng_name(str): The name to select Generator.
    name(str|None, optional): The default value is None. Normally there is no need for user
                    to set this property. For more information, please refer to
                    :ref:`api_guide_Name`.

Returns:
    - out(Tensor). The attention tensor. 4-D tensor with shape: [batch_size, seq_len, num_heads, head_dim]. The dtype can be float16 or bfloat16.
    - softmax(Tensor). The softmax tensor. None if return_softmax is False.

Examples:
    .. code-block:: python

        >>> # doctest: +SKIP('flash_attn need A100 compile')
        >>> import paddle

        >>> paddle.seed(2023)
        >>> q = paddle.rand((1, 128, 2, 16))
        >>> qkv = paddle.stack([q, q, q], axis=2)
        >>> output = paddle.nn.functional.flash_attn_qkvpacked(qkv, 0.9, False, False)
        >>> print(output)
        (Tensor(shape=[1, 128, 2, 16], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[[[0.34992966, 0.34456208, 0.45826620, ..., 0.39883569,
            0.42132431, 0.39157745],
           [0.76687670, 0.65837246, 0.69117945, ..., 0.82817286,
            0.76690865, 0.71485823]],
          ...,
          [[0.71662450, 0.57275224, 0.57053083, ..., 0.48108247,
            0.53336465, 0.54540104],
           [0.59137970, 0.51350880, 0.50449550, ..., 0.38860250,
            0.40526697, 0.60541755]]]]), None)
        >>> # doctest: -SKIP

r<   rX   Nr   r   rq   )r   ra   ru   rx   rz   rE   r   rY   r   r   r   )r   )rH   r`   r   r   r   r	   r   r   r   r    r   r   r   reshaper   r   r4   )r   rP   r0   r1   ra   rb   r2   rc   rQ   r   r   r   r   r   r   rU   rL   rv   rw   r|   r}   r,   r-   r.   r   r   s                             r   r   r     s   J yy}H)M$!## ++!	  $~^OO4OO@vx@""E":77>;;EB??O??M!2

 &&	
 	'" "0'<$	 	 	
 ~G77477 Aq#2#I&&1b#))B-'@A!Q(mAq"HO+ 0!F 4<"$-! r'   c                    g r)   r*   r,   r-   r.   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr3   rP   r0   r1   ra   rb   r2   rc   s                  r   flash_attn_unpaddedr     s    " r'   c                    g r)   r*   r   s                  r   r   r     s    "  r'   c                    g r)   r*   r   s                  r   r   r     s    " $'r'   c                   [        5       (       a9  [        R                  " U UUUUUSUUUUU	U
U(       + U5      u  nnX(       a  U4$ S4$ [        S	0 [	        5       D6nUR                  SS9nUR                  U5      nUR                  U5      nUR                  [        R                  5      nUR                  [        R                  5      nU UUUUUS.nUUUUS.nUR                  SUUUUUUU	U
U(       + US.S9  UU
(       a  U4$ S4$ )
a
  
The equation is:

.. math::

    result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
The dimensions of the three parameters are the same.
``d`` represents the size of the last dimension of the three parameters.

Warning:
    This API is only support inputs with dtype float16 and bfloat16.

Args:
    query(Tensor): The query tensor in the Attention module.
                    3-D tensor with shape:
                    [total_seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    key(Tensor): The key tensor in the Attention module.
                    3-D tensor with shape:
                    [total_seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    value(Tensor): The value tensor in the Attention module.
                    3-D tensor with shape:
                    [total_seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
                    used to index query.
    cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
                    used to index key and value.
    max_seqlen_q(int): Maximum sequence length of query in the batch.
    max_seqlen_k(int): Maximum sequence length of key/value in the batch.
    scale(float): The scaling of QK^T before applying softmax.
    dropout(float, optional): The dropout ratio.
    causal(bool, optional): Whether enable causal mode.
    return_softmax(bool, optional): Whether to return softmax.
    fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
    rng_name(str, optional): The name to select Generator.
    training(bool, optional): Whether it is in the training phase.
    name(str|None, optional): The default value is None. Normally there is no need for user
                    to set this property. For more information, please refer to
                    :ref:`api_guide_Name`.

Returns:
    out(Tensor): The attention tensor.
                3-D tensor with shape: [total_seq_len, num_heads, head_dim].
                The dtype can be float16 or bfloat16.
    softmax(Tensor): The softmax tensor. None if return_softmax is False.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> paddle.seed(2023)
        >>> q = paddle.rand((2, 128, 8, 16), dtype='float16')
        >>> cu = paddle.arange(0, 384, 128, dtype='int32')
        >>> qq = paddle.reshape(q, [256, 8, 16])
        >>> output = paddle.nn.functional.flash_attention.flash_attn_unpadded(qq, qq, qq, cu, cu, 128, 128, 0.25, 0.0, False, False)

Nr   rp   rq   )rp   rs   rt   r   r   ra   ru   r   r   r3   rP   r0   r1   ry   rb   rz   )r   )r   r   r   r	   r   r   r   r    r   r   r   )r,   r-   r.   r   r   r   r   r3   rP   r0   r1   ra   rb   r2   rc   r   r   r   r   rU   rL   rv   rw   r|   r}   s                            r   r   r     s[   \  &&L
	
$  >KKtKK;&(;F4E

3
3E
:C77>G;;FNNKK;;FLLIK$$.F ""	G "((,#| 	
	   >33t33r'   c                >    [        U UUUUUUUUU	U
UUUUUUUUU5      $ r)   )flash_attn_varlen_func)r,   r-   r.   r   r   r   r   	seqused_q	seqused_krd   r0   qv	q_descale	k_descale	v_descalewindow_sizesoftcap
num_splitspack_gqa	sm_margins                       r   flash_attention_v3_varlenr   m  sM    , "
) r'   c                   S[         R                  " 5       ;  d   S5       e[         R                  " S/5      S   (       a   S5       e[         R                  R                  R                  S/5      S   S:X  d   S5       e[        5       (       d   S5       eUb   S
5       eU	c'  U R                  S   Ub  UR                  S   OS-   S-  n	[        R                  " U UUUUUUUUUUUUU	U
US   US   UUUS	LUb  UOSU5      u  nnUU4$ )ac	  
The equation is:
.. math::
    result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V
where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
The dimensions of the three parameters are the same.
``d`` represents the size of the last dimension of the three parameters.
This is the varlen version of flash attention.
Warning:
    This API is only support inputs with dtype float16 and bfloat16.
Args:
    query(Tensor): The query tensor in the Attention module.
                    3-D tensor with shape:
                    [token_num, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    key(Tensor): The key tensor in the Attention module.
                    3-D tensor with shape:
                    [token_num, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    value(Tensor): The value tensor in the Attention module.
                    3-D tensor with shape:
                    [token_num, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
                    used to index query.
    cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
                    used to index key and value.
    causal(bool): Whether enable causal mode.
    softmax_scale(float): The softmax scale of the attention.
    max_seqlen_q(int): Maximum sequence length of query in the batch. Note it's the padding length, not the max actual seqlen.
    max_seqlen_k(int): Maximum sequence length of key/value in the batch.
Returns:
    out(Tensor): The attention tensor. 3-D tensor with shape: [token_num, num_heads, head_dim]. The dtype can be float16 or bfloat16.
    softmax(Tensor): The softmax tensor. None if return_softmax is False.
Examples:
    .. code-block:: python
        >>> # doctest: +SKIP('flash_attn_v3 need H100 compile')
        >>> import paddle
        >>> paddle.seed(2023)
        >>> q = paddle.rand((10, 2, 128), dtype="bfloat16")
        >>> cu_seqlens_q = paddle.to_tensor([0, 10], dtype="int32")
        >>> max_seq_len_q = 10
        >>> output = paddle.nn.functional.flash_attention.flash_attention_v3_varlen(q, q, q, cu_seqlens_q, cu_seqlens_q, max_seqlen_q=max_seq_len_q, max_seqlen_k=max_seq_len_q, causal=True)
        >>> # doctest: -SKIP
rB   z.flash_attn_varlen_func is not supported on xpurm   z5flash_attn_varlen_func does not support deterministicrn   r>   zDFLAGS_flash_attn_version is 2, conflicts with flash_attn_varlen_funcz7flash_attn_varlen_func only support dynamic or pir modeNz2flash_attn_varlen_func does not support setting qvr<   r   r?   r   F)	r    rM   r   r   r   r   rH   r   flash_attn_v3_varlen)r,   r-   r.   r   r   r   r   r   r   rd   r0   r   r   r   r   r   r   r   r   r   rU   rv   s                         r   r   r     sp   F ))++ 8+ !< =># ?>? 
 	'')C(DE&	
 	N
 NN	 "## A# :KKK:KKOr~rxx|1E 22
AA(e-C0 r'   c                    g r)   r*   r   r   r   r   r   r3   rP   r0   r1   ra   rb   varlen_paddedr2   rc   s                 r   flash_attn_varlen_qkvpackedr     s      r'   c                    g r)   r*   r   s                 r   r   r   &  s       r'   c                    g r)   r*   r   s                 r   r   r   9  s      $'r'   c                   [        5       (       a8  [        R                  " U UUU	SUUUUUUU(       + U
U5      u  nnX(       a  U4$ S4$ [        S	0 [	        5       D6nUR                  SS9nUR                  U5      nUR                  U5      nUR                  [        R                  5      nUR                  [        R                  5      nU UUU	S.nUUUUS.nUR                  SUUUUUUUUU(       + U
S.S9  UU(       a  U4$ S4$ )
a,
  
The equation is:

.. math::

    result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
The dimensions of the three parameters are the same.
``d`` represents the size of the last dimension of the three parameters.

Warning:
    This API only supports inputs with dtype float16 and bfloat16.

Args:
    qkv(Tensor): The padded query/key/value packed tensor in the Attention module. The padding part won't be computed
                    4-D tensor with shape:
                    [total_seq_len, num_heads/num_heads_k + 2, num_heads_k, head_dim].
                    The dtype can be float16 or bfloat16.
    cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
                    used to index query.
    cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
                    used to index key and value.
    max_seqlen_q(int): Maximum sequence length of query in the batch. Note it's the padding length, not the max actual seqlen
    max_seqlen_k(int): Maximum sequence length of key/value in the batch.
    scale(float): The scaling of QK^T before applying softmax.
    dropout(float, optional): The dropout ratio.
    causal(bool, optional): Whether enable causal mode.
    return_softmax(bool, optional): Whether to return softmax.
    fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
    rng_name(str, optional): The name to select Generator.
    training(bool, optional): Whether it is in the training phase.
    name(str|None, optional): The default value is None. Normally there is no need for user
                    to set this property. For more information, please refer to
                    :ref:`api_guide_Name`.

Returns:
    - out(Tensor). The attention tensor. The tensor is padded by zeros. 3-D tensor with shape: [total_seq_len, num_heads, head_dim]. The dtype can be float16 or bfloat16.
    - softmax(Tensor). The softmax tensor. None if return_softmax is False.

Examples:
    .. code-block:: python

        >>> # doctest: +SKIP('flash_attn need A100 compile')
        >>> import paddle
        >>> paddle.seed(2023)
        >>> q = paddle.rand((2, 128, 8, 16), dtype='float16')
        >>> cu = paddle.arange(0, 384, 128, dtype='int32')
        >>> qq = paddle.reshape(q, [256, 8, 16])
        >>> qkv = paddle.stack([qq, qq, qq], axis=2)
        >>> output = paddle.nn.functional.flash_attn_varlen_qkvpacked(qkv, cu, cu, 128, 128, 0.25, 0.0, False, False)
        >>> # doctest: -SKIP

Nr   r   rq   )r   r   r   ra   ru   r   rz   )r   )r   r   r   r	   r   r   r   r    r   r   r   )r   r   r   r   r   r3   rP   r0   r1   ra   rb   r   r2   rc   r   r   r   r   rU   rL   rv   rw   r|   r}   s                           r   r   r   L  sR   L  ..L
	
"  >KKtKKC&(CF6E

3
3E
:C77>G;;FNNKK;;FLLIK$$.	F ""	G *((,#| 	
	   >33t33r'   )rP   r0   r   return_softmax_lsereturn_seed_offsetra   rb   r2   rc   rd   
block_maskc                  UGb7  [        U[        5      (       a  Xf4nU R                  S   nU R                  S   nUb   S5       eU(       a\  [        R                  " US   S-   XS   -   S-   SS9R                  SSUS45      n[        R                  " X?S9R                  US5      nO[        R                  " SSUS4SS9n[        R                  " US   S-   XS   -   S-   SS9USSSS2S4'   [        R                  " US   * XS   -
  SS9USSSS2S4'   [        R                  " USUS	9R                  US5      nUb
  Uc   S
5       eUc+  [        R                  " U UUU	SUUSU(       + U
5
      u  nnnnGOUR                  [        R                  :X  d   SUR                   35       e[        UR                  5      S:X  d   SUR                   35       eUR                  S   UR                  S   :X  d'   SUR                  S    SUR                  S    35       eUR                  S   UR                  S   :X  d'   SUR                  S    SUR                  S    35       eUR                  S   SUR                  S   4;   d   S5       eUGb4  UR                  [        R                  :X  d   SUR                   35       eUR                  S   UR                  S   :X  d'   SUR                  S    SUR                  S    35       eUR                  S   UR                  S   :X  d'   SUR                  S    SUR                  S    35       eUR                  S   U R                  S   S-   S-  :X  d   S5       eUR                  S   UR                  S   S-   S-  :X  d   S5       eUR                  S   S:X  d   S5       eU(       aG  UR                  S   S:X  a  SnOxUR                  S   S:X  a  SnOb[        SUR                  S    35      eUR                  S   S:X  a  SnO1UR                  S   S:X  a  SnO[        SUR                  S    35      eS[        R                  " 5       ;  a)  [        R                   " S /5      S    (       a
  Ub   S!5       eS[        R                  " 5       ;   a  SnO[        R"                  R$                  R!                  S"/5      S"   S:X  aH  [        R"                  R$                  R!                  S /5      S    (       a  U R                  S   S:  a  SnO-[        R"                  R$                  R!                  S"/5      S"   nUS:X  a>  Ub   S#5       eUb   S$5       e[        R&                  " U UUUU	UUSU(       + U
5
      u  nnnnOUS:X  a  US%:X  d   S&5       eU(       a   S'5       eU	b   S(5       eU
S):X  d   S*5       eU(       d   S+5       eUb   S,5       eUc  U R                  S   S--  n[        R(                  " U UUUUUU5      u  nnO[        S.U 35      eU/nU(       a  UU/-  nU(       a  UW/-  n[        U5      S:X  a  US   $ U$ )/ax  
FlashMask: Official Implementation

This module provides the official implementation of the FlashMask algorithm as described in the paper. For more details, please refer to the paper available at: https://arxiv.org/abs/2410.01359.

The core equation utilized in FlashMask is as follows:

.. math::

    \text{result} = \text{softmax}\left(\frac{Q \cdot K^T}{\sqrt{d}} + M\right) \cdot V

In this equation:

    - ``Q``, ``K``, and ``V`` are the input tensors to the attention module.
    - All these tensors share the same dimensions.
    - ``d`` denotes the size of the last dimension of these tensors.
    - ``M`` represents the column-wise sparse mask introduced by FlashMask.

Args:
    query (Tensor):  The query tensor in the attention module.
        A 4-D tensor with shape [batch_size, q_seq_len, num_heads, head_dim].
        The dtype can be float16 or bfloat16.
    key (Tensor): The key tensor in the attention module.
        A 4-D tensor with shape [batch_size, k_seq_len, k_num_heads, head_dim].
        The dtype can be float16 or bfloat16.
    value (Tensor): The value tensor in the attention module.
        A 4-D tensor with shape [batch_size, k_seq_len, k_num_heads, head_dim].
        The dtype can be float16 or bfloat16.
    startend_row_indices(Tensor):
        A column-wise sparse attention mask row indices tensor.
        A 4-D tensor with shape [batch_size, k_num_heads, k_seq_len, {1, 2, 4}].
        The dtype must be int32. k_num_heads can be 1 or the same as key's num_heads. When num_heads is 1, it will be broadcast to match key's num_heads.
        Depending on the value of the causal parameter, startend_row_indices can take different shapes and meanings.

        - When `causal=True` and the shape is [batch_size, k_num_heads, k_seq_len, 1],
          indicating unidirectional attention. The value represents the starting row index of the left
          lower triangular mask in the dense mask. The value startend_row_indices[..., 0] indicates that elements in the lower left triangle of the attention score matrix starting from the startend_row_indices[..., 0]-th row downwards (inclusive) will be masked.
        - When `causal=True` and the shape is [batch_size, k_num_heads, k_seq_len, 2],
          indicating unidirectional attention. The values represent the starting and ending row indices of
          the left lower triangular mask in the dense mask. The values startend_row_indices[..., 0:2] in startend_row_indices indicate that elements in the lower left triangle of the attention score matrix starting from the startend_row_indices[..., 0]-th row downwards (inclusive) but above the startend_row_indices[..., 1]-th row (exclusive) will be masked.
        - When `causal=False` and the shape is [batch_size, k_num_heads, k_seq_len, 2],
          indicating bidirectional attention. The values represent the starting row index of the left
          lower triangular mask and the ending row index of the right upper triangular mask in the dense mask. The values startend_row_indices[..., 0:2] in startend_row_indices indicate that elements in the lower left triangle of the attention score matrix starting from the startend_row_indices[..., 0]-th row downwards (inclusive) will be masked, and elements in the upper right triangle starting from the startend_row_indices[..., 1]-th row upwards (exclusive) will be masked.
        - When `causal=False` and the shape is [batch_size, k_num_heads, k_seq_len, 4] ,
          indicating bidirectional attention. The values represent the start and end row indices of the
          left lower triangular mask and the start and end row indices of the right upper triangular mask in the dense mask. The values startend_row_indices[..., 0:4] in startend_row_indices indicate that elements in the lower left triangle of the attention score matrix starting from the startend_row_indices[..., 0]-th row downwards (inclusive) but above the startend_row_indices[..., 1] row (exclusive) will be masked, and elements in the upper right triangle starting from the startend_row_indices[..., 2]-th row downwards (inclusive) but above the startend_row_indices[..., 3] row (exclusive) will be masked.

    dropout (float): The dropout ratio. Default is 0.0.
    causal (bool): Whether to enable causal mode. Default is False.
    window_size (int|tuple, optional): Indicates the window size of sliding window local attention.
        If causal mode is enabled, Query at position i will only attend to keys between [i - window_size, i] or [i - window_size[0], i].
        If causal mode is disabled, Query at position i will only attend to keys between [i - window_size, i + window_size] or [i - window_size[0], i + window_size[1]].
    return_softmax_lse (bool): Whether to return the log-sum-exp of the softmax. Default is False.
    return_seed_offset (bool): Whether to return the random seed offset. Default is False.
    fixed_seed_offset(Tensor, optional): With fixed seed, offset for dropout mask.
    rng_name (str): The name to select Generator.
    training (bool): Whether the module is in training mode. Default is True.
    name (str, optional): Name of the operation. Default is None. Normally, users do not need to set this property.
        For more information, refer to :ref:`api_guide_Name` .
    block_mask (tensor, optional):
        A 4-D integer mask tensor indicating whether each block in the attention matrix should be kept or masked. Must be used together with flashmask.
        The shape should be [batch_size, num_heads, blocklen_q, blocklen_k], where:

        blocklen_q = ceil(seqlen_q / 128), i.e., block_mask.shape[2] must be (seqlen_q + 127) // 128
        blocklen_k = ceil(seqlen_k / 128), i.e., block_mask.shape[3] must be (seqlen_k + 127) // 128
        block_mask.shape[1] (number of heads) must match the num_heads dimension of the flashmask
        Both seqlen_q and seqlen_k must be less than or equal to 128 * 1024
        The dtype should be int32, and each element should be either 0 or 1.
        A value of 1 indicates that the corresponding block is kept (not masked), while 0 means the block is masked.

        Usage Notes:

        Only supported when blockdim_q = blockdim_k = 128 now.
        Only supported when headdim = 128 now.
        This argument must be provided together with flashmask.
        The mask will be applied at the block level: each [i, j] position in block_mask controls whether the corresponding [128 x 128] block in the attention matrix is masked.
        Any mismatch in expected shape or head dimension will raise an error.


Returns
    Tensor. The computed attention result with the same shape as the input `query`.

Warning:
    This API only supports inputs with dtype float16 and bfloat16.

Hint:
    This API supports GQA.

Examples:
    .. code-block:: python

        >>> # doctest: +SKIP('flash_attn need A100 compile')
        >>> import paddle
        >>> paddle.seed(2023)
        >>> q = paddle.rand((1, 10, 2, 32),dtype="bfloat16") # shape: [batch_size, seq_len, num_heads, head_dim]
        >>> k = paddle.rand((1, 10, 2, 32),dtype="bfloat16") # shape: [batch_size, seq_len, num_heads, head_dim]
        >>> v = paddle.rand((1, 10, 2, 32),dtype="bfloat16") # shape: [batch_size, seq_len, num_heads, head_dim]
        >>> startend_row_indices = paddle.to_tensor([8]*10 + [5]*10, dtype="int32").reshape([1, 2, 10, 1])
        >>> output = paddle.nn.functional.flashmask_attention(q, k, v, startend_row_indices, causal=True)
        >>> print(output)
        Tensor(shape=[1, 10, 2, 32], dtype=bfloat16, place=Place(gpu:0), stop_gradient=True,
            [[[[0.82421875, 0.27539062, 0.80859375, 0.98046875, 0.00251770,
                0.41992188, 0.17285156, 0.11767578, 0.42773438, 0.31250000,
                0.34570312, 0.70312500, 0.29296875, 0.44531250, 0.51562500,
                0.96093750, 0.85546875, 0.15625000, 0.34765625, 0.98437500,
                0.96484375, 0.45312500, 0.33593750, 0.56640625, 0.07714844,
                0.43750000, 0.83984375, 0.66796875, 0.93750000, 0.24804688,
                0.51171875, 0.55468750],
                [0.54687500, 0.74609375, 0.43164062, 0.32421875, 0.10693359,
                0.37304688, 0.53906250, 0.17187500, 0.57421875, 0.75000000,
                0.13378906, 0.57031250, 0.19531250, 0.01403809, 0.29101562,
                0.14257812, 0.07568359, 0.88671875, 0.75390625, 0.17089844,
                0.87109375, 0.93359375, 0.89843750, 0.58203125, 0.75390625,
                0.27539062, 0.67968750, 0.24804688, 0.57812500, 0.67578125,
                0.92578125, 0.98046875]],

                [[0.59765625, 0.62890625, 0.62109375, 0.75781250, 0.03295898,
                0.64062500, 0.27929688, 0.20800781, 0.72265625, 0.52343750,
                0.53125000, 0.61718750, 0.57421875, 0.56640625, 0.65625000,
                0.48242188, 0.68359375, 0.42968750, 0.26562500, 0.86718750,
                0.83203125, 0.40820312, 0.38281250, 0.59765625, 0.43945312,
                0.22851562, 0.86328125, 0.51562500, 0.89453125, 0.62500000,
                0.50390625, 0.67968750],
                [0.34765625, 0.61328125, 0.58593750, 0.60156250, 0.43164062,
                0.41601562, 0.71093750, 0.59765625, 0.53515625, 0.78125000,
                0.13867188, 0.30664062, 0.48828125, 0.04394531, 0.24316406,
                0.18847656, 0.10644531, 0.71093750, 0.69140625, 0.35937500,
                0.44531250, 0.81640625, 0.44140625, 0.64062500, 0.81640625,
                0.61328125, 0.72265625, 0.53125000, 0.49414062, 0.59765625,
                0.54296875, 0.61328125]],

                [[0.65234375, 0.47656250, 0.71875000, 0.64843750, 0.23828125,
                0.61328125, 0.29101562, 0.26562500, 0.54296875, 0.60937500,
                0.67187500, 0.67578125, 0.64062500, 0.41406250, 0.47656250,
                0.40820312, 0.66406250, 0.39453125, 0.39453125, 0.62109375,
                0.58593750, 0.31054688, 0.31835938, 0.45703125, 0.52343750,
                0.43164062, 0.64453125, 0.49804688, 0.82812500, 0.48242188,
                0.38476562, 0.59375000],
                [0.44921875, 0.62109375, 0.50390625, 0.51562500, 0.51953125,
                0.57812500, 0.78515625, 0.73437500, 0.60546875, 0.55078125,
                0.30273438, 0.23339844, 0.60546875, 0.33007812, 0.23242188,
                0.30468750, 0.34570312, 0.70703125, 0.72656250, 0.58593750,
                0.40234375, 0.62109375, 0.62109375, 0.69531250, 0.66796875,
                0.51562500, 0.45898438, 0.67968750, 0.48828125, 0.50000000,
                0.54687500, 0.71875000]],

                [[0.67578125, 0.50000000, 0.58203125, 0.62109375, 0.43554688,
                0.69531250, 0.30273438, 0.24023438, 0.57812500, 0.63671875,
                0.51171875, 0.52734375, 0.60546875, 0.45507812, 0.42382812,
                0.46093750, 0.55859375, 0.34960938, 0.39453125, 0.57031250,
                0.55078125, 0.47265625, 0.24609375, 0.51953125, 0.46093750,
                0.49218750, 0.49609375, 0.60156250, 0.76953125, 0.57421875,
                0.40429688, 0.57031250],
                [0.45703125, 0.71093750, 0.58984375, 0.43164062, 0.54296875,
                0.57031250, 0.72265625, 0.61328125, 0.64453125, 0.50781250,
                0.28125000, 0.19531250, 0.60546875, 0.40625000, 0.18554688,
                0.33203125, 0.40039062, 0.58593750, 0.79687500, 0.45507812,
                0.32812500, 0.58203125, 0.70703125, 0.64453125, 0.53906250,
                0.57421875, 0.48828125, 0.53515625, 0.49804688, 0.50000000,
                0.48437500, 0.55468750]],

                [[0.64453125, 0.43164062, 0.54687500, 0.53125000, 0.42187500,
                0.71484375, 0.30273438, 0.21484375, 0.50390625, 0.69531250,
                0.58203125, 0.51562500, 0.61328125, 0.41992188, 0.40039062,
                0.46679688, 0.58984375, 0.39062500, 0.41992188, 0.49023438,
                0.47851562, 0.47070312, 0.30078125, 0.50390625, 0.47656250,
                0.44921875, 0.43164062, 0.63671875, 0.78125000, 0.60156250,
                0.48242188, 0.58203125],
                [0.52343750, 0.69921875, 0.58984375, 0.35156250, 0.49218750,
                0.58593750, 0.71093750, 0.59375000, 0.66406250, 0.49414062,
                0.24023438, 0.18554688, 0.66796875, 0.50000000, 0.23144531,
                0.29882812, 0.49414062, 0.57031250, 0.70312500, 0.42773438,
                0.35351562, 0.47460938, 0.73437500, 0.53125000, 0.47070312,
                0.49609375, 0.50000000, 0.55078125, 0.50000000, 0.45898438,
                0.45703125, 0.61328125]],

                [[0.63671875, 0.41210938, 0.52734375, 0.56640625, 0.44531250,
                0.64843750, 0.37890625, 0.31250000, 0.56640625, 0.62890625,
                0.53125000, 0.51562500, 0.54296875, 0.50781250, 0.35546875,
                0.41601562, 0.55468750, 0.36914062, 0.35937500, 0.45117188,
                0.46875000, 0.49609375, 0.28710938, 0.50000000, 0.49609375,
                0.50000000, 0.51562500, 0.57031250, 0.77734375, 0.62109375,
                0.43164062, 0.50781250],
                [0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        ]],

                [[0.62109375, 0.44531250, 0.46875000, 0.61328125, 0.39062500,
                0.60156250, 0.41015625, 0.28710938, 0.58984375, 0.67968750,
                0.55859375, 0.48632812, 0.51562500, 0.42382812, 0.37695312,
                0.46679688, 0.54687500, 0.44921875, 0.33789062, 0.36328125,
                0.49023438, 0.44140625, 0.25000000, 0.45312500, 0.43945312,
                0.45507812, 0.46679688, 0.57812500, 0.65625000, 0.64062500,
                0.42382812, 0.57031250],
                [0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        ]],

                [[0.62500000, 0.47070312, 0.51562500, 0.61328125, 0.36718750,
                0.66406250, 0.37890625, 0.28320312, 0.65625000, 0.66015625,
                0.48632812, 0.53906250, 0.46679688, 0.47851562, 0.43359375,
                0.45703125, 0.47070312, 0.39843750, 0.32617188, 0.37304688,
                0.49023438, 0.50390625, 0.27148438, 0.46679688, 0.37695312,
                0.49023438, 0.47265625, 0.58593750, 0.64453125, 0.60156250,
                0.38476562, 0.62109375],
                [0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        ]],

                [[0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        ],
                [0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        ]],

                [[0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        ],
                [0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        , 0.        , 0.        , 0.        ,
                0.        , 0.        ]]]])
        >>> # doctest: -SKIP


To convert FlashMask's `startend_row_indices` to `dense_mask`, use the code below:

.. code-block:: python

    >>> import paddle
    >>> import numpy as np
    >>> def flashmask_to_densemask(startend_row_indices, dtype, causal=True):
    ...     if startend_row_indices is None:
    ...         return None
    ...     bz, num_head, seq_len, bound_num = startend_row_indices.shape
    ...     m = paddle.zeros((bz, num_head, seq_len, seq_len), dtype=dtype)
    ...     has_end = (causal and bound_num == 2) or ((not causal) and bound_num == 4)
    ...     for bi in range(bz):
    ...         for hi in range(num_head):
    ...             for j in range(seq_len):
    ...                 downstart = startend_row_indices[bi, hi, j, 0]
    ...                 if has_end:
    ...                     downend = startend_row_indices[bi, hi, j, 1]
    ...                     m[bi, hi, downstart:downend, j] = -np.inf
    ...                 else:
    ...                     m[bi, hi, downstart:, j] = -np.inf
    ...                 if causal:
    ...                     m[bi, hi, :j, j] = -np.inf
    ...                 else:
    ...                     if has_end:
    ...                         upstart = startend_row_indices[bi, hi, j, 2]
    ...                         upend = startend_row_indices[bi, hi, j, 3]
    ...                         m[bi, hi, upstart:upend, j] = -np.inf
    ...                     else:
    ...                         upend = startend_row_indices[bi, hi, j, 1]
    ...                         m[bi, hi, :upend, j] = -np.inf
    ...     return m

For `Causal Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

.. code-block:: python

   [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]])

    >>> # doctest: +SKIP('Only example')
    >>> import paddle
    >>> startend_row_indices = paddle.to_tensor([8]*10, dtype="int32").reshape([1, 1, 10, 1])
    >>> print(startend_row_indices)
    Tensor(shape=[1, 1, 10, 1], dtype=int32, place=Place(gpu:0), stop_gradient=True,
        [[[[8],
            [8],
            [8],
            [8],
            [8],
            [8],
            [8],
            [8],
            [8],
            [8]]]])
    >>> # doctest: -SKIP


For `Sliding Window Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

.. code-block:: python

   [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
      [0, 1, 1, 1, 0, 0, 0, 0, 0, 0],
      [0, 0, 1, 1, 1, 0, 0, 0, 0, 0],
      [0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
      [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
      [0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
      [0, 0, 0, 0, 0, 0, 1, 1, 1, 0],
      [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]]])

    >>> # doctest: +SKIP('Only example')
    >>> import paddle
    >>> startend_row_indices = paddle.to_tensor([3, 4, 5, 6, 7, 8, 9, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
    >>> print(startend_row_indices)
    Tensor(shape=[1, 1, 10, 1], dtype=int32, place=Place(gpu:0), stop_gradient=True,
        [[[[3 ],
            [4 ],
            [5 ],
            [6 ],
            [7 ],
            [8 ],
            [9 ],
            [10],
            [10],
            [10]]]])
    >>> # doctest: -SKIP

For `Causal Document Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

.. code-block:: python

   [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
      [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
      [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]]])

    >>> # doctest: +SKIP('Only example')
    >>> import paddle
    >>> startend_row_indices = paddle.to_tensor([4, 4, 4, 4, 7, 7, 7, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
    >>> print(startend_row_indices)
    Tensor(shape=[1, 1, 10, 1], dtype=int32, place=Place(gpu:0), stop_gradient=True,
        [[[[4 ],
            [4 ],
            [4 ],
            [4 ],
            [7 ],
            [7 ],
            [7 ],
            [10],
            [10],
            [10]]]])
    >>> # doctest: -SKIP

For `Document Mask`, where `causal=False`, the values of `startend_row_indices` are as follows:

.. code-block:: python

   [[[[1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
      [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
      [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
      [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
      [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]]])

    >>> # doctest: +SKIP('Only example')
    >>> import paddle
    >>> LTS = paddle.to_tensor([4, 4, 4, 4, 7, 7, 7, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
    >>> UTE = paddle.to_tensor([0, 0, 0, 0, 4, 4, 4, 7, 7, 7], dtype="int32").reshape([1, 1, 10, 1])
    >>> startend_row_indices = paddle.concat([LTS, UTE], axis=-1)
    >>> print(startend_row_indices)
    Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
        [[[[4 , 0 ],
            [4 , 0 ],
            [4 , 0 ],
            [4 , 0 ],
            [7 , 4 ],
            [7 , 4 ],
            [7 , 4 ],
            [10, 7 ],
            [10, 7 ],
            [10, 7 ]]]])
    >>> # doctest: -SKIP

For `Share Question Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

.. code-block:: python

   [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
      [1, 1, 1, 1, 0, 0, 0, 1, 0, 0],
      [1, 1, 1, 1, 0, 0, 0, 1, 1, 0],
      [1, 1, 1, 1, 0, 0, 0, 1, 1, 1]]]])

    >>> # doctest: +SKIP('Only example')
    >>> import paddle
    >>> startend_row_indices = paddle.to_tensor([10, 10, 10, 10, 7, 7, 7, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
    >>> print(startend_row_indices)
    Tensor(shape=[1, 1, 10, 1], dtype=int32, place=Place(gpu:0), stop_gradient=True,
        [[[[10],
            [10],
            [10],
            [10],
            [7 ],
            [7 ],
            [7 ],
            [10],
            [10],
            [10]]]])
    >>> # doctest: -SKIP

For `Global + Sliding Window Mask`, where `causal=False`, the values of `startend_row_indices` are as follows:

.. code-block:: python

    >>> # doctest: +SKIP('Only example')

   [[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
      [1, 1, 0, 1, 1, 1, 0, 0, 0, 0],
      [1, 1, 0, 0, 1, 1, 1, 0, 0, 0],
      [1, 1, 0, 0, 0, 1, 1, 1, 0, 0],
      [1, 1, 0, 0, 0, 0, 1, 1, 1, 0],
      [1, 1, 0, 0, 0, 0, 0, 1, 1, 1],
      [1, 1, 0, 0, 0, 0, 0, 0, 1, 1]]]])

    >>> import paddle
    >>> LTS = paddle.to_tensor([10, 10, 4, 5, 6, 7, 8, 9, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
    >>> LTE = paddle.to_tensor([10, 10, 10, 10, 10, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
    >>> UTS = paddle.to_tensor([0, 0, 0, 0, 2, 2, 2, 2, 2, 2], dtype="int32").reshape([1, 1, 10, 1])
    >>> UTE = paddle.to_tensor([0, 0, 0, 0, 3, 4, 5, 6, 7, 8], dtype="int32").reshape([1, 1, 10, 1])
    >>> startend_row_indices = paddle.concat([LTS, LTE, UTS, UTE], axis=-1)
    >>> print(startend_row_indices)
    Tensor(shape=[1, 1, 10, 4], dtype=int32, place=Place(gpu:0), stop_gradient=True,
        [[[[10, 10, 0 , 0 ],
            [10, 10, 0 , 0 ],
            [4 , 10, 0 , 0 ],
            [5 , 10, 0 , 0 ],
            [6 , 10, 2 , 3 ],
            [7 , 10, 2 , 4 ],
            [8 , 10, 2 , 5 ],
            [9 , 10, 2 , 6 ],
            [10, 10, 2 , 7 ],
            [10, 10, 2 , 8 ]]]])
    >>> # doctest: -SKIP

For `Causal Blockwise Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

.. code-block:: python

   [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
      [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
      [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
      [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])

    >>> # doctest: +SKIP('Only example')
    >>> import paddle
    >>> LTS = paddle.to_tensor([4, 4, 4, 4, 10, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
    >>> LTE = paddle.to_tensor([7, 7, 7, 7, 10, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
    >>> startend_row_indices = paddle.concat([LTS, LTE], axis=-1)
    >>> print(startend_row_indices)
    Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
        [[[[4 , 7 ],
            [4 , 7 ],
            [4 , 7 ],
            [4 , 7 ],
            [10, 10],
            [10, 10],
            [10, 10],
            [10, 10],
            [10, 10],
            [10, 10]]]])
    >>> # doctest: -SKIP

For `Prefix LM Document Mask`, where `causal=False`, the values of `startend_row_indices` are as follows:

.. code-block:: python

   [[[[1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
      [0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
      [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
      [0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
      [0, 0, 0, 0, 0, 1, 1, 1, 1, 0],
      [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]]])

    >>> # doctest: +SKIP('Only example')
    >>> import paddle
    >>> LTS = paddle.to_tensor([3, 3, 3, 5, 5, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
    >>> UTE = paddle.to_tensor([0, 0, 2, 3, 3, 5, 5, 7, 8, 9], dtype="int32").reshape([1, 1, 10, 1])
    >>> startend_row_indices = paddle.concat([LTS, UTE], axis=-1)
    >>> print(startend_row_indices)
    Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
        [[[[3 , 0 ],
            [3 , 0 ],
            [3 , 2 ],
            [5 , 3 ],
            [5 , 3 ],
            [10, 5 ],
            [10, 5 ],
            [10, 7 ],
            [10, 8 ],
            [10, 9 ]]]])
    >>> # doctest: -SKIP

For `Prefix LM Causal Mask`, where `causal=False`, the values of `startend_row_indices` are as follows:

.. code-block:: python

   [[[[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
      [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
      [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])

    >>> # doctest: +SKIP('Only example')
    >>> import paddle
    >>> LTS = paddle.to_tensor([10, 10, 10, 10, 10, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
    >>> UTE = paddle.to_tensor([0, 0, 0, 0, 0, 5, 6, 7, 8, 9], dtype="int32").reshape([1, 1, 10, 1])
    >>> startend_row_indices = paddle.concat([LTS, UTE], axis=-1)
    >>> print(startend_row_indices)
    Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
        [[[[10, 0 ],
            [10, 0 ],
            [10, 0 ],
            [10, 0 ],
            [10, 0 ],
            [10, 5 ],
            [10, 6 ],
            [10, 7 ],
            [10, 8 ],
            [10, 9 ]]]])

For `QK-sparse Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

.. code-block:: python

   [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
      [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])

    >>> # doctest: +SKIP('Only example')
    >>> import paddle
    >>> LTS = paddle.to_tensor([10, 10, 2, 3, 4, 5, 6, 7, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
    >>> LTE = paddle.to_tensor([10, 10, 5, 5, 5, 5, 8, 8, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
    >>> startend_row_indices = paddle.concat([LTS, LTE], axis=-1)
    >>> print(startend_row_indices)
    Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
        [[[[10, 10],
            [10, 10],
            [2 , 5 ],
            [3 , 5 ],
            [4 , 5 ],
            [5 , 5 ],
            [6 , 8 ],
            [7 , 8 ],
            [10, 10],
            [10, 10]]]])

    >>> # doctest: -SKIP
Nr   r   z/can't use window_size with startend_row_indicesint32)r   )maxr=   )minr   z<must provide startend_row_indices when using block_mask_attnFz9startend_row_indices.dtype must be paddle.int32, but got    z,startend_row_indices rank must be 4,but got zCstartend_row_indices.shape[0] must be equal to batch_size, but got z and zAstartend_row_indices.shape[2] must be equal to seqlen_k, but got zJstartend_row_indices head_num must be equal to 1(broadcast) or head_num_k.z/block_mask.dtype must be paddle.int32, but got z9block_mask.shape[0] must be equal to batch_size, but got zLblock_mask.shape[1] must be equal to startend_row_indices.shape[1], but got       z1block_size must be 128 when using block_mask_attnr>   z.headdim must be 128 when using block_mask_attnr<   TzoInvalid shape of startend_row_indices, when causal is True, the last dimension should be either 1 or 2 but got zpInvalid shape of startend_row_indices, when causal is False, the last dimension should be either 2 or 4 but got rB   rm   z4 blockmask attention no supports deterministic now .rn   z^flashmask_attention does not support setting softmax_scale, use flashmask_attention_v2 insteadz0 blockmask attention only supports sm >= 90 now.r:   z/flashmask_attention_v2 does not support dropoutz:flashmask_attention_v2 does not support return seed_offsetz;flashmask_attention_v2 does not support setting seed_offsetrj   z8flashmask_attention_v2 does not support setting rng_namezAflashmask_attention_v2 does not support setting training to Falsez4flashmask_attention_v2 does not support setting namer?   ro   )
isinstanceintrH   r    aranger   cliprepeat_interleaveemptyr   rX   r   r   lenr   rM   r   r   r   flashmask_attentionflashmask_attention_v2)r,   r-   r.   startend_row_indicesrP   r0   r   r   r   ra   rb   r2   rc   rd   r   sqbszrU   r   result_softmax_lseresult_seed_offsethas_endr   r}   s                           r   r   r     sM   @ k3''&4K[[^kk!n#+ 	
=	
+ #)==A"BQ$7!$;7$gq!Rm$ ! $*;;$$Q' !
 $*<<Ar1W#M /5}}A"BQ$7!$;70 Aq!, 06}}Q!n!4G0 Aq!, $*;;$!$Q' ! #/ 	
J	
/ # L
	
 $))V\\9 	
GH\HbHbGcd	
9 '--.!3 	
:;O;U;U:VW	
3 $))!,		!< 	
QRfRlRlmnRoQppuvyvv  AB  wC  vD  E	
< $))!,		!< 	
OPdPjPjklPmOnnstwt}t}~  uA  tB  C	
< $))!,IIaL1
 
 	
 Y		
 
 !##v||3 A*BRBRAST3 ##A&#))A,6 KJL\L\]^L_K``efifofopqfrest6 ##A&*>*D*DQ*GG ^_i_o_opq_r^ssxy|  zC  zC  DE  zF  yG  HG ##A&5;;q>C+?C*GG CG ##A&399Q<#+=#*EE CE 99Q<3& @& #))"-2%++B/14  F  G[  Ga  Ga  bd  Ge  Ff  g  $))"-2%++B/14  G  H\  Hb  Hb  ce  Hf  Gg  h 
 **,,  "=!>?+ % F% F%%''JKK!!++-G,HI*  %%//1L0MN+ A$J..88+,(*J ? ( p( % B% **$!"" 1_c> A> * L) %, M, r> J>  S8 < F< $ %BD 9
 --$" @MNNeG&''&''
7|qqzr'   c                   U R                   (       a  UR                   (       d   S5       e[        5       (       a  [        R                  " XU5      nU$ [	        S0 [        5       D6nUR                  [        R                  5      nUR                  [        R                  5      nU UUS.nSU0nUR                  SUUS9  U$ )av	  
The equation is:

.. math::

    result=reduce\_sum(softmax(\frac{ Q * K^T }{\sqrt{d}}), dim=-2)

Warning:
    This API only supports inputs with dtype float16 and bfloat16.

Args:
    query(Tensor): The query tensor in the Attention module.
                    4-D tensor with shape:
                    [batch_size, seqlen_q, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    key(Tensor): The key tensor in the Attention module.
                    4-D tensor with shape:
                    [batch_size, seqlen_k, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    softmax_lse(Tensor): The logsumexp of each row returned by _C_ops.flash_attn().
                    3-D tensor with shape:
                    [batch_size, num_heads, seqlen_q_rounded], where seqlen_q_rounded = ceil(seqlen_q/128).
                    The dtype is float32.
Returns:
    reduced_attention_scores(Tensor), The reduce sum of attention scores across seqlen_q.
    4-D tensor with shape: [batch_size, num_heads, 1, seqlen_k]. The dtype is float32.
Examples:
    .. code-block:: python

        >>> # doctest: +SKIP('reduce_attn_scores need A100 compile')
        >>> import paddle
        >>> import numpy as np
        >>> import paddle._C_ops as _C_ops
        >>> from paddle.nn.functional.flash_attention import (
        >>>     calc_reduced_attention_scores
        >>> )
        >>> np.random.seed(2024)
        >>> q_shape = (5,1024,16,128)
        >>> k_shape = (5,2048,16,128)
        >>> dtype = 'float16'
        >>> query = np.random.random(q_shape)
        >>> key = np.random.random(k_shape)
        >>> q = paddle.to_tensor(
        >>>     query, place=place, dtype=dtype, stop_gradient=True
        >>> )
        >>> k = paddle.to_tensor(
        >>>     key, place=place, dtype=dtype, stop_gradient=True
        >>> )
        >>> _, _, softmax_lse, _ = _C_ops.flash_attn(
        >>>     q,
        >>>     k,
        >>>     k,
        >>>     (None,), #fixed_seed_offset
        >>>     None, #attn_mask
        >>>     0.0, #dropout
        >>>     False, #causal
        >>>     False, #return_softmax
        >>>     False, #is_test
        >>>     "" #rng_name
        >>> )
        >>> reduced_attn_scores = calc_reduced_attention_scores(
        >>>     q,
        >>>     k,
        >>>     softmax_lse,
        >>> )
        >>> # doctest: -SKIP
z6calc_reduced_attention_scores() is for inference only.calc_reduced_attn_scores)rp   rs   rv   reduced_scores)r{   r|   r}   )r   )
r"   r   r   r   r	   r   r   r    r   r   )r,   r-   rv   r   r   rL   r|   r}   s           r   calc_reduced_attention_scoresr   ,  s    L 3#4#4 @4 88
 @vx@F>>v~~NN77GG"F 	.G '  
 r'   )FTT)r   boolr   r   r   r   returnzGenerator[None, None, None])r$   r   r   r   ).....)r,   r   r-   r   r.   r   r%   r   r/   floatr0   r   r1   Literal[False]r2   r   r3   float | Noner   tuple[Tensor, None])r,   r   r-   r   r.   r   r%   r   r/   r   r0   r   r1   Literal[True]r2   r   r3   r   r   tuple[Tensor, Tensor])r,   r   r-   r   r.   r   r%   r   r/   r   r0   r   r1   r   r2   r   r3   r   r   tuple[Tensor, Tensor | None])Nr:   FFTN)rQ   r   r   str)...)r,   r   r-   r   r.   r   rP   r   r0   r   r1   r   ra   Tensor | Nonerb   r   r2   r   rc   
str | Nonerd   r   r   r   )r,   r   r-   r   r.   r   rP   r   r0   r   r1   r   ra   r   rb   r   r2   r   rc   r   rd   r   r   r   )r,   r   r-   r   r.   r   rP   r   r0   r   r1   r   ra   r   rb   r   r2   r   rc   r   rd   r   r   r   )r:   FF)r   r   rP   r   r0   r   r1   r   ra   r   rb   r   r2   r   rc   r   r   r   )r   r   rP   r   r0   r   r1   r   ra   r   rb   r   r2   r   rc   r   r   r   )r   r   rP   r   r0   r   r1   r   ra   r   rb   r   r2   r   rc   r   r   r   ).......) r,   r   r-   r   r.   r   r   r   r   r   r   r   r   r   r3   r   rP   r   r0   r   r1   r   ra   r   rb   r   r2   r   rc   r   r   r   ) r,   r   r-   r   r.   r   r   r   r   r   r   r   r   r   r3   r   rP   r   r0   r   r1   r   ra   r   rb   r   r2   r   rc   r   r   r   ) r,   r   r-   r   r.   r   r   r   r   r   r   r   r   r   r3   r   rP   r   r0   r   r1   r   ra   r   rb   r   r2   r   rc   r   r   r   )r:   FFNrj   TN)NNNFNNNN)r<   r<   r:   r   Nr   )........)r   r   r   r   r   r   r   r   r   r   r3   r   rP   r   r0   r   r1   r   ra   r   rb   r   r   r   r2   r   rc   r   r   r   )r   r   r   r   r   r   r   r   r   r   r3   r   rP   r   r0   r   r1   r   ra   r   rb   r   r   r   r2   r   rc   r   r   r   )r   r   r   r   r   r   r   r   r   r   r3   r   rP   r   r0   r   r1   r   ra   r   rb   r   r   r   r2   r   rc   r   r   r   )r:   FFNrj   TTNr)   )r,   r   r-   r   r.   r   r   r   rP   r   r0   r   r   zint | tuple | Noner   r   r   r   ra   r   rb   r   r2   r   rc   r   rd   r   r   r   )r,   paddle.Tensorr-   r   rv   r   r   r   )&
__future__r   typingr   r   r   r    paddle.nn.functionalnn
functionalrK   r   paddle.base.frameworkr   paddle.base.layer_helperr	   paddle.base.wrapped_decoratorr
   paddle.nn.attention.sdpar   r   r   collections.abcr   r   r   r&   r4   rZ   r`   rg   r   r   r   r   r   r   r   r*   r'   r   <module>r      sI   # 3 3       8 0 G  ) !%  !	 8 
 %(

	
 
 	

 
 
 #
 
 
 
 

 
 $'
 
 	
  
  	
 
 
  
  "
  
  
  
  

  
 
'
'	
' 
' 	
'
 
' 
' 
' 
' 
' "
' 

'" 

34lB 

 %( (+"%	  	
  # %       
  

 $'  (+"%  	    	 
   "  %             
   

 ' (+"%''	' ' 	'
 ' ' %' ' ' '  ' "' 
'( \ 	\~ 
 %(	
 (+
	

 
 #	
 %
 
 
 
 
 

 
 $'	
  (+
 	
 
  
  "	
  %
  
  
  
  
  

  
 	
' (+
'	
'
' 
' 	
' %
' 
' 
' 
' "
' 

' 	U 	Up 
 %('*	  	
       # %     ! 
& 
 $''*  	    	 
             "  %         !  
 & 
 '*''	' ' 	'
 ' ' ' ' ' ' ' %' ' ' '  "!' 
'8 	H4f )+l )up 
 %('*	  	
     # %      
$ 
 $''* 	     	 
         "  %            
 $ 
 '*'	'' ' 	'
 ' ' ' ' ' %' ' ' ' ' "' 
'2 	}4H +/	] &*$$'+"& $!]]	] ] (	] ] ] $] ] ] %] ] ] ]  ]  !]@``,`;H``r'   