
    {-jJ                   2   d dl mZ d dlmZmZmZ d dlZd dlmc m	Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ erd d	lmZ d d
lmZ e	 	 	 dadbd            ZdcdZe	 	 	 	 	 ddded%            Ze	 	 	 	 	 dddfd(            Ze	 	 	 	 	 dddgd*            Z	 	 	 	 	 	 dhd,Zdid0Zdid1Ze	 	 	 djdddddd2dkd:            Ze	 	 	 djdddddd2dld;            Ze	 	 	 djdddddd2dmd<            Z	 	 	 dndd=dddd2d>Ze	 	 	 djddddd?dodA            Ze	 	 	 djddddd?dpdB            Ze	 	 	 djddddd?dqdC            Z	 	 	 dndd=ddd?dDZe	 	 	 	 	 	 	 drdsdI            Z e	 	 	 	 	 	 	 drdtdJ            Z e	 	 	 	 	 	 	 drdudK            Z 	 	 	 	 	 	 	 dvdLZ 	 	 	 	 	 	 	 	 	 	 	 	 	 dwdOZ!	 	 	 	 	 	 	 	 	 	 	 	 	 dwdPZ"e	 	 	 	 	 	 	 	 dxdydR            Z#e	 	 	 	 	 	 	 	 dxdzdS            Z#e	 	 	 	 	 	 	 	 dxd{dT            Z#	 	 	 	 	 	 	 	 d|dUZ#	 d}d+dddddd=dddddVd~d]Z$dd`Z%dS )    )annotations)TYPE_CHECKINGLiteraloverloadN)_C_ops)in_dynamic_or_pir_mode)LayerHelper)signature_safe_contextmanager)
SDPBackend_get_enabled_backendssdpa_kernel)	Generator)TensorFTenable_mathboolenable_flashenable_mem_efficientreturnGenerator[None, None, None]c              #  ^  K   g }|r|                     t          j                   |r|                     t          j                   | r|                     t          j                   |st          d          t          |          5 }	 |V  n# w xY w	 ddd           dS # 1 swxY w Y   dS )z
    With the sdp_kernel context manager, different algorithm implementations can
    be selected for scaled_dot_product_attention.
    z$At least one backend must be enabledN)appendr   FLASH_ATTENTIONEFFICIENT_ATTENTIONMATH
ValueErrorr   )r   r   r   backend_listcontexts        d/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/nn/functional/flash_attention.py
sdp_kernelr   %   s      L 8J6777 <J:;;; -JO,,, A?@@@	\	"	" g	MMMDDDDD	                 s*   B"
BB"BB""B&)B&xr   c                x    t          j        | d          }d|_        t          j        |d          }d|_        |S )Ng     T   )diagonal)paddle	full_likestop_gradienttriu)r    masks     r   get_triangle_upper_maskr)   B   s=    At$$DD;ta(((DDK    .querykeyvaluer(   dropout_ratefloatcausalreturn_softmaxLiteral[False]trainingscalefloat | Nonetuple[Tensor, None]c	                    d S N 	r+   r,   r-   r(   r.   r0   r1   r3   r4   s	            r   _math_attentionr;   J   	     #r*   Literal[True]tuple[Tensor, Tensor]c	                    d S r8   r9   r:   s	            r   r;   r;   X   	      Cr*   tuple[Tensor, Tensor | None]c	                    d S r8   r9   r:   s	            r   r;   r;   f   	     $'3r*           c	                   | j         d         }	t          j        | g d          } t          j        |g d          }t          j        |g d          }|p|	dk    r|	dz  nd}t          j        | |z  |d          }
|s||
|z   }
t	          j        |
          }nt          j                    }d	|v sBd
|v s>|
j         d         dk     s-|
j         d         dk    s|
j         d         |
j         d         k    r)t          |
          }|
|z   }
t	          j        |
          }nt          j        	                    |
          }|dk    rt	          j
        |||d          }t          j        ||          }t          j        |g d          }||r|ndfS )z
    This is a basic implementation of scaled dot product attention composed of
    combinations of fundamental components.
    )r      r"      r         g      ?T)r    ytranspose_yNxpucpu    i @  rD   upscale_in_train)r3   mode)shaper$   	transposematmulFsoftmax
get_devicer)   incubate softmax_mask_fuse_upper_triangledropout)r+   r,   r-   r(   r.   r0   r1   r3   r4   head_dimproductweightsplaceouts                 r   r;   r;   t   s     {2HULLL11E

3
-
-CULLL11E?Ahnn3EmeemsEEEG PnG)G$$ !##UNN~~}R 2%%}R 5((}R GM"$555 +733DnGi((GGoFFwOOGc)\H;M
 
 
 -
'
'C

3
-
-C>3t33r*   r[   intstrc                    | dk    rdS dS )N   
flash_attnmem_efficientr9   )r[   s    r   _select_sdp_cudarf      s    3|r*   c                ,   t          j                    }d|v rdS t                      }|st          d          t          j        |v }t          j        |v }t          j        |v }|du r|du r|du rdS d|vrdS |du r|du rt          |           S |du rdS dS )	z
    There are currently three different implementation options available for
    scaled dot product attention, and the chosen approach depends on whether it
    is determined by the sdp_kernel configuration or specified through input values.
    rL   rd   z@No available backend for scaled_dot_product_attention was found.TFmathgpure   )	r$   rW   r   AssertionErrorr   r   r   r   rf   )r[   r^   enabled_backendsr   r   r   s         r   _select_sdprl      s     E~~|,.. 
N
 
 	
 /%55K-1AAL%9=MMd5  %9U%B%B66t 4 < <)))t|?r*   )fixed_seed_offsetrng_namer3   namesoftmax_scalerZ   rm   Tensor | Nonern   ro   
str | Nonerp   c                   d S r8   r9   r+   r,   r-   rZ   r0   r1   rm   rn   r3   ro   rp   s              r   flash_attentionru      s	     #r*   c                   d S r8   r9   rt   s              r   ru   ru      s	      Cr*   c                   d S r8   r9   rt   s              r   ru   ru      s	     $'3r*    c                  | j         d         }t          |          }|dk    r|dt          j                    v rd}nadt          j                    v rd}nIt          j        dg          d         rd}n+t          j        j                            dg          d         }t                      s|dk    s
J d            |d	k    s|dk    s
J d
            |r|dk    s
J d            ||dk    s
J d            |dk    s|dk    s
J d            |s|dk    s
J d            |	|dk    s
J d            |
|dk    s
J d            t                      r|dk    r+t          j	        | |||d|||| |
  
        \  }}}}||r|ndfS |dk    r<|
| j         d         dz  }
t          j
        | ||dddd|
|ddd	dddd          \  }}|dfS t          d|           t          d#i t                      }|                    d          }|                    |          }|                    |          }|                    t          j                  }|                    t          j                  }| |||d}||||d}|                    d|||||| |d           ||r|ndfS |dk    rdd lm}  || ||d|d|!          }|dfS t+          | ||||||"          S )$a>  
    The equation is:

    .. math::

        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.

    Warning:
        This API is only support inputs with dtype float16 and bfloat16.

    Args:
        query(Tensor): The query tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        key(Tensor): The key tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        value(Tensor): The value tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        dropout(float): The dropout ratio.
        causal(bool): Whether enable causal mode.
        return_softmax(bool): Whether to return softmax.
        fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
        training(bool): Whether it is in the training phase.
        rng_name(str): The name to select Generator.
        name(str|None, optional): The default value is None. Normally there is no need for user
                        to set this property. For more information, please refer to
                        :ref:`api_guide_Name`.

    Returns:
        out(Tensor): The attention tensor.
                    4-D tensor with shape: [batch_size, seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
        softmax(Tensor): The softmax tensor. None if return_softmax is False.

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> paddle.seed(2023)
            >>> q = paddle.rand((1, 128, 2, 16))

            >>> output = paddle.nn.functional.flash_attention.flash_attention(q, q, q, 0.9, False, False)
            >>> print(output)
            (Tensor(shape=[1, 128, 2, 16], dtype=float32, place=Place(cpu), stop_gradient=True,
            [[[[0.34992966, 0.34456208, 0.45826620, ..., 0.39883569,
                0.42132431, 0.39157745],
               [0.76687670, 0.65837246, 0.69117945, ..., 0.82817286,
                0.76690865, 0.71485823]],
              ...,
              [[0.71662450, 0.57275224, 0.57053083, ..., 0.48108247,
                0.53336465, 0.54540104],
               [0.59137970, 0.51350880, 0.50449550, ..., 0.38860250,
                0.40526697, 0.60541755]]]]), None)

    rH   rd   rL   rG   iluvatar_gpuFLAGS_cudnn_deterministicFLAGS_flash_attn_versionz2flash attention 3 only support dynamic or pir moderD   z*flash attention 3 does not support dropoutz1flash attention 3 does not support return softmaxNz6flash attention 3 does not support setting seed_offsetrx   z3flash attention 3 does not support setting rng_namez3flash attention 3 does not support setting trainingz/flash attention 3 does not support setting namez8flash attention 2 does not support setting softmax_scalerF   rI   r"   Fr   !Invalid flash attention version: qinput_param_name)r~   kvrm   r_   rV   softmax_lseseed_offsetrZ   r0   r1   is_testrn   typeinputsoutputsattrsre   memory_efficient_attention	attn_biaspr4   r3   r.   r0   r1   r3   )rd   )rR   rl   r$   rW   	get_flagsbase	frameworkr   r   rd   flash_attn_v3r   r	   localsinput_dtype"create_variable_for_type_inferencefloat32int64	append_op-paddle.incubate.nn.memory_efficient_attentionr   r;   )r+   r,   r-   rZ   r0   r1   rm   rn   r3   ro   rp   r[   sdp_func_name
fa_versionresult_attentionresult_softmax_r_   r   helperdtyperV   r   r   r   r   outputs                              r   ru   ru     sQ   ^ {1~H))M$$F%''''JJv02222JJ:;<<'
 	* JJ.88+, (*J &'' 	
:???@ ,;?: #~~q8 "10 " 	
Z1___? &5_4 !(J!OOOD -<O; 2~~qA "10  	
:???A +?* |zQ=  /. $
aF )87 "## +	Q;A;L%" L< <8!>1a (&4>NN$  q ($)KO$=M#)#7!!$ $ [$ Dy  D
DD   66VXX66""C"8877>>;;EBB??OO??MM!2	
 
 &&	
 
 	" "0'<$ 	 	 	
 	
 	
 ~7GG477O++      0/!  F 4<"$-!   r*   )rm   rn   r3   ro   qkvc                   d S r8   r9   r   rZ   r0   r1   rm   rn   r3   ro   s           r   flash_attn_qkvpackedr     r<   r*   c                   d S r8   r9   r   s           r   r   r     r@   r*   c                   d S r8   r9   r   s           r   r   r      rC   r*   c               \   | j         d         }t          |          }	|	dk    rt                      r)t          j        | |d|||| |          \  }
}}}|
|r|ndfS t          di t                      }|                    d          }|                    |          }|                    |          }|                    t          j
                  }|                    t          j                  }| |d}||||d}|                    d|||||| |d	
           ||r|ndfS | ddddddf                             ddd| j         d         g          }| dddddf         }| dddddf         }|	dk    rddlm}  ||||d|d|          }|dfS t!          |||||||          S )a'
  
    The equation is:

    .. math::

        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.

    Warning:
        This API only supports inputs with dtype float16 and bfloat16.
        Don't call this API if flash_attn is not supported.

    Args:
        qkv(Tensor): The query/key/value packed tensor in the Attention module.
                        5-D tensor with shape:
                        [batchsize, seqlen , num_heads/num_heads_k + 2, num_heads_k, head_dim].
                        The dtype can be float16 or bfloat16.
        dropout(float): The dropout ratio.
        causal(bool): Whether enable causal mode.
        return_softmax(bool): Whether to return softmax.
        fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
        training(bool): Whether it is in the training phase.
        rng_name(str): The name to select Generator.
        name(str|None, optional): The default value is None. Normally there is no need for user
                        to set this property. For more information, please refer to
                        :ref:`api_guide_Name`.

    Returns:
        - out(Tensor). The attention tensor. 4-D tensor with shape: [batch_size, seq_len, num_heads, head_dim]. The dtype can be float16 or bfloat16.
        - softmax(Tensor). The softmax tensor. None if return_softmax is False.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('flash_attn need A100 compile')
            >>> import paddle

            >>> paddle.seed(2023)
            >>> q = paddle.rand((1, 128, 2, 16))
            >>> qkv = paddle.stack([q, q, q], axis=2)
            >>> output = paddle.nn.functional.flash_attn_qkvpacked(qkv, 0.9, False, False)
            >>> print(output)
            (Tensor(shape=[1, 128, 2, 16], dtype=float32, place=Place(cpu), stop_gradient=True,
            [[[[0.34992966, 0.34456208, 0.45826620, ..., 0.39883569,
                0.42132431, 0.39157745],
               [0.76687670, 0.65837246, 0.69117945, ..., 0.82817286,
                0.76690865, 0.71485823]],
              ...,
              [[0.71662450, 0.57275224, 0.57053083, ..., 0.48108247,
                0.53336465, 0.54540104],
               [0.59137970, 0.51350880, 0.50449550, ..., 0.38860250,
                0.40526697, 0.60541755]]]]), None)
            >>> # doctest: -SKIP

    rF   rd   Nr   r   r   )r   rm   r   r   r   rO   r   re   r   r   r   )r   )rR   rl   r   r   r   r	   r   r   r   r$   r   r   r   reshaper   r   r;   )r   rZ   r0   r1   rm   rn   r3   ro   r[   r   r   r   r   r   r   r_   rV   r   r   r   r   r+   r,   r-   r   r   s                             r   r   r     s   J y}H))M$$!## 	P +!	 	  $~%O^^4OO@@vxx@@""E"::77>>;;EBB??OO??MM!2
 

 &&	
 
 	'" "0'<$ 	 	 	
 	
 	
 ~7GG477 AAAqqq#2#I&&1b#)B-'@AA!!!QQQ(mAAAqqq"HO++      0/!  F 4<"$-!   r*   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kc                    d S r8   r9   r+   r,   r-   r   r   r   r   r4   rZ   r0   r1   rm   rn   r3   ro   s                  r   flash_attn_unpaddedr     s	    " #r*   c                    d S r8   r9   r   s                  r   r   r     s	    "  Cr*   c                    d S r8   r9   r   s                  r   r   r     s	    " $'3r*   c                   t                      r.t          j        | |||||d|||||	|
| |          \  }}||
r|ndfS t          d	i t	                      }|                    d          }|                    |          }|                    |          }|                    t          j                  }|                    t          j	                  }| |||||d}||||d}|
                    d|||||||	|
| |d           ||
r|ndfS )
a  
    The equation is:

    .. math::

        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.

    Warning:
        This API is only support inputs with dtype float16 and bfloat16.

    Args:
        query(Tensor): The query tensor in the Attention module.
                        3-D tensor with shape:
                        [total_seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        key(Tensor): The key tensor in the Attention module.
                        3-D tensor with shape:
                        [total_seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        value(Tensor): The value tensor in the Attention module.
                        3-D tensor with shape:
                        [total_seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index query.
        cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index key and value.
        max_seqlen_q(int): Maximum sequence length of query in the batch.
        max_seqlen_k(int): Maximum sequence length of key/value in the batch.
        scale(float): The scaling of QK^T before applying softmax.
        dropout(float, optional): The dropout ratio.
        causal(bool, optional): Whether enable causal mode.
        return_softmax(bool, optional): Whether to return softmax.
        fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
        rng_name(str, optional): The name to select Generator.
        training(bool, optional): Whether it is in the training phase.
        name(str|None, optional): The default value is None. Normally there is no need for user
                        to set this property. For more information, please refer to
                        :ref:`api_guide_Name`.

    Returns:
        out(Tensor): The attention tensor.
                    3-D tensor with shape: [total_seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
        softmax(Tensor): The softmax tensor. None if return_softmax is False.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> paddle.seed(2023)
            >>> q = paddle.rand((2, 128, 8, 16), dtype='float16')
            >>> cu = paddle.arange(0, 384, 128, dtype='int32')
            >>> qq = paddle.reshape(q, [256, 8, 16])
            >>> output = paddle.nn.functional.flash_attention.flash_attn_unpadded(qq, qq, qq, cu, cu, 128, 128, 0.25, 0.0, False, False)

    Nr   r~   r   )r~   r   r   r   r   rm   r   r   r   r4   rZ   r0   r1   r   rn   r   )r   )r   r   r   r	   r   r   r   r$   r   r   r   )r+   r,   r-   r   r   r   r   r4   rZ   r0   r1   rm   rn   r3   ro   r   r   r   r   r_   rV   r   r   r   r   s                            r   r   r     s   \  L &L
 
	
$  >!KtKK;;&((;;F44E

3
3E
:
:C77>>G;;FNKKK;;FLIIK$$. F ""	 G "((,#| 	
 	
	     >3t33r*   rF   rF   r"   c                F    t          | |||||||||	|
|||||||||          S r8   )flash_attn_varlen_func)r+   r,   r-   r   r   r   r   	seqused_q	seqused_krp   r0   qv	q_descale	k_descale	v_descalewindow_sizesoftcap
num_splitspack_gqa	sm_margins                       r   flash_attention_v3_varlenr   m  sR    , "
)  r*   c                   dt          j                    vs
J d            t          j        dg          d         r
J d            t           j        j                            dg          d         dk    s
J d            t                      s
J d            |
J d
            |	"| j        d         ||j        d         ndz   dz  }	t          j        | |||||||||||||	|
|d         |d         |||d	u||nd|          \  }}||fS )a
  
    The equation is:
    .. math::
        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V
    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.
    This is the varlen version of flash attention.
    Warning:
        This API is only support inputs with dtype float16 and bfloat16.
    Args:
        query(Tensor): The query tensor in the Attention module.
                        3-D tensor with shape:
                        [token_num, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        key(Tensor): The key tensor in the Attention module.
                        3-D tensor with shape:
                        [token_num, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        value(Tensor): The value tensor in the Attention module.
                        3-D tensor with shape:
                        [token_num, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index query.
        cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index key and value.
        causal(bool): Whether enable causal mode.
        softmax_scale(float): The softmax scale of the attention.
        max_seqlen_q(int): Maximum sequence length of query in the batch. Note it's the padding length, not the max actual seqlen.
        max_seqlen_k(int): Maximum sequence length of key/value in the batch.
    Returns:
        out(Tensor): The attention tensor. 3-D tensor with shape: [token_num, num_heads, head_dim]. The dtype can be float16 or bfloat16.
        softmax(Tensor): The softmax tensor. None if return_softmax is False.
    Examples:
        .. code-block:: python
            >>> # doctest: +SKIP('flash_attn_v3 need H100 compile')
            >>> import paddle
            >>> paddle.seed(2023)
            >>> q = paddle.rand((10, 2, 128), dtype="bfloat16")
            >>> cu_seqlens_q = paddle.to_tensor([0, 10], dtype="int32")
            >>> max_seq_len_q = 10
            >>> output = paddle.nn.functional.flash_attention.flash_attention_v3_varlen(q, q, q, cu_seqlens_q, cu_seqlens_q, max_seqlen_q=max_seq_len_q, max_seqlen_k=max_seq_len_q, causal=True)
            >>> # doctest: -SKIP
    rL   z.flash_attn_varlen_func is not supported on xpur{   z5flash_attn_varlen_func does not support deterministicr|   rH   zDFLAGS_flash_attn_version is 2, conflicts with flash_attn_varlen_funcz7flash_attn_varlen_func only support dynamic or pir modeNz2flash_attn_varlen_func does not support setting qvrF   r   rI   r"   F)	r$   rW   r   r   r   r   rR   r   flash_attn_v3_varlen)r+   r,   r-   r   r   r   r   r   r   rp   r0   r   r   r   r   r   r   r   r   r   r_   r   s                         r   r   r     s   F )+++++8 ,++ !< =>># ? ?>? ? 
 	'')C(DEE&	
 	 	 	 N		 	 	 "##  A # ::K:::KOr~rx||1E 2
AA(e- C0 r*   varlen_paddedc                    d S r8   r9   r   r   r   r   r   r4   rZ   r0   r1   rm   rn   r   r3   ro   s                 r   flash_attn_varlen_qkvpackedr     s	      #r*   c                    d S r8   r9   r   s                 r   r   r   &  s	       Cr*   c                    d S r8   r9   r   s                 r   r   r   9  s	      $'3r*   c                   t                      r-t          j        | |||	d||||||| |
|          \  }}||r|ndfS t          d	i t	                      }|                    d          }|                    |          }|                    |          }|                    t          j                  }|                    t          j	                  }| |||	d}||||d}|
                    d||||||||| |
d           ||r|ndfS )
a
  
    The equation is:

    .. math::

        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.

    Warning:
        This API only supports inputs with dtype float16 and bfloat16.

    Args:
        qkv(Tensor): The padded query/key/value packed tensor in the Attention module. The padding part won't be computed
                        4-D tensor with shape:
                        [total_seq_len, num_heads/num_heads_k + 2, num_heads_k, head_dim].
                        The dtype can be float16 or bfloat16.
        cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index query.
        cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index key and value.
        max_seqlen_q(int): Maximum sequence length of query in the batch. Note it's the padding length, not the max actual seqlen
        max_seqlen_k(int): Maximum sequence length of key/value in the batch.
        scale(float): The scaling of QK^T before applying softmax.
        dropout(float, optional): The dropout ratio.
        causal(bool, optional): Whether enable causal mode.
        return_softmax(bool, optional): Whether to return softmax.
        fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
        rng_name(str, optional): The name to select Generator.
        training(bool, optional): Whether it is in the training phase.
        name(str|None, optional): The default value is None. Normally there is no need for user
                        to set this property. For more information, please refer to
                        :ref:`api_guide_Name`.

    Returns:
        - out(Tensor). The attention tensor. The tensor is padded by zeros. 3-D tensor with shape: [total_seq_len, num_heads, head_dim]. The dtype can be float16 or bfloat16.
        - softmax(Tensor). The softmax tensor. None if return_softmax is False.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('flash_attn need A100 compile')
            >>> import paddle
            >>> paddle.seed(2023)
            >>> q = paddle.rand((2, 128, 8, 16), dtype='float16')
            >>> cu = paddle.arange(0, 384, 128, dtype='int32')
            >>> qq = paddle.reshape(q, [256, 8, 16])
            >>> qkv = paddle.stack([qq, qq, qq], axis=2)
            >>> output = paddle.nn.functional.flash_attn_varlen_qkvpacked(qkv, cu, cu, 128, 128, 0.25, 0.0, False, False)
            >>> # doctest: -SKIP

    Nr   r   r   )r   r   r   rm   r   r   r   )r   )r   r   r   r	   r   r   r   r$   r   r   r   )r   r   r   r   r   r4   rZ   r0   r1   rm   rn   r   r3   ro   r   r   r   r   r_   rV   r   r   r   r   s                           r   r   r   L  s   L  L .L
 
	
"  >!KtKKCC&((CCF66E

3
3E
:
:C77>>G;;FNKKK;;FLIIK$$.	 F ""	 G *((,#| 	
 	
	     >3t33r*   )rZ   r0   r   return_softmax_lsereturn_seed_offsetrm   rn   r3   ro   rp   
block_maskstartend_row_indicesr   int | tuple | Noner   r   r   c               z   |Yt          |t                    r||f}| j        d         }| j        d         }|
J d            |rnt          j        |d         dz   ||d         z   dz   d                              dd|df          }t          j        ||                              |d          }nt          j        dd|dfd          }t          j        |d         dz   ||d         z   dz   d          |dddddf<   t          j        |d          ||d         z
  d          |dddddf<   t          j        |d|	                              |d          }||
J d
            |%t          j
        | |||	d||d| |

  
        \  }}}}n{|j        t          j        k    sJ d|j                     t          |j                  dk    sJ d|j                     |j        d         |j        d         k    s&J d|j        d          d|j        d                      |j        d         |j        d         k    s&J d|j        d          d|j        d                      |j        d         d| j        d         |j        d         fv s
J d            ||j        t          j        k    sJ d|j                     |j        d         |j        d         k    s&J d|j        d          d|j        d                      |j        d         |j        d         k    s&J d|j        d          d|j        d                      |j        d         | j        d         dz   dz  k    s
J d            |j        d         |j        d         dz   dz  k    s
J d            |j        d         dk    s
J d            |rE|j        d         dk    rd}nv|j        d         dk    rd}nbt          d|j        d                    |j        d         dk    rd}n1|j        d         dk    rd}nt          d|j        d                    dt          j                    vr't          j        d g          d          r|
J d!            dt          j                    v rd}nt          j        j                            d"g          d"         dk    r?t          j        j                            d g          d          r| j        d         dk    rd}n+t          j        j                            d"g          d"         }|dk    r<|
J d#            |
J d$            t          j        | ||||	||d| |

  
        \  }}}}n|dk    r|d%k    s
J d&            |r
J d'            |	
J d(            |
d)k    s
J d*            |s
J d+            |
J d,            || j        d         d-z  }t          j        | ||||||          \  }}nt          d.|           |g}|r||gz  }|r||gz  }t          |          dk    r|d         S |S )/a  
    FlashMask: Official Implementation

    This module provides the official implementation of the FlashMask algorithm as described in the paper. For more details, please refer to the paper available at: https://arxiv.org/abs/2410.01359.

    The core equation utilized in FlashMask is as follows:

    .. math::

        \text{result} = \text{softmax}\left(\frac{Q \cdot K^T}{\sqrt{d}} + M\right) \cdot V

    In this equation:

        - ``Q``, ``K``, and ``V`` are the input tensors to the attention module.
        - All these tensors share the same dimensions.
        - ``d`` denotes the size of the last dimension of these tensors.
        - ``M`` represents the column-wise sparse mask introduced by FlashMask.

    Args:
        query (Tensor):  The query tensor in the attention module.
            A 4-D tensor with shape [batch_size, q_seq_len, num_heads, head_dim].
            The dtype can be float16 or bfloat16.
        key (Tensor): The key tensor in the attention module.
            A 4-D tensor with shape [batch_size, k_seq_len, k_num_heads, head_dim].
            The dtype can be float16 or bfloat16.
        value (Tensor): The value tensor in the attention module.
            A 4-D tensor with shape [batch_size, k_seq_len, k_num_heads, head_dim].
            The dtype can be float16 or bfloat16.
        startend_row_indices(Tensor):
            A column-wise sparse attention mask row indices tensor.
            A 4-D tensor with shape [batch_size, k_num_heads, k_seq_len, {1, 2, 4}].
            The dtype must be int32. k_num_heads can be 1 or the same as key's num_heads. When num_heads is 1, it will be broadcast to match key's num_heads.
            Depending on the value of the causal parameter, startend_row_indices can take different shapes and meanings.

            - When `causal=True` and the shape is [batch_size, k_num_heads, k_seq_len, 1],
              indicating unidirectional attention. The value represents the starting row index of the left
              lower triangular mask in the dense mask. The value startend_row_indices[..., 0] indicates that elements in the lower left triangle of the attention score matrix starting from the startend_row_indices[..., 0]-th row downwards (inclusive) will be masked.
            - When `causal=True` and the shape is [batch_size, k_num_heads, k_seq_len, 2],
              indicating unidirectional attention. The values represent the starting and ending row indices of
              the left lower triangular mask in the dense mask. The values startend_row_indices[..., 0:2] in startend_row_indices indicate that elements in the lower left triangle of the attention score matrix starting from the startend_row_indices[..., 0]-th row downwards (inclusive) but above the startend_row_indices[..., 1]-th row (exclusive) will be masked.
            - When `causal=False` and the shape is [batch_size, k_num_heads, k_seq_len, 2],
              indicating bidirectional attention. The values represent the starting row index of the left
              lower triangular mask and the ending row index of the right upper triangular mask in the dense mask. The values startend_row_indices[..., 0:2] in startend_row_indices indicate that elements in the lower left triangle of the attention score matrix starting from the startend_row_indices[..., 0]-th row downwards (inclusive) will be masked, and elements in the upper right triangle starting from the startend_row_indices[..., 1]-th row upwards (exclusive) will be masked.
            - When `causal=False` and the shape is [batch_size, k_num_heads, k_seq_len, 4] ,
              indicating bidirectional attention. The values represent the start and end row indices of the
              left lower triangular mask and the start and end row indices of the right upper triangular mask in the dense mask. The values startend_row_indices[..., 0:4] in startend_row_indices indicate that elements in the lower left triangle of the attention score matrix starting from the startend_row_indices[..., 0]-th row downwards (inclusive) but above the startend_row_indices[..., 1] row (exclusive) will be masked, and elements in the upper right triangle starting from the startend_row_indices[..., 2]-th row downwards (inclusive) but above the startend_row_indices[..., 3] row (exclusive) will be masked.

        dropout (float): The dropout ratio. Default is 0.0.
        causal (bool): Whether to enable causal mode. Default is False.
        window_size (int|tuple, optional): Indicates the window size of sliding window local attention.
            If causal mode is enabled, Query at position i will only attend to keys between [i - window_size, i] or [i - window_size[0], i].
            If causal mode is disabled, Query at position i will only attend to keys between [i - window_size, i + window_size] or [i - window_size[0], i + window_size[1]].
        return_softmax_lse (bool): Whether to return the log-sum-exp of the softmax. Default is False.
        return_seed_offset (bool): Whether to return the random seed offset. Default is False.
        fixed_seed_offset(Tensor, optional): With fixed seed, offset for dropout mask.
        rng_name (str): The name to select Generator.
        training (bool): Whether the module is in training mode. Default is True.
        name (str, optional): Name of the operation. Default is None. Normally, users do not need to set this property.
            For more information, refer to :ref:`api_guide_Name` .
        block_mask (tensor, optional):
            A 4-D integer mask tensor indicating whether each block in the attention matrix should be kept or masked. Must be used together with flashmask.
            The shape should be [batch_size, num_heads, blocklen_q, blocklen_k], where:

            blocklen_q = ceil(seqlen_q / 128), i.e., block_mask.shape[2] must be (seqlen_q + 127) // 128
            blocklen_k = ceil(seqlen_k / 128), i.e., block_mask.shape[3] must be (seqlen_k + 127) // 128
            block_mask.shape[1] (number of heads) must match the num_heads dimension of the flashmask
            Both seqlen_q and seqlen_k must be less than or equal to 128 * 1024
            The dtype should be int32, and each element should be either 0 or 1.
            A value of 1 indicates that the corresponding block is kept (not masked), while 0 means the block is masked.

            Usage Notes:

            Only supported when blockdim_q = blockdim_k = 128 now.
            Only supported when headdim = 128 now.
            This argument must be provided together with flashmask.
            The mask will be applied at the block level: each [i, j] position in block_mask controls whether the corresponding [128 x 128] block in the attention matrix is masked.
            Any mismatch in expected shape or head dimension will raise an error.


    Returns
        Tensor. The computed attention result with the same shape as the input `query`.

    Warning:
        This API only supports inputs with dtype float16 and bfloat16.

    Hint:
        This API supports GQA.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('flash_attn need A100 compile')
            >>> import paddle
            >>> paddle.seed(2023)
            >>> q = paddle.rand((1, 10, 2, 32),dtype="bfloat16") # shape: [batch_size, seq_len, num_heads, head_dim]
            >>> k = paddle.rand((1, 10, 2, 32),dtype="bfloat16") # shape: [batch_size, seq_len, num_heads, head_dim]
            >>> v = paddle.rand((1, 10, 2, 32),dtype="bfloat16") # shape: [batch_size, seq_len, num_heads, head_dim]
            >>> startend_row_indices = paddle.to_tensor([8]*10 + [5]*10, dtype="int32").reshape([1, 2, 10, 1])
            >>> output = paddle.nn.functional.flashmask_attention(q, k, v, startend_row_indices, causal=True)
            >>> print(output)
            Tensor(shape=[1, 10, 2, 32], dtype=bfloat16, place=Place(gpu:0), stop_gradient=True,
                [[[[0.82421875, 0.27539062, 0.80859375, 0.98046875, 0.00251770,
                    0.41992188, 0.17285156, 0.11767578, 0.42773438, 0.31250000,
                    0.34570312, 0.70312500, 0.29296875, 0.44531250, 0.51562500,
                    0.96093750, 0.85546875, 0.15625000, 0.34765625, 0.98437500,
                    0.96484375, 0.45312500, 0.33593750, 0.56640625, 0.07714844,
                    0.43750000, 0.83984375, 0.66796875, 0.93750000, 0.24804688,
                    0.51171875, 0.55468750],
                    [0.54687500, 0.74609375, 0.43164062, 0.32421875, 0.10693359,
                    0.37304688, 0.53906250, 0.17187500, 0.57421875, 0.75000000,
                    0.13378906, 0.57031250, 0.19531250, 0.01403809, 0.29101562,
                    0.14257812, 0.07568359, 0.88671875, 0.75390625, 0.17089844,
                    0.87109375, 0.93359375, 0.89843750, 0.58203125, 0.75390625,
                    0.27539062, 0.67968750, 0.24804688, 0.57812500, 0.67578125,
                    0.92578125, 0.98046875]],

                    [[0.59765625, 0.62890625, 0.62109375, 0.75781250, 0.03295898,
                    0.64062500, 0.27929688, 0.20800781, 0.72265625, 0.52343750,
                    0.53125000, 0.61718750, 0.57421875, 0.56640625, 0.65625000,
                    0.48242188, 0.68359375, 0.42968750, 0.26562500, 0.86718750,
                    0.83203125, 0.40820312, 0.38281250, 0.59765625, 0.43945312,
                    0.22851562, 0.86328125, 0.51562500, 0.89453125, 0.62500000,
                    0.50390625, 0.67968750],
                    [0.34765625, 0.61328125, 0.58593750, 0.60156250, 0.43164062,
                    0.41601562, 0.71093750, 0.59765625, 0.53515625, 0.78125000,
                    0.13867188, 0.30664062, 0.48828125, 0.04394531, 0.24316406,
                    0.18847656, 0.10644531, 0.71093750, 0.69140625, 0.35937500,
                    0.44531250, 0.81640625, 0.44140625, 0.64062500, 0.81640625,
                    0.61328125, 0.72265625, 0.53125000, 0.49414062, 0.59765625,
                    0.54296875, 0.61328125]],

                    [[0.65234375, 0.47656250, 0.71875000, 0.64843750, 0.23828125,
                    0.61328125, 0.29101562, 0.26562500, 0.54296875, 0.60937500,
                    0.67187500, 0.67578125, 0.64062500, 0.41406250, 0.47656250,
                    0.40820312, 0.66406250, 0.39453125, 0.39453125, 0.62109375,
                    0.58593750, 0.31054688, 0.31835938, 0.45703125, 0.52343750,
                    0.43164062, 0.64453125, 0.49804688, 0.82812500, 0.48242188,
                    0.38476562, 0.59375000],
                    [0.44921875, 0.62109375, 0.50390625, 0.51562500, 0.51953125,
                    0.57812500, 0.78515625, 0.73437500, 0.60546875, 0.55078125,
                    0.30273438, 0.23339844, 0.60546875, 0.33007812, 0.23242188,
                    0.30468750, 0.34570312, 0.70703125, 0.72656250, 0.58593750,
                    0.40234375, 0.62109375, 0.62109375, 0.69531250, 0.66796875,
                    0.51562500, 0.45898438, 0.67968750, 0.48828125, 0.50000000,
                    0.54687500, 0.71875000]],

                    [[0.67578125, 0.50000000, 0.58203125, 0.62109375, 0.43554688,
                    0.69531250, 0.30273438, 0.24023438, 0.57812500, 0.63671875,
                    0.51171875, 0.52734375, 0.60546875, 0.45507812, 0.42382812,
                    0.46093750, 0.55859375, 0.34960938, 0.39453125, 0.57031250,
                    0.55078125, 0.47265625, 0.24609375, 0.51953125, 0.46093750,
                    0.49218750, 0.49609375, 0.60156250, 0.76953125, 0.57421875,
                    0.40429688, 0.57031250],
                    [0.45703125, 0.71093750, 0.58984375, 0.43164062, 0.54296875,
                    0.57031250, 0.72265625, 0.61328125, 0.64453125, 0.50781250,
                    0.28125000, 0.19531250, 0.60546875, 0.40625000, 0.18554688,
                    0.33203125, 0.40039062, 0.58593750, 0.79687500, 0.45507812,
                    0.32812500, 0.58203125, 0.70703125, 0.64453125, 0.53906250,
                    0.57421875, 0.48828125, 0.53515625, 0.49804688, 0.50000000,
                    0.48437500, 0.55468750]],

                    [[0.64453125, 0.43164062, 0.54687500, 0.53125000, 0.42187500,
                    0.71484375, 0.30273438, 0.21484375, 0.50390625, 0.69531250,
                    0.58203125, 0.51562500, 0.61328125, 0.41992188, 0.40039062,
                    0.46679688, 0.58984375, 0.39062500, 0.41992188, 0.49023438,
                    0.47851562, 0.47070312, 0.30078125, 0.50390625, 0.47656250,
                    0.44921875, 0.43164062, 0.63671875, 0.78125000, 0.60156250,
                    0.48242188, 0.58203125],
                    [0.52343750, 0.69921875, 0.58984375, 0.35156250, 0.49218750,
                    0.58593750, 0.71093750, 0.59375000, 0.66406250, 0.49414062,
                    0.24023438, 0.18554688, 0.66796875, 0.50000000, 0.23144531,
                    0.29882812, 0.49414062, 0.57031250, 0.70312500, 0.42773438,
                    0.35351562, 0.47460938, 0.73437500, 0.53125000, 0.47070312,
                    0.49609375, 0.50000000, 0.55078125, 0.50000000, 0.45898438,
                    0.45703125, 0.61328125]],

                    [[0.63671875, 0.41210938, 0.52734375, 0.56640625, 0.44531250,
                    0.64843750, 0.37890625, 0.31250000, 0.56640625, 0.62890625,
                    0.53125000, 0.51562500, 0.54296875, 0.50781250, 0.35546875,
                    0.41601562, 0.55468750, 0.36914062, 0.35937500, 0.45117188,
                    0.46875000, 0.49609375, 0.28710938, 0.50000000, 0.49609375,
                    0.50000000, 0.51562500, 0.57031250, 0.77734375, 0.62109375,
                    0.43164062, 0.50781250],
                    [0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ]],

                    [[0.62109375, 0.44531250, 0.46875000, 0.61328125, 0.39062500,
                    0.60156250, 0.41015625, 0.28710938, 0.58984375, 0.67968750,
                    0.55859375, 0.48632812, 0.51562500, 0.42382812, 0.37695312,
                    0.46679688, 0.54687500, 0.44921875, 0.33789062, 0.36328125,
                    0.49023438, 0.44140625, 0.25000000, 0.45312500, 0.43945312,
                    0.45507812, 0.46679688, 0.57812500, 0.65625000, 0.64062500,
                    0.42382812, 0.57031250],
                    [0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ]],

                    [[0.62500000, 0.47070312, 0.51562500, 0.61328125, 0.36718750,
                    0.66406250, 0.37890625, 0.28320312, 0.65625000, 0.66015625,
                    0.48632812, 0.53906250, 0.46679688, 0.47851562, 0.43359375,
                    0.45703125, 0.47070312, 0.39843750, 0.32617188, 0.37304688,
                    0.49023438, 0.50390625, 0.27148438, 0.46679688, 0.37695312,
                    0.49023438, 0.47265625, 0.58593750, 0.64453125, 0.60156250,
                    0.38476562, 0.62109375],
                    [0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ]],

                    [[0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ],
                    [0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ]],

                    [[0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ],
                    [0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ]]]])
            >>> # doctest: -SKIP


    To convert FlashMask's `startend_row_indices` to `dense_mask`, use the code below:

    .. code-block:: python

        >>> import paddle
        >>> import numpy as np
        >>> def flashmask_to_densemask(startend_row_indices, dtype, causal=True):
        ...     if startend_row_indices is None:
        ...         return None
        ...     bz, num_head, seq_len, bound_num = startend_row_indices.shape
        ...     m = paddle.zeros((bz, num_head, seq_len, seq_len), dtype=dtype)
        ...     has_end = (causal and bound_num == 2) or ((not causal) and bound_num == 4)
        ...     for bi in range(bz):
        ...         for hi in range(num_head):
        ...             for j in range(seq_len):
        ...                 downstart = startend_row_indices[bi, hi, j, 0]
        ...                 if has_end:
        ...                     downend = startend_row_indices[bi, hi, j, 1]
        ...                     m[bi, hi, downstart:downend, j] = -np.inf
        ...                 else:
        ...                     m[bi, hi, downstart:, j] = -np.inf
        ...                 if causal:
        ...                     m[bi, hi, :j, j] = -np.inf
        ...                 else:
        ...                     if has_end:
        ...                         upstart = startend_row_indices[bi, hi, j, 2]
        ...                         upend = startend_row_indices[bi, hi, j, 3]
        ...                         m[bi, hi, upstart:upend, j] = -np.inf
        ...                     else:
        ...                         upend = startend_row_indices[bi, hi, j, 1]
        ...                         m[bi, hi, :upend, j] = -np.inf
        ...     return m

    For `Causal Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> startend_row_indices = paddle.to_tensor([8]*10, dtype="int32").reshape([1, 1, 10, 1])
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 1], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[8],
                [8],
                [8],
                [8],
                [8],
                [8],
                [8],
                [8],
                [8],
                [8]]]])
        >>> # doctest: -SKIP


    For `Sliding Window Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [0, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [0, 0, 1, 1, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
          [0, 0, 0, 0, 0, 0, 1, 1, 1, 0],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> startend_row_indices = paddle.to_tensor([3, 4, 5, 6, 7, 8, 9, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 1], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[3 ],
                [4 ],
                [5 ],
                [6 ],
                [7 ],
                [8 ],
                [9 ],
                [10],
                [10],
                [10]]]])
        >>> # doctest: -SKIP

    For `Causal Document Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> startend_row_indices = paddle.to_tensor([4, 4, 4, 4, 7, 7, 7, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 1], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[4 ],
                [4 ],
                [4 ],
                [4 ],
                [7 ],
                [7 ],
                [7 ],
                [10],
                [10],
                [10]]]])
        >>> # doctest: -SKIP

    For `Document Mask`, where `causal=False`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> LTS = paddle.to_tensor([4, 4, 4, 4, 7, 7, 7, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> UTE = paddle.to_tensor([0, 0, 0, 0, 4, 4, 4, 7, 7, 7], dtype="int32").reshape([1, 1, 10, 1])
        >>> startend_row_indices = paddle.concat([LTS, UTE], axis=-1)
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[4 , 0 ],
                [4 , 0 ],
                [4 , 0 ],
                [4 , 0 ],
                [7 , 4 ],
                [7 , 4 ],
                [7 , 4 ],
                [10, 7 ],
                [10, 7 ],
                [10, 7 ]]]])
        >>> # doctest: -SKIP

    For `Share Question Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 1, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 1, 1, 0],
          [1, 1, 1, 1, 0, 0, 0, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> startend_row_indices = paddle.to_tensor([10, 10, 10, 10, 7, 7, 7, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 1], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[10],
                [10],
                [10],
                [10],
                [7 ],
                [7 ],
                [7 ],
                [10],
                [10],
                [10]]]])
        >>> # doctest: -SKIP

    For `Global + Sliding Window Mask`, where `causal=False`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

        >>> # doctest: +SKIP('Only example')

       [[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 0, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 0, 0, 1, 1, 1, 0, 0, 0],
          [1, 1, 0, 0, 0, 1, 1, 1, 0, 0],
          [1, 1, 0, 0, 0, 0, 1, 1, 1, 0],
          [1, 1, 0, 0, 0, 0, 0, 1, 1, 1],
          [1, 1, 0, 0, 0, 0, 0, 0, 1, 1]]]])

        >>> import paddle
        >>> LTS = paddle.to_tensor([10, 10, 4, 5, 6, 7, 8, 9, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> LTE = paddle.to_tensor([10, 10, 10, 10, 10, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> UTS = paddle.to_tensor([0, 0, 0, 0, 2, 2, 2, 2, 2, 2], dtype="int32").reshape([1, 1, 10, 1])
        >>> UTE = paddle.to_tensor([0, 0, 0, 0, 3, 4, 5, 6, 7, 8], dtype="int32").reshape([1, 1, 10, 1])
        >>> startend_row_indices = paddle.concat([LTS, LTE, UTS, UTE], axis=-1)
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 4], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[10, 10, 0 , 0 ],
                [10, 10, 0 , 0 ],
                [4 , 10, 0 , 0 ],
                [5 , 10, 0 , 0 ],
                [6 , 10, 2 , 3 ],
                [7 , 10, 2 , 4 ],
                [8 , 10, 2 , 5 ],
                [9 , 10, 2 , 6 ],
                [10, 10, 2 , 7 ],
                [10, 10, 2 , 8 ]]]])
        >>> # doctest: -SKIP

    For `Causal Blockwise Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> LTS = paddle.to_tensor([4, 4, 4, 4, 10, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> LTE = paddle.to_tensor([7, 7, 7, 7, 10, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> startend_row_indices = paddle.concat([LTS, LTE], axis=-1)
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[4 , 7 ],
                [4 , 7 ],
                [4 , 7 ],
                [4 , 7 ],
                [10, 10],
                [10, 10],
                [10, 10],
                [10, 10],
                [10, 10],
                [10, 10]]]])
        >>> # doctest: -SKIP

    For `Prefix LM Document Mask`, where `causal=False`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
          [0, 0, 0, 0, 0, 1, 1, 1, 1, 0],
          [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> LTS = paddle.to_tensor([3, 3, 3, 5, 5, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> UTE = paddle.to_tensor([0, 0, 2, 3, 3, 5, 5, 7, 8, 9], dtype="int32").reshape([1, 1, 10, 1])
        >>> startend_row_indices = paddle.concat([LTS, UTE], axis=-1)
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[3 , 0 ],
                [3 , 0 ],
                [3 , 2 ],
                [5 , 3 ],
                [5 , 3 ],
                [10, 5 ],
                [10, 5 ],
                [10, 7 ],
                [10, 8 ],
                [10, 9 ]]]])
        >>> # doctest: -SKIP

    For `Prefix LM Causal Mask`, where `causal=False`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> LTS = paddle.to_tensor([10, 10, 10, 10, 10, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> UTE = paddle.to_tensor([0, 0, 0, 0, 0, 5, 6, 7, 8, 9], dtype="int32").reshape([1, 1, 10, 1])
        >>> startend_row_indices = paddle.concat([LTS, UTE], axis=-1)
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[10, 0 ],
                [10, 0 ],
                [10, 0 ],
                [10, 0 ],
                [10, 0 ],
                [10, 5 ],
                [10, 6 ],
                [10, 7 ],
                [10, 8 ],
                [10, 9 ]]]])

    For `QK-sparse Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> LTS = paddle.to_tensor([10, 10, 2, 3, 4, 5, 6, 7, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> LTE = paddle.to_tensor([10, 10, 5, 5, 5, 5, 8, 8, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> startend_row_indices = paddle.concat([LTS, LTE], axis=-1)
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[10, 10],
                [10, 10],
                [2 , 5 ],
                [3 , 5 ],
                [4 , 5 ],
                [5 , 5 ],
                [6 , 8 ],
                [7 , 8 ],
                [10, 10],
                [10, 10]]]])

        >>> # doctest: -SKIP
    Nr"   r   z/can't use window_size with startend_row_indicesint32)r   )maxrG   )minr   z<must provide startend_row_indices when using block_mask_attnFz9startend_row_indices.dtype must be paddle.int32, but got    z,startend_row_indices rank must be 4,but got zCstartend_row_indices.shape[0] must be equal to batch_size, but got z and zAstartend_row_indices.shape[2] must be equal to seqlen_k, but got zXstartend_row_indices head_num must be equal to 1(broadcast) or head_num_q or head_num_k.z/block_mask.dtype must be paddle.int32, but got z9block_mask.shape[0] must be equal to batch_size, but got zLblock_mask.shape[1] must be equal to startend_row_indices.shape[1], but got       z1block_size must be 128 when using block_mask_attnrH   z.headdim must be 128 when using block_mask_attnrF   TzoInvalid shape of startend_row_indices, when causal is True, the last dimension should be either 1 or 2 but got zpInvalid shape of startend_row_indices, when causal is False, the last dimension should be either 2 or 4 but got rL   r{   z4 blockmask attention no supports deterministic now .r|   z^flashmask_attention does not support setting softmax_scale, use flashmask_attention_v2 insteadz0 blockmask attention only supports sm >= 90 now.rD   z/flashmask_attention_v2 does not support dropoutz:flashmask_attention_v2 does not support return seed_offsetz;flashmask_attention_v2 does not support setting seed_offsetrx   z8flashmask_attention_v2 does not support setting rng_namezAflashmask_attention_v2 does not support setting training to Falsez4flashmask_attention_v2 does not support setting namerI   r}   )
isinstancer`   rR   r$   aranger   cliprepeat_interleaveemptyr   rd   r   r   lenr   rW   r   r   r   flashmask_attentionflashmask_attention_v2)r+   r,   r-   r   rZ   r0   r   r   r   rm   rn   r3   ro   rp   r   sqbszr_   r   result_softmax_lseresult_seed_offsethas_endr   r   s                           r   r   r     s   @ k3'' 	5&4K[^k!n#++= ,++  	(#)=A"BQ$7!$;7$ $ $gq!Rm$$ ! $*;$"$ $ $Q'' ! 
 $*<Ar1W#M#M#M /5}A"BQ$7!$;70 0 0 Aqqq!, 06}Qk!n!4G0 0 0 Aqqq!, $*;$!$ $ $Q'' ! #//J 0// # L
 
	
 $)V\999dH\Hbdd :99 '-..!333W;O;UWW 433 $)!,	!<<< ERfRlmnRo  E  Evyv  AB  wC  E  E =<< $)!,	!<<< CPdPjklPm  C  Ctwt}~  uA  C  C =<< $)!,KNIaL1
 
 
 

 g
 
 
 !#v|333T*BRTT 433 #A&#)A,666tJL\]^L_ttfifopqfrtt 766 #A&*>*DQ*GGGG H_i_opq_r  H  Hy|  zC  DE  zF  H  H HGG #A&5;q>C+?C*GGGGC HGG #A&39Q<#+=#*EEEEC FEE 9Q<3&&&@ '&&  	#)"-22%+B/144  g  G[  Ga  bd  Ge  g  g   $)"-22%+B/144  h  H\  Hb  ce  Hf  h  h  
 *,,,, "=!>??+ -
 %%F &%% F%''''JJK!++-G,HII*   %//1L0MNN+	 A$$JJ.88+, (*J ?? ((p )(( %%B &%% *$! """ 1__c>>>A ">> *  L ) %,,M -,, r>>>J ">>   S 8 <<F  << $ %BD 9
 -$ "" MMMNNNeG (&'' (&''
7||qqzr*   paddle.Tensorr   c                r   | j         r|j         s
J d            t                      rt          j        | ||          }|S t	          di t                      }|                    t          j                  }|                    t          j                  }| ||d}d|i}|	                    d||           |S )aj
  
    The equation is:

    .. math::

        result=reduce\_sum(softmax(\frac{ Q * K^T }{\sqrt{d}}), dim=-2)

    Warning:
        This API only supports inputs with dtype float16 and bfloat16.

    Args:
        query(Tensor): The query tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seqlen_q, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        key(Tensor): The key tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seqlen_k, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        softmax_lse(Tensor): The logsumexp of each row returned by _C_ops.flash_attn().
                        3-D tensor with shape:
                        [batch_size, num_heads, seqlen_q_rounded], where seqlen_q_rounded = ceil(seqlen_q/128).
                        The dtype is float32.
    Returns:
        reduced_attention_scores(Tensor), The reduce sum of attention scores across seqlen_q.
        4-D tensor with shape: [batch_size, num_heads, 1, seqlen_k]. The dtype is float32.
    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('reduce_attn_scores need A100 compile')
            >>> import paddle
            >>> import numpy as np
            >>> import paddle._C_ops as _C_ops
            >>> from paddle.nn.functional.flash_attention import (
            >>>     calc_reduced_attention_scores
            >>> )
            >>> np.random.seed(2024)
            >>> q_shape = (5,1024,16,128)
            >>> k_shape = (5,2048,16,128)
            >>> dtype = 'float16'
            >>> query = np.random.random(q_shape)
            >>> key = np.random.random(k_shape)
            >>> q = paddle.to_tensor(
            >>>     query, place=place, dtype=dtype, stop_gradient=True
            >>> )
            >>> k = paddle.to_tensor(
            >>>     key, place=place, dtype=dtype, stop_gradient=True
            >>> )
            >>> _, _, softmax_lse, _ = _C_ops.flash_attn(
            >>>     q,
            >>>     k,
            >>>     k,
            >>>     (None,), #fixed_seed_offset
            >>>     None, #attn_mask
            >>>     0.0, #dropout
            >>>     False, #causal
            >>>     False, #return_softmax
            >>>     False, #is_test
            >>>     "" #rng_name
            >>> )
            >>> reduced_attn_scores = calc_reduced_attention_scores(
            >>>     q,
            >>>     k,
            >>>     softmax_lse,
            >>> )
            >>> # doctest: -SKIP
    z6calc_reduced_attention_scores() is for inference only.calc_reduced_attn_scores)r~   r   r   reduced_scores)r   r   r   )r   )
r&   r   r   r   r	   r   r   r$   r   r   )r+   r,   r   r   r   rV   r   r   s           r   calc_reduced_attention_scoresr   -  s    L  3#4  @ 4  83
 
 @@vxx@@F>>v~NNN77GGG" F 	.G '    
 r*   )FTT)r   r   r   r   r   r   r   r   )r    r   r   r   ).....)r+   r   r,   r   r-   r   r(   r   r.   r/   r0   r   r1   r2   r3   r   r4   r5   r   r6   )r+   r   r,   r   r-   r   r(   r   r.   r/   r0   r   r1   r=   r3   r   r4   r5   r   r>   )r+   r   r,   r   r-   r   r(   r   r.   r/   r0   r   r1   r   r3   r   r4   r5   r   rA   )NrD   FFTN)r[   r`   r   ra   )...)r+   r   r,   r   r-   r   rZ   r/   r0   r   r1   r2   rm   rq   rn   ra   r3   r   ro   rr   rp   r5   r   r6   )r+   r   r,   r   r-   r   rZ   r/   r0   r   r1   r=   rm   rq   rn   ra   r3   r   ro   rr   rp   r5   r   r>   )r+   r   r,   r   r-   r   rZ   r/   r0   r   r1   r   rm   rq   rn   ra   r3   r   ro   rr   rp   r5   r   rA   )rD   FF)r   r   rZ   r/   r0   r   r1   r2   rm   rq   rn   ra   r3   r   ro   rr   r   r6   )r   r   rZ   r/   r0   r   r1   r=   rm   rq   rn   ra   r3   r   ro   rr   r   r>   )r   r   rZ   r/   r0   r   r1   r   rm   rq   rn   ra   r3   r   ro   rr   r   rA   ).......) r+   r   r,   r   r-   r   r   r   r   r   r   r`   r   r`   r4   r/   rZ   r/   r0   r   r1   r2   rm   rq   rn   ra   r3   r   ro   rr   r   r6   ) r+   r   r,   r   r-   r   r   r   r   r   r   r`   r   r`   r4   r/   rZ   r/   r0   r   r1   r=   rm   rq   rn   ra   r3   r   ro   rr   r   r>   ) r+   r   r,   r   r-   r   r   r   r   r   r   r`   r   r`   r4   r/   rZ   r/   r0   r   r1   r   rm   rq   rn   ra   r3   r   ro   rr   r   rA   )rD   FFNrx   TN)NNNFNNNNr   rD   r"   Nr   )........)r   r   r   r   r   r   r   r`   r   r`   r4   r/   rZ   r/   r0   r   r1   r2   rm   rq   rn   ra   r   r   r3   r   ro   rr   r   r6   )r   r   r   r   r   r   r   r`   r   r`   r4   r/   rZ   r/   r0   r   r1   r=   rm   rq   rn   ra   r   r   r3   r   ro   rr   r   r>   )r   r   r   r   r   r   r   r`   r   r`   r4   r/   rZ   r/   r0   r   r1   r   rm   rq   rn   ra   r   r   r3   r   ro   rr   r   rA   )rD   FFNrx   TTNr8   )r+   r   r,   r   r-   r   r   rq   rZ   r/   r0   r   r   r   r   r   r   r   rm   rq   rn   ra   r3   r   ro   rr   rp   r5   r   rq   )r+   r   r,   r   r   r   r   r   )&
__future__r   typingr   r   r   r$   paddle.nn.functionalnn
functionalrU   r   paddle.base.frameworkr   paddle.base.layer_helperr	   paddle.base.wrapped_decoratorr
   paddle.nn.attention.sdpar   r   r   collections.abcr   r   r   r)   r;   rf   rl   ru   r   r   r   r   r   r   r   r9   r*   r   <module>r      s=   # " " " " " 3 3 3 3 3 3 3 3 3 3                          8 8 8 8 8 8 0 0 0 0 0 0 G G G G G G           )))))) !%    8    
 %(
 
 
 
 

 
 $'
  
  
  
  

  
 
' 
' 
' 
' 

'" 

34 34 34 34l      B 

 %( (+"%     
  

 $'  (+"%          
   

 ' (+"%' ' ' ' ' 
'( \ 	\ \ \ \ \~ 
 %(	
 (+
 
 
 
 
 

 
 $'	
  (+
  
  
  
  
  

  
 	
' (+
' 
' 
' 
' 
' 

' 	U 	U U U U Up 
 %('*    
& 
 $''*        
 & 
 '*' ' ' ' 
'8 	H4 H4 H4 H4f )+ + + +l )u u u up 
 %('*    
$ 
 $''*        
 $ 
 '*' ' ' ' 
'2 	}4 }4 }4 }4H +/	^ &*$$'+"& $!^ ^ ^ ^ ^ ^B` ` ` ` ` `r*   