from __future__ import annotations

from dataclasses import dataclass
from functools import cached_property, lru_cache
from typing import TYPE_CHECKING

import paddle
from paddle import _C_ops
from paddle.base.log_helper import get_logger
from paddle.nn.attention.sdpa import (
    SDPBackend,
    _get_backend_priority,
    _get_enabled_backends,
)
from paddle.nn.functional.flash_attention import _math_attention

_logger = get_logger(
    __name__, 'INFO', fmt='%(asctime)s-%(levelname)s: %(message)s'
)

if TYPE_CHECKING:
    from paddle import Tensor, dtype
    from paddle.base.core import Place

_config = {}


def init_config():
    global _config
    _config = {
        'flash_attn': {
            'MINIMUM_SM_VERSION': (8, 0),
            'MAXIMUM_SM_VERSION': (12, 1),
            'support_dtypes': (
                (paddle.float16, paddle.bfloat16)
                if paddle.device.is_bf16_supported(including_emulation=False)
                else (paddle.float16,)
            ),
        },
        'mem_efficient_attn': {
            'MINIMUM_SM_VERSION': (5, 0),
            'MAXIMUM_SM_VERSION': (12, 1),
            'support_dtypes': (
                (paddle.float16, paddle.bfloat16, paddle.float32)
                if paddle.device.is_bf16_supported(including_emulation=False)
                else (paddle.float16, paddle.float32)
            ),
        },
    }


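# Illustrative shape walk-through for the helper below (values assumed, not
# from the original source): with key/value of shape [2, 128, 4, 64] and
# num_repeats=3, _repeat_kv returns tensors of shape [2, 128, 12, 64]. Each
# of the 4 KV heads is tiled 3 times so that backends without native GQA
# support see matching query and key/value head counts.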
def _repeat_kv(key: Tensor, value: Tensor, num_repeats: int):
    """
    Repeat key and value tensors along the num_heads(3) dimension. The layout
    of key and value should be [batch_size, seq_len, num_heads, head_dim].
    """
    if num_repeats == 1:
        return key, value
    key, value = key.unsqueeze(3), value.unsqueeze(3)
    key, value = (
        key.expand([-1, -1, -1, num_repeats, -1]),
        value.expand([-1, -1, -1, num_repeats, -1]),
    )
    key, value = (
        key.flatten(2, 3).contiguous(),
        value.flatten(2, 3).contiguous(),
    )
    return key, value


@dataclass
class SDPParams:
    query_shape: paddle.Size
    key_shape: paddle.Size
    value_shape: paddle.Size
    attn_mask_shape: paddle.Size | None
    dropout: float
    is_causal: bool
    scale: float | None
    query_stop_gradient: bool
    dtype: tuple[dtype, dtype, dtype]
    place: tuple[Place, Place, Place]

    @cached_property
    def batch_size(self) -> tuple[int, int, int]:
        return self.query_shape[0], self.key_shape[0], self.value_shape[0]

    @cached_property
    def seq_len(self) -> tuple[int, int, int]:
        return self.query_shape[1], self.key_shape[1], self.value_shape[1]

    @cached_property
    def num_heads(self) -> tuple[int, int, int]:
        return self.query_shape[2], self.key_shape[2], self.value_shape[2]

    @cached_property
    def head_dim(self) -> tuple[int, int, int]:
        return self.query_shape[-1], self.key_shape[-1], self.value_shape[-1]

    @cached_property
    def device_id(self) -> tuple[int, ...]:
        ret = tuple(
            pl.gpu_device_id() if pl.is_gpu_place() else -1
            for pl in self.place
        )
        return ret


@lru_cache(maxsize=None)
def get_device_capability(device_id: int = 0) -> tuple[int, int]:
    if device_id < 0:
        return (0, 0)
    return paddle.device.cuda.get_device_capability(device_id)


@lru_cache(maxsize=None)
def check_sm_version(
    min_sm: tuple[int, int], max_sm: tuple[int, int], device_id: int = 0
) -> bool:
    major, minor = get_device_capability(device_id)
    current = (major, minor)
    return min_sm <= current <= max_sm


@lru_cache(maxsize=None)
def check_cuda_is_available() -> bool:
    return (
        paddle.is_compiled_with_cuda()
        and paddle.device.cuda.device_count() > 0
    )


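# For reference (illustrative values, not from the original source):
# get_device_capability maps a GPU ordinal to its SM version, e.g. (8, 0) on
# an A100 or (9, 0) on an H100. The (0, 0) fallback for negative ids keeps
# the SM-version checks below well-defined on builds without a visible GPU.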
def check_all_tensors_on_device(params: SDPParams) -> bool:
    """
    Check all input tensors are placed on the GPU device.
    """
    if (
        not params.place[0].is_gpu_place()
        and not params.place[0].is_custom_place()
    ):
        _logger.debug(
            "All input tensors should be placed on GPU or custom place, "
            f"but query place: {params.place[0]}, "
            f"key place: {params.place[1]}, "
            f"value place: {params.place[2]}"
        )
        return False
    return True


def check_head_dim_size_flash(params: SDPParams) -> bool:
    """
    Check the dimension of head in query, key, and value should be equal and all less than 256.
    """
    q_head_dim, k_head_dim, v_head_dim = params.head_dim
    if (
        q_head_dim > 256
        or q_head_dim != k_head_dim
        or k_head_dim != v_head_dim
    ):
        _logger.debug(
            "The dimension of head in query, key, and value should be equal "
            f"and all less than 256, but q_head_dim: {q_head_dim}, "
            f"k_head_dim: {k_head_dim}, v_head_dim: {v_head_dim}"
        )
        return False
    if q_head_dim % 8 != 0:
        _logger.debug(
            "The dimension of head in query, key, and value should be a "
            f"multiple of 8, but q_head_dim: {q_head_dim}"
        )
        return False
    return True


def check_flash_attention_hardware_support(device_id: int) -> bool:
    """
    Check flash attention requires CUDA support and SM between 8.0 and 12.1.
    """
    if SDPBackend.FLASH_ATTENTION and paddle.is_compiled_with_custom_device(
        paddle.device.get_all_device_type()[-1]
    ):
        # Custom devices ship their own flash-attention kernels, so the CUDA
        # SM checks below do not apply to them.
        return True
    if not check_cuda_is_available():
        _logger.debug("Flash attention requires CUDA support.")
        return False
    if not check_sm_version(
        _config['flash_attn']['MINIMUM_SM_VERSION'],
        _config['flash_attn']['MAXIMUM_SM_VERSION'],
        device_id,
    ):
        _logger.debug(
            "Flash attention requires SM between "
            f"{_config['flash_attn']['MINIMUM_SM_VERSION']} and "
            f"{_config['flash_attn']['MAXIMUM_SM_VERSION']}, "
            f"but found SM {get_device_capability(device_id)}"
        )
        return False
    return True


def check_flash_causal_non_square_seqlens(params: SDPParams) -> bool:
    """
    Check flash attention only supports causal attention when the sequence length of query and key are equal.
    """
    if not params.is_causal:
        return True
    q_len, k_len, _ = params.seq_len
    if q_len == k_len:
        return True
    _logger.debug(
        "Flash attention only supports causal attention when the sequence "
        f"length of query and key are equal, but got query shape: "
        f"{params.query_shape}, key shape: {params.key_shape}"
    )
    return False


def check_dtypes_low_precision_fa(params: SDPParams) -> bool:
    """
    check QKV share the same dtype and are supported dtype.
    """
    q_dtype, k_dtype, v_dtype = params.dtype
    if (
        q_dtype != k_dtype
        or k_dtype != v_dtype
        or q_dtype not in _config['flash_attn']['support_dtypes']
    ):
        _logger.debug(
            "Flash attention requires query, key, and value to be of the "
            f"same dtype and support dtype, but got query dtype: {q_dtype}, "
            f"key dtype: {k_dtype}, value dtype: {v_dtype}. "
            f"Supported dtypes are: {_config['flash_attn']['support_dtypes']}"
        )
        return False
    return True


def check_dtypes_low_precision_mem_efficient_attn(params: SDPParams) -> bool:
    """
    check QKV share the same dtype and are supported dtype.
    """
    q_dtype, k_dtype, v_dtype = params.dtype
    if (
        q_dtype != k_dtype
        or k_dtype != v_dtype
        or q_dtype not in _config['mem_efficient_attn']['support_dtypes']
    ):
        _logger.debug(
            "Mem_efficient_attn requires query, key, and value to be of the "
            f"same dtype and support dtype, but got query dtype: {q_dtype}, "
            f"key dtype: {k_dtype}, value dtype: {v_dtype}. "
            f"Supported dtypes are: "
            f"{_config['mem_efficient_attn']['support_dtypes']}"
        )
        return False
    return True


@lru_cache(maxsize=None)
def use_tensor_cores(is_half: bool, device_id: int) -> bool:
    major, _ = get_device_capability(device_id)
    if major >= 8:
        return True
    if major == 7:
        return is_half
    return False


@lru_cache(maxsize=None)
def minimum_gemm_alignment(dtype: dtype, device_id: int) -> int:
    is_half = dtype in (paddle.float16, paddle.bfloat16)
    use_tc = use_tensor_cores(is_half, device_id)
    major, _ = get_device_capability(device_id)
    matmul_alignment_mn = 4 if major >= 8 else 1
    bits_per_scalar = 16 if is_half else 32
    if use_tc:
        matmul_alignment_mn = max(matmul_alignment_mn, 128 // bits_per_scalar)
    return matmul_alignment_mn


def check_mem_efficient_hardware_support(device_id: int) -> bool:
    """
    Check mem_efficient attention requires CUDA support and SM between 5.0 and 12.1.
    """
    if not check_cuda_is_available():
        _logger.debug("Mem efficient attention requires CUDA support.")
        return False
    if not check_sm_version(
        _config['mem_efficient_attn']['MINIMUM_SM_VERSION'],
        _config['mem_efficient_attn']['MAXIMUM_SM_VERSION'],
        device_id,
    ):
        _logger.debug(
            "Mem efficient attention requires SM between "
            f"{_config['mem_efficient_attn']['MINIMUM_SM_VERSION']} and "
            f"{_config['mem_efficient_attn']['MAXIMUM_SM_VERSION']}, "
            f"but found SM {get_device_capability(device_id)}"
        )
        return False
    return True


def check_head_dim_size_mem_efficient(params: SDPParams) -> bool:
    q_head_dim, k_head_dim, v_head_dim = (
        params.query_shape[-1],
        params.key_shape[-1],
        params.value_shape[-1],
    )
    alignment = minimum_gemm_alignment(params.dtype[0], params.device_id[0])
    if (
        q_head_dim % alignment != 0
        or k_head_dim % alignment != 0
        or v_head_dim % alignment != 0
    ):
        _logger.debug(
            "Mem efficient attention requires head dim size aligned to "
            f"{alignment}, but found q_head_dim: {q_head_dim}, "
            f"k_head_dim: {k_head_dim}, v_head_dim: {v_head_dim}"
        )
        return False
    return True


def check_attn_mask_alignment(params: SDPParams) -> bool:
    if params.is_causal:
        return True
    if params.attn_mask_shape is None:
        return True
    last_dim = params.attn_mask_shape[-1]
    if last_dim % 8 != 0:
        _logger.debug(
            "Mem efficient attention requires attn_mask last dimension to be "
            f"divisible by 8 to satisfy vector alignment, but got {last_dim}. "
            "Falling back to other backends."
        )
        return False
    return True


def check_scale_is_None(params: SDPParams) -> bool:
    if params.scale is None:
        return True
    _logger.debug("Paddle's FAV2 does not support scale parameter.")
    return False


def can_use_flash_attention(params: SDPParams) -> bool:
    general_constraints = [
        check_all_tensors_on_device,
        check_head_dim_size_flash,
        check_flash_causal_non_square_seqlens,
        check_dtypes_low_precision_fa,
        check_scale_is_None,
    ]
    for constraint in general_constraints:
        if not constraint(params):
            return False
    if not check_flash_attention_hardware_support(params.device_id[0]):
        return False
    return True


def can_use_mem_efficient_attention(params: SDPParams) -> bool:
    constraints = [
        check_all_tensors_on_device,
        check_head_dim_size_mem_efficient,
        check_attn_mask_alignment,
        check_dtypes_low_precision_mem_efficient_attn,
    ]
    for constraint in constraints:
        if not constraint(params):
            return False
    if not check_mem_efficient_hardware_support(params.device_id[0]):
        return False
    return True


def select_sdp_for_sdpa(param: SDPParams) -> str:
    place = paddle.get_device()
    if 'xpu' in place:
        return 'flash_attn'
    enabled_backends = _get_enabled_backends()
    priority_order = _get_backend_priority()
    for backend in priority_order:
        if backend not in enabled_backends:
            continue
        if backend == SDPBackend.FLASH_ATTENTION:
            if can_use_flash_attention(param):
                return 'flash_attn'
        elif backend == SDPBackend.EFFICIENT_ATTENTION:
            if can_use_mem_efficient_attention(param):
                return 'mem_efficient'
        elif backend == SDPBackend.MATH:
            return 'math'
    raise RuntimeError(
        "No available backend for scaled_dot_product_attention was found."
    )


def scaled_dot_product_attention(
    query: Tensor,
    key: Tensor,
    value: Tensor,
    attn_mask: Tensor | None = None,
    dropout_p: float = 0.0,
    is_causal: bool = False,
    training: bool = True,
    backend: str | None = None,
    scale: float | None = None,
    enable_gqa: bool = True,
    name: str | None = None,
) -> Tensor:
    r"""
The equation is:

.. math::

        result = softmax(\frac{QK^{T}}{\sqrt{d}})V

    where ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
The dimensions of the three parameters are the same.
``d`` represents the size of the last dimension of the three parameters.

Warning:
        This API only verifies inputs with dtype float16 and bfloat16; other dtypes may fall back
            to the math implementation, which is less optimized.

Warning:
        If is_causal is set to True, attn_mask should not be provided; otherwise
            the provided mask will be ignored.

Note:
    This API differs from :ref:`api_paddle_compat_nn_functional_scaled_dot_product_attention` in that:
        1. The QKV layout of this API is [batch_size, seq_len, num_heads, head_dim] or [seq_len, num_heads, head_dim].
    If you need num_heads before seq_len layout, please use ``paddle.compat.nn.functional.scaled_dot_product_attention``.

Args:
    query(Tensor): The query tensor in the Attention module.
                    4-D tensor with shape:
                    [batch_size, seq_len_query, num_heads, head_dim].
                    3-D tensor with shape:
                    [seq_len_query, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    key(Tensor): The key tensor in the Attention module.
                    4-D tensor with shape:
                    [batch_size, seq_len_key, num_heads, head_dim].
                    3-D tensor with shape:
                    [seq_len_key, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    value(Tensor): The value tensor in the Attention module.
                    4-D tensor with shape:
                    [batch_size, seq_len_value, num_heads, head_dim].
                    3-D tensor with shape:
                    [seq_len_value, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
    attn_mask(Tensor, optional): The attention mask tensor. The shape should be broadcastable to
                    [batch_size, num_heads, seq_len_query, seq_len_key]. The dtype can be bool
                    or the same dtype as query. A bool mask marks the positions that take part
                    in attention; a non-bool mask is added to the attention score.
    dropout_p(float, optional): The dropout ratio.
    is_causal(bool, optional): Whether to enable causal mode.
    training(bool, optional): Whether it is in the training phase.
    backend(str, optional): Specify which backend to use to compute scaled dot product attention.
                    Currently only "p2p" is supported, for distributed usage.
    scale(float, optional): The scaling factor used in the calculation of attention weights.
                    If None, scale = 1 / sqrt(head_dim).
    enable_gqa(bool, optional): Whether to enable GQA (Grouped Query Attention) mode. Default is True.
    name(str|None, optional): The default value is None. Normally there is no need for user
                    to set this property. For more information, please refer to
                    :ref:`api_guide_Name`.

Returns:
    out(Tensor): The attention tensor.
                4-D tensor with shape: [batch_size, seq_len, num_heads, head_dim].
                3-D tensor with shape: [seq_len, num_heads, head_dim].
                The dtype can be float16 or bfloat16.

Examples:
    .. code-block:: python

        >>> # doctest: +SKIP('bfloat need V100 compile')
        >>> import paddle
        >>> q = paddle.rand((1, 128, 2, 16), dtype=paddle.bfloat16)
        >>> output = paddle.nn.functional.scaled_dot_product_attention(q, q, q, None, 0.9, False)
        >>> print(output)
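        >>> # A GQA (grouped-query) call, illustrative: key/value may carry
        >>> # fewer heads than query when enable_gqa is True, as long as the
        >>> # query head count is divisible by the key/value head count.
        >>> q = paddle.rand((1, 128, 8, 16), dtype=paddle.bfloat16)
        >>> kv = paddle.rand((1, 128, 2, 16), dtype=paddle.bfloat16)
        >>> out = paddle.nn.functional.scaled_dot_product_attention(
        ...     q, kv, kv, None, 0.0, False, enable_gqa=True
        ... )
        >>> print(out.shape)
        [1, 128, 8, 16]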
        >>> # doctest: -SKIP
    """
    is_batched = query.dim() == 4
    if not is_batched:
        query = query.unsqueeze(0)
        key = key.unsqueeze(0)
        value = value.unsqueeze(0)

    k_heads, q_heads, v_heads = key.shape[2], query.shape[2], value.shape[2]
    if enable_gqa:
        assert k_heads == 1 or q_heads % k_heads == 0, (
            f"The number of groups in query({q_heads}) must be divisible by "
            f"the number of groups in key({k_heads}) if GQA enabled."
        )
        assert k_heads == v_heads, (
            f"The number of groups in key({k_heads}) must be equal to the "
            f"number of groups in value({v_heads}) if GQA enabled."
        )
    else:
        assert q_heads == k_heads == v_heads, (
            f"The number of groups in query({q_heads}) must be equal to the "
            f"number of groups in key({k_heads}) and the number of groups in "
            f"value({v_heads}) if GQA disabled."
        )

    bs, seq_len_q, num_heads_q, head_dim_q = query.shape
    _, seq_len_k, num_heads_k, head_dim_k = key.shape

    if backend == 'p2p':
        if query.is_dist() and key.is_dist() and value.is_dist():
            assert (
                scale is None
            ), f"Backend {backend} not support scale parameter."
            out = paddle.distributed.auto_parallel.ring_attention.RingFlashAttention.apply(
                query, key, value, attn_mask, dropout_p, is_causal
            )
            return out

    if not paddle.in_dynamic_mode():
        qkv_place = (paddle.framework._current_expected_place(),) * 3
    else:
        qkv_place = (query.place, key.place, value.place)

    params = SDPParams(
        query_shape=query.shape,
        key_shape=key.shape,
        value_shape=value.shape,
        attn_mask_shape=attn_mask.shape if attn_mask is not None else None,
        dropout=dropout_p,
        is_causal=is_causal,
        scale=scale,
        query_stop_gradient=query.stop_gradient,
        dtype=(query.dtype, key.dtype, value.dtype),
        place=qkv_place,
    )

    if len(_config) == 0:
        init_config()

    is_zero_size = (
        query.numel() == 0 or key.numel() == 0 or value.numel() == 0
    )

    if attn_mask is not None and attn_mask.dtype == paddle.bool:
        attn_mask = paddle.where(
            attn_mask,
            paddle.to_tensor(0.0, dtype=query.dtype),
            paddle.to_tensor(-float('inf'), dtype=query.dtype),
        )

    if is_zero_size:
        sdp_func_name = 'math'
    else:
        sdp_func_name = select_sdp_for_sdpa(params)
    _logger.debug(f'Selected backend: {sdp_func_name}')

    if sdp_func_name == 'flash_attn':
        fixed_seed_offset = None
        return_softmax = False
        rng_name = ""
        if attn_mask is not None:
            if attn_mask.ndim == 3:
                attn_mask = attn_mask.expand([bs, *attn_mask.shape])
            elif attn_mask.ndim == 2:
                attn_mask = paddle.unsqueeze(attn_mask, axis=[0, 1])
        out, _, _, _ = _C_ops.flash_attn(
            query,
            key,
            value,
            fixed_seed_offset,
            attn_mask,
            dropout_p,
            is_causal,
            return_softmax,
            not training,
            rng_name,
        )
    elif sdp_func_name == 'mem_efficient':
        from paddle.incubate.nn.memory_efficient_attention import (
            LowerTriangularMask,
            memory_efficient_attention,
        )

        repeats = num_heads_q // num_heads_k
        key, value = _repeat_kv(key, value, repeats)
        if is_causal:
            attn_mask = LowerTriangularMask()
        elif (
            attn_mask is not None
            and attn_mask.dim() == 4
            and attn_mask.shape[1] == 1
            and attn_mask.shape[1] != num_heads_q
        ):
            attn_mask = attn_mask.expand(
                [
                    attn_mask.shape[0],
                    num_heads_q,
                    attn_mask.shape[2],
                    attn_mask.shape[3],
                ]
            )
        out = memory_efficient_attention(
            query,
            key,
            value,
            attn_bias=attn_mask,
            p=dropout_p,
            scale=scale,
            training=training,
        )
    elif sdp_func_name == 'math':
        repeats = num_heads_q // num_heads_k if num_heads_k != 0 else 1
        key, value = _repeat_kv(key, value, repeats)
        out = _math_attention(
            query,
            key,
            value,
            dropout_rate=dropout_p,
            causal=is_causal,
            return_softmax=False,
            training=training,
        )[0]
    else:
        raise ValueError(f"Invalid backend {sdp_func_name}")

    if not is_batched:
        out = paddle.squeeze(out, axis=0)
    return out
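

# A minimal manual smoke test (illustrative only; assumes a CUDA device and a
# paddle build with flash attention enabled). It exercises the GQA path end
# to end:
#
#     import paddle
#     import paddle.nn.functional as F
#
#     q = paddle.rand((2, 64, 8, 32), dtype=paddle.float16)
#     kv = paddle.rand((2, 64, 4, 32), dtype=paddle.float16)
#     out = F.scaled_dot_product_attention(q, kv, kv, enable_gqa=True)
#     assert out.shape == [2, 64, 8, 32]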