
    ΑiU                        S SK Jr  S SKJr  S SKrS SKJs  Jr  S SKJr  S SK	J
r
Jr  \(       a  S SKJr  S SKJrJr   " S S	\R                   5      rg)
    )annotations)TYPE_CHECKINGN)nn)XavierNormalXavierUniform)Tensor)	DTypeLike	PlaceLikec                  |  ^  \ rS rSrSr         S                       SU 4S jjjr      SS jr        SS jrSSS jjr        SS jr	            SS jr
                  SS	 jr              SS
 jr     S                 SS jjrSrU =r$ )MultiheadAttention   a	  
Allows the model to jointly attend to information from different representation subspaces.

Multi-Head Attention is defined as:

.. math::
    \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1,\dots,\text{head}_h)W^O

where :math:`\text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.

Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
for more details.

.. note::
    This layer will use the optimized implementation
    :func:`paddle.nn.functional.scaled_dot_product_attention` if no need to return the attention weights.

Parameters:
    embed_dim (int): Total dimension of the model.
    num_heads (int): The number of heads in multi-head attention.
    dropout (float, optional): The dropout probability used on attention
        weights to drop some attention targets. 0 for no dropout. Default 0.0.
    bias (bool, optional): If specified, adds bias to input / output projection layers.
        Default: True.
    add_bias_kv (bool, optional): If specified, adds bias to the key and value sequences
        at axis=0. Default: False.
    add_zero_attn (bool, optional): If specified, adds a new batch of zeros to the
        key and value sequences at axis=1. Default: False.
    kdim (int, optional): Total number of features for keys. If None, assumed equal to
        `embed_dim`. Default: None.
    vdim (int, optional): Total number of features for values. If None, assumed equal to
        `embed_dim`. Default: None.
    batch_first (bool, optional): If True, then the input and output tensors are provided
        as [batch, seq, feature]. Default: False.
    device (PlaceLike|None, optional): The device to initialize parameters on. Default: None.
    dtype (DTypeLike|None, optional): The data type of the parameters. Default: None.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> from paddle.compat import nn

        >>> # Example with batch_first=True
        >>> embed_dim, num_heads = 128, 8
        >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

        >>> # query: [batch_size, target_seq_len, embed_dim]
        >>> query = paddle.randn([32, 10, embed_dim])
        >>> # key, value: [batch_size, source_seq_len, embed_dim]
        >>> key = paddle.randn([32, 20, embed_dim])
        >>> value = paddle.randn([32, 20, embed_dim])

        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
        >>> print(attn_output.shape)
        paddle.Size([32, 10, 128])
c                  > U(       a  [         TU ]  US9  O[         TU ]  5         Xl        Ub  UOUU l        Ub  UOUU l        U R                  U:H  =(       a    U R                  U:H  U l        X l        X0l        Xl        X-  U l	        U R                  U-  U R                  :X  d   eS U l
        S U l        S U l        S U l        U R
                  (       ar  U R                  SU-  U/U R                  SU
[!        5       S9U l        S U l        S U l        S U l        U(       a%  U R                  SU-  /U R                  SU
S9U l
        GO	U R                  X/U R                  SU
[!        5       S9U l        U R                  XR                  /U R                  SU
[!        5       S9U l        U R                  XR                  /U R                  SU
[!        5       S9U l        S U l        U(       af  U R                  U/U R                  SU
S9U l        U R                  U/U R                  SU
S9U l        U R                  U/U R                  SU
S9U l        [*        R,                  R.                  R1                  XX@R                  S9U l        XPl        X`l        U(       a[  U R                  SSU/U R                  SU
[9        5       S9U l        U R                  SSU/U R                  SU
[9        5       S9U l        g S =U l        U l        g )	Ndtype   F)shaper   is_biasdevicedefault_initializerT)r   r   r   r   )biasr      )super__init__	embed_dimkdimvdim_qkv_same_embed_dim	num_headsdropoutbatch_firsthead_dimin_proj_biasq_proj_biask_proj_biasv_proj_biascreate_parameter_dtyper   in_proj_weightq_proj_weightk_proj_weightv_proj_weightpaddlecompatr   Linearout_projadd_bias_kvadd_zero_attnr   bias_kbias_v)selfr   r   r   r   r0   r1   r   r   r    r   r   	__class__s               \/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/compat/nn/transformer.pyr   MultiheadAttention.__init__X   s*    G5)G" ,D)	 ,D)	II"=tyyI'= 	  #&!.}}y(DNN::: ##"&"7"79}i0kk$1O #8 #D "&D!%D!%D$($9$9y=/++ !	 %: %! "&!6!6 ,kk$1O "7 "D "&!6!6 )),kk$1O "7 "D "&!6!6 )),kk$1O "7 "D #'D#'#8#8$+++ !	 $9 $  $(#8#8$+++ !	 $9 $  $(#8#8$+++ !	 $9 $  ((//t;; 0 
 '*//!Y'kk$0N 0 DK //!Y'kk$0N 0 DK )-,DK$+    c           	        UR                   [        R                  :X  d   SUR                    35       e[        R                  " [        R                  " U5      R
                  US9n[        R                  " X[        R                  " XS95      $ )z
Convert boolean mask to float mask. True -> -inf, False -> 0.0

Args:
    mask (paddle.Tensor): boolean mask
    dtype (DTypeLike): float dtype

Returns:
    paddle.Tensor: float mask
zmask must be boolean, but got r   )r   r,   bool	to_tensorfinfominwhere
zeros_like)r4   maskr   fillers       r6   _convert_bool_mask_to_float.MultiheadAttention._convert_bool_mask_to_float   sn     zzV[[( 	
,TZZL9	
( !!&,,u"5"9"9G||D&*;*;D*NOOr8   c                >   UR                   [        R                  :X  a"  UR                   [        R                  :X  a  X-  $ UR                   [        R                  :X  a  U R                  XS9nUR                   [        R                  :X  a  U R                  X#S9nX-   $ )a|  
Safely combine two masks, mask can be bool or float.

If both mask are bool, this function equals to
paddle.logical_or(mask1, mask2) and return boolean mask.

Otherwise, the boolean mask will be converted to float and combined with
the float mask using addition.

Args:
    mask1 (paddle.Tensor): mask1
    mask2 (paddle.Tensor): mask2

Returns:
    paddle.Tensor: combined mask
r   )r   r,   r:   rB   )r4   mask1mask2r   s       r6   _combine_masks!MultiheadAttention._combine_masks   sy    & ;;&++%%++*D= ;;&++%44U4HE;;&++%44U4HE}r8   c                    UR                   n/ US S QUPn[        R                  " XAR                  S9n[        R                  " X/SS9$ )Nr   axis)r   r,   zerosr   concat)r4   r@   pad_amtr   	pad_shape
pad_tensors         r6   	_pad_maskMultiheadAttention._pad_mask  sF    

*eCRj*'*	\\)::>
}}d/b99r8   c                   U R                   (       Ga'  [        U5      [        U5      :X  ab  [        U5      [        U5      :X  aJ  [        R                  " XR                  R
                  U R                  5      nUR                  SSS9u  pVnGOLU R                  R                  SSS9u  pn
U R                  b  U R                  R                  SSS9OSu  pn[        R                  " XR
                  U5      n[        R                  " X)R
                  U5      n[        R                  " X:R
                  U5      nO[        R                  " XR                  R
                  U R                  5      n[        R                  " X R                  R
                  U R                  5      n[        R                  " X0R                  R
                  U R                  5      nXVU4$ )Nr   rJ   rK   r   )NNN)r   idFlinearr(   Tr"   splitchunkr)   r#   r*   r$   r+   r%   )r4   querykeyvalueqkvqkvq_wk_wv_wq_bk_bv_bs                 r6   _project_qkvMultiheadAttention._project_qkv
  sh   
 ###%yBsG#32e9(<hhu&9&9&;&;T=N=NO))AB)/a $ 3 3 9 9!! 9 D# ((4 %%++AA+6$ #
 HHUEE3/HHS%%-HHUEE3/ 2 2 4 4d6F6FGA0022D4D4DEA 2 2 4 4d6F6FGAQwr8   c                N   U R                   (       ad  [        R                  " X R                  R	                  USS/5      /SS9n[        R                  " X0R
                  R	                  USS/5      /SS9nUR                  XEU R                  U R                  /5      R                  / SQ5      nUR                  S   nUR                  XFU R                  U R                  /5      R                  / SQ5      nUR                  XFU R                  U R                  /5      R                  / SQ5      nU R                  (       ab  [        R                  " X@R                  SU R                  /UR                  S9n[        R                  " X'/SS9n[        R                  " X7/SS9nXU4$ )NrJ   r   rK   r      r   r   r   rl   )r0   r,   rN   r2   expandr3   reshaper   r!   	transposer   r1   rM   r   )r4   r_   r`   ra   
batch_sizetarget_seq_lencurrent_src_lenrM   s           r6   _prepare_qkv_heads%MultiheadAttention._prepare_qkv_heads#  sa    KK&&
B';<=AA KK&&
B';<=AA IIG

)L
! 	
 ''!*II$..$--H

)L
! 	
 II$..$--H

)L
! 	
 LL^^Q>aggE qjq1Aqjq1AQwr8   c	                   U(       a-  U R                   (       d  U R                  (       d  Uc  U(       d  g Uc  U(       d  Uc  g UcU  U(       a8  [        R                  " [        R                  " X4/[        R
                  S9SS9nO[        R                  " X4/US9n[        U R                  U R                   -   5      n	U	S:  a!  U R                  XS9nUb  U R                  X)S9nUR                  5       S:X  a*  UR                  X`R                  -  /UR                  Q5      nUR                  5       S:X  a  UR                  X`R                  US/5      nUbF  UR                  SS/S	9nUR                  S/UR                  SS QSP5      nU R!                  XU5      nUR"                  U:w  aB  UR"                  [        R
                  :X  a  U R%                  X5      nU$ UR'                  U5      nU$ )
Nr   r   )diagonalr   )rO   rl   r   rJ   rK   )r0   r1   r,   triuonesr:   rM   intrR   dimrm   r   r   rn   	unsqueezerepeatrG   r   rB   astype)
r4   	attn_maskkey_padding_maskrq   src_len_before_biasr   rp   	is_causalneed_weights	pad_counts
             r6   _prepare_attn_mask%MultiheadAttention._prepare_attn_maskJ  s    $$&& ( Y3C3K"KKKK'=V[[ 		 #LL#9	 **T-=-==>	q=yDI+#'>>$ $2 $  ==?a!((nn,?y?I ==?a!))^^^R@I '/991v9F/66-Y__Qq)-1-  ++IOI??e#&++- <<YN	  &,,U3	r8   c           	     @   UR                   u  pxpU=(       a    US L nU(       d  [        R                  R                  R                  R                  UUUUU R                  (       a  U R                  OSUS9n
U
R                  / SQ5      n
U
R                  XyU R                  /5      n
U
S 4$ [        R                  " XSS9nXR                  S-  -  nUb=  UR                  [        R                  :X  a  U R                  XKR                  5      nX-   n[         R"                  " USS9n[         R                  " XR                  U R                  S	9n[        R                  " X5      nUR                  / SQ5      R                  XyU R                  /5      n
X(       a  U4$ S 4$ )
N        )r~   	dropout_pr   rk   T)transpose_yg      ?rJ   rK   )training)r   r,   r-   r   
functionalscaled_dot_product_attentionr   r   ro   rn   r   matmulr!   r   r:   rB   rV   softmax)r4   r_   r`   ra   
final_maskr   r   rp   _rq   attn_outputscoresweightsctxs                 r6   _attention_core"MultiheadAttention._attention_core  sr    ,-77(
~4*"4	  ++HH(.2mmdll' I   &//=K%--T^^<K $$]]1T:F}}c12F%##v{{2!%!A!A"LL"J  ,iiR0GiiNG--+C--5==T^^<K <AATAAr8   c	                N   UR                  5       S:H  n	U	(       d  UR                  U R                  (       a  SOS5      nUR                  U R                  (       a  SOS5      nUR                  U R                  (       a  SOS5      nUb%  UR                  5       S:w  a  UR                  S5      nU R                  (       d9  UR                  / SQ5      nUR                  / SQ5      nUR                  / SQ5      nUR                  u  pnUR                  S   nUb  UR                  X4:X  d   eU R                  XU5      u  pnU R                  XUX5      u  pnU R                  UUUUUR                  U
UUS9nU R                  XUUXX5      u  nnU R                  U5      nU R                  (       d  UR                  / SQ5      nU(       a  Ub  U(       a  UR                  SS9nU	(       d8  UR                  U R                  (       a  SOS5      nUb  UR                  S5      nUU4$ )a5  
Forward pass of the MultiheadAttention layer.

.. note::
    If ``need_weights`` is ``False``, this api will fallback to native math implementation,
    otherwise it will call ``paddle.compat.nn.functional.scaled_dot_product_attention`` to
    compute the attention score.

    To achieve better performance, explicitly set ``need_weights=False``,
    and set ``is_causal=True`` if the attn_mask is the causal mask.

Parameters:
    query (Tensor): The query embeddings. Shape depends on `batch_first`.
        If `batch_first` is False, shape is `[target_seq_len, batch_size, embed_dim]`.
        If `batch_first` is True, shape is `[batch_size, target_seq_len, embed_dim]`.
    key (Tensor): The key embeddings. Shape depends on `batch_first`.
        If `batch_first` is False, shape is `[source_seq_len, batch_size, kdim]`.
        If `batch_first` is True, shape is `[batch_size, source_seq_len, kdim]`.
    value (Tensor): The value embeddings. Shape depends on `batch_first`.
        If `batch_first` is False, shape is `[source_seq_len, batch_size, vdim]`.
        If `batch_first` is True, shape is `[batch_size, source_seq_len, vdim]`.
    key_padding_mask (Tensor, optional): If specified, a mask indicating which
        elements within `key` to ignore for the purpose of attention (i.e. treat as "padding").
        Can be a boolean mask (True indicates padding) or a float mask.
        Shape is `[batch_size, source_seq_len]`. Default: None.
    need_weights (bool, optional): Indicate whether to return the attention
        weights. Default: True.
    attn_mask (Tensor, optional): 2D or 3D mask that prevents attention to certain positions.
        A 2D mask will be broadcasted for all batches while a 3D mask allows different masks
        for the entries in the batch. Shape is `[target_seq_len, source_seq_len]` or
        `[batch_size * num_heads, target_seq_len, source_seq_len]`. Default: None.
    average_attn_weights (bool, optional): If True, indicates that the returned
        `attn_weights` should be averaged across heads. Default: True.
    is_causal (bool, optional): If True, implies that a causal mask is applied to
        the attention implementation. If attn_mask is None and is_causal is True,
        a causal mask is automatically created and used in the attention computation.
        Default: False.

Returns:
    tuple[Tensor, Tensor|None]:
        - **attn_output** (Tensor): The output of the attention mechanism.
          Shape matches `query` (based on `batch_first`).
        - **attn_output_weights** (Tensor|None): The attention weights. Returns None if
          `need_weights` is False. Shape is `[batch_size, target_seq_len, source_seq_len]`
          if `average_attn_weights` is True.
          If `average_attn_weights` is False, shape is
          `[batch_size, num_heads, target_seq_len, source_seq_len]`.
r   r   r   rl   )r   r   rl   )r~   r   rq   r   r   rp   r   r   rK   )rz   r{   r    ro   r   rh   rs   r   r   r   r/   meansqueeze)r4   r[   r\   r]   r   r   r~   average_attn_weightsr   
is_batchedrp   rq   r   r   r_   r`   ra   r   r   attn_weightss                       r6   forwardMultiheadAttention.forward  s   v YY[A%
OO)9)9AqAE--T%5%51=COO)9)9AqAE+0@0D0D0F!0K#3#=#=a#@ OOI.E--	*COOI.E(-%
A!iil'#))j-NNNN##E6a))!:Na,,-) 3''!% - 	

 %)$8$8!Z%
!\ mmK0%//	:KL4#+00a08%--43C3CaKK'+33A6L((r8   )r   r0   r1   r    r2   r3   r   r   r!   r"   r(   r$   r*   r   r   r/   r#   r)   r%   r+   r   )	r   TFFNNFNN)r   ry   r   ry   r   floatr   r:   r0   r:   r1   r:   r   
int | Noner   r   r    r:   r   zPlaceLike | Noner   zDTypeLike | NonereturnNone)r@   paddle.Tensorr   r	   r   r   )rE   r   rF   r   r   r	   r   r   )r   )r@   r   rO   ry   r   r   )r[   r   r\   r   r]   r   r   tuple[Tensor, Tensor, Tensor])r_   r   r`   r   ra   r   rp   ry   rq   ry   r   r   )r~   Tensor | Noner   r   rq   ry   r   ry   r   r	   rp   ry   r   r:   r   r:   r   r   )r_   r   r`   r   ra   r   r   r   r   r:   r   r:   r   ztuple[Tensor, Tensor | None])NTNTF)r[   r   r\   r   r]   r   r   paddle.Tensor | Noner   r:   r~   r   r   r:   r   r:   r   z*tuple[paddle.Tensor, paddle.Tensor | None])__name__
__module____qualname____firstlineno____doc__r   rB   rG   rR   rh   rs   r   r   r   __static_attributes____classcell__)r5   s   @r6   r   r      s   8| !#!#'"&y-y- y- 	y-
 y- y- y- y- y- y- !y-  y- 
y- y-vP!P*3P	P&"+8AJ	::"(17	&2%% % 	%
 % % 
'%NG G (G 	G
 !G G G G G 
GR0B0B 0B 	0B
 "0B 0B 0B 
&0Bn 26!*.%)n)n) n) 	n)
 /n) n) (n) #n) n) 
4n) n)r8   r   )
__future__r   typingr   r,   paddle.nn.functionalr   r   rV   paddle.nn.initializerr   r   r   paddle._typingr	   r
   Layerr    r8   r6   <module>r      s7    #         =3V) V)r8   