
    x-jU                        d dl mZ d dlmZ d dlZd dlmc mZ d dlmZ d dl	m
Z
mZ erd dlmZ d dlmZmZ  G d d	ej                  ZdS )
    )annotations)TYPE_CHECKINGN)nn)XavierNormalXavierUniform)Tensor)	DTypeLike	PlaceLikec                       e Zd ZdZ	 	 	 	 	 	 	 	 	 d>d? fdZd@dZdAd ZdBdCd$ZdDd)ZdEd/Z	dFd6Z
dGd9Z	 	 	 	 	 dHdId=Z xZS )JMultiheadAttentiona
  
    Allows the model to jointly attend to information from different representation subspaces.

    Multi-Head Attention is defined as:

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1,\dots,\text{head}_h)W^O

    where :math:`\text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.

    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
    for more details.

    .. note::
        This layer will use the optimized implementation
        :func:`paddle.nn.functional.scaled_dot_product_attention` if no need to return the attention weights.

    Parameters:
        embed_dim (int): Total dimension of the model.
        num_heads (int): The number of heads in multi-head attention.
        dropout (float, optional): The dropout probability used on attention
            weights to drop some attention targets. 0 for no dropout. Default 0.0.
        bias (bool, optional): If specified, adds bias to input / output projection layers.
            Default: True.
        add_bias_kv (bool, optional): If specified, adds bias to the key and value sequences
            at axis=0. Default: False.
        add_zero_attn (bool, optional): If specified, adds a new batch of zeros to the
            key and value sequences at axis=1. Default: False.
        kdim (int, optional): Total number of features for keys. If None, assumed equal to
            `embed_dim`. Default: None.
        vdim (int, optional): Total number of features for values. If None, assumed equal to
            `embed_dim`. Default: None.
        batch_first (bool, optional): If True, then the input and output tensors are provided
            as [batch, seq, feature]. Default: False.
        device (PlaceLike|None, optional): The device to initialize parameters on. Default: None.
        dtype (DTypeLike|None, optional): The data type of the parameters. Default: None.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> from paddle.compat import nn

            >>> # Example with batch_first=True
            >>> embed_dim, num_heads = 128, 8
            >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

            >>> # query: [batch_size, target_seq_len, embed_dim]
            >>> query = paddle.randn([32, 10, embed_dim])
            >>> # key, value: [batch_size, source_seq_len, embed_dim]
            >>> key = paddle.randn([32, 20, embed_dim])
            >>> value = paddle.randn([32, 20, embed_dim])

            >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
            >>> print(attn_output.shape)
            paddle.Size([32, 10, 128])
            TFN	embed_dimint	num_headsdropoutfloatbiasbooladd_bias_kvadd_zero_attnkdim
int | Nonevdimbatch_firstdevicePlaceLike | NonedtypeDTypeLike | NonereturnNonec                4   |r#t                                          |           n t                                                       || _        ||n|| _        ||n|| _        | j        |k    o
| j        |k    | _        || _        || _        |	| _        ||z  | _	        | j	        |z  | j        k    sJ d | _
        d | _        d | _        d | _        | j        ru|                     d|z  |g| j        d|
t!                                | _        d | _        d | _        d | _        |r'|                     d|z  g| j        d|
          | _
        n|                     ||g| j        d|
t!                                | _        |                     || j        g| j        d|
t!                                | _        |                     || j        g| j        d|
t!                                | _        d | _        |rl|                     |g| j        d|
          | _        |                     |g| j        d|
          | _        |                     |g| j        d|
          | _        t*          j        j                            |||| j                  | _        || _        || _        |rh|                     dd|g| j        d|
t9                                | _        |                     dd|g| j        d|
t9                                | _        d S d x| _        | _        d S )	Nr      F)shaper   is_biasr   default_initializerT)r$   r   r%   r   )r   r      )super__init__r   r   r   _qkv_same_embed_dimr   r   r   head_dimin_proj_biasq_proj_biask_proj_biasv_proj_biascreate_parameter_dtyper   in_proj_weightq_proj_weightk_proj_weightv_proj_weightpaddlecompatr   Linearout_projr   r   r   bias_kbias_v)selfr   r   r   r   r   r   r   r   r   r   r   	__class__s               \/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/compat/nn/transformer.pyr)   zMultiheadAttention.__init__X   s     	GG5))))GG" ,DD)	 ,DD)	I"=tyI'= 	  #&!Y.}y(DN:::: # =	"&"7"79}i0k$1OO #8 # #D "&D!%D!%D $($9$9y=/+ !	 %: % %! "&!6!6 ),k$1OO "7 " "D "&!6!6 $),k$1OO "7 " "D "&!6!6 $),k$1OO "7 " "D #'D #'#8#8$++ !	 $9 $ $  $(#8#8$++ !	 $9 $ $  $(#8#8$++ !	 $9 $ $  (//yt4; 0 
 
 '* 	-//!Y'k$0NN 0  DK //!Y'k$0NN 0  DKKK )-,DK$+++    maskpaddle.Tensorr	   c                    |j         t          j        k    sJ d|j                      t          j        t          j        |          j        |          }t          j        ||t          j        ||                    S )z
        Convert boolean mask to float mask. True -> -inf, False -> 0.0

        Args:
            mask (paddle.Tensor): boolean mask
            dtype (DTypeLike): float dtype

        Returns:
            paddle.Tensor: float mask
        zmask must be boolean, but got r"   )r   r6   r   	to_tensorfinfominwhere
zeros_like)r<   r@   r   fillers       r>   _convert_bool_mask_to_floatz.MultiheadAttention._convert_bool_mask_to_float   sv     zV[(((9TZ99 )(( !&,u"5"5"9GGG|D&&*;D*N*N*NOOOr?   mask1mask2c                   |j         t          j        k    r|j         t          j        k    r||z  S |j         t          j        k    r|                     ||          }|j         t          j        k    r|                     ||          }||z   S )a  
        Safely combine two masks, mask can be bool or float.

        If both mask are bool, this function equals to
        paddle.logical_or(mask1, mask2) and return boolean mask.

        Otherwise, the boolean mask will be converted to float and combined with
        the float mask using addition.

        Args:
            mask1 (paddle.Tensor): mask1
            mask2 (paddle.Tensor): mask2

        Returns:
            paddle.Tensor: combined mask
        r"   )r   r6   r   rI   )r<   rJ   rK   r   s       r>   _combine_masksz!MultiheadAttention._combine_masks   s    & ;&+%%%+*D*D5= ;&+%%44U%4HHE;&+%%44U%4HHEu}r?   r'   r   pad_amtc                    |j         }g |d d         |}t          j        ||j                  }t          j        ||gd          S )Nr"   axis)r$   r6   zerosr   concat)r<   r@   rN   r$   	pad_shape
pad_tensors         r>   	_pad_maskzMultiheadAttention._pad_mask  sP    
*eCRCj*'*	\)4:>>>
}dJ/b9999r?   querykeyvaluetuple[Tensor, Tensor, Tensor]c                4   | j         rt          |          t          |          k    rbt          |          t          |          k    rBt          j        || j        j        | j                  }|                    dd          \  }}}n
| j                            dd          \  }}	}
| j        | j                            dd          nd\  }}}t          j        ||j        |          }t          j        ||	j        |          }t          j        ||
j        |          }not          j        || j	        j        | j
                  }t          j        || j        j        | j                  }t          j        || j        j        | j                  }|||fS )Nr#   rP   rQ   r   )NNN)r*   idFlinearr2   Tr,   splitchunkr3   r-   r4   r.   r5   r/   )r<   rX   rY   rZ   qkvqkvq_wk_wv_wq_bk_bv_bs                 r>   _project_qkvzMultiheadAttention._project_qkv
  sp   
 # 	H%yyBsGG##32e99(<(<hud&9&;T=NOO))AB)//1aa $ 3 9 9!! 9 D DS# (4 %++AA+666$ S#
 HUCE3//HS#%--HUCE3// 2 4d6FGGAd02D4DEEA 2 4d6FGGA!Qwr?   rd   re   rf   
batch_sizetarget_seq_lenc                   | j         rft          j        || j                            |ddg          gd          }t          j        || j                            |ddg          gd          }|                    ||| j        | j        g          	                    g d          }|j
        d         }|                    ||| j        | j        g          	                    g d          }|                    ||| j        | j        g          	                    g d          }| j        rYt          j        || j        d| j        g|j                  }t          j        ||gd          }t          j        ||gd          }|||fS )NrP   r'   rQ   r      r'   r#   r"   rr   )r   r6   rT   r:   expandr;   reshaper   r+   	transposer$   r   rS   r   )r<   rd   re   rf   rn   ro   current_src_lenrS   s           r>   _prepare_qkv_headsz%MultiheadAttention._prepare_qkv_heads#  s     	DK&&
B';<<=A  A DK&&
B';<<=A  A IIG
 

)LLL
!
! 	
 '!*II$.$-H
 

)LLL
!
! 	
 II$.$-H
 

)LLL
!
! 	
  	2LT^Q>ag  E q%jq111Aq%jq111A!Qwr?   	attn_maskTensor | Nonekey_padding_masksrc_len_before_bias	is_causalneed_weightsc	                   |r| j         s| j        s||sd S ||s|d S |Q|r7t          j        t          j        ||gt          j                  d          }nt          j        ||g|          }t          | j        | j         z             }	|	dk    r0|                     ||	          }||                     ||	          }|	                                dk    r%|
                    || j        z  g|j                  }|	                                dk    r|                    || j        |dg          }|V|                    ddg	          }|                    dg|j        dd         d          }|                     |||          }|j        |k    rA|j        t          j        k    r|                     ||          }n|                    |          }|S )
Nr"   r'   )diagonalr   )rN   rr   r#   rP   rQ   )r   r   r6   triuonesr   rS   r   rW   dimrs   r   r$   rt   	unsqueezerepeatrM   r   rI   astype)
r<   rx   rz   ro   r{   r   rn   r|   r}   	pad_counts
             r>   _prepare_attn_maskz%MultiheadAttention._prepare_attn_maskJ  sL    	$	 &	 !(  ) 4Y3C3K4 
"KK')<=V[   	  		 #L#%89  	 *T-==>>	q==y)DDI+#'>>$i $2 $ $  ==??a!((dn,?y? I ==??a!))T^^R@ I '/991v9FF/66-Y_QqS)-1-    ++I7GOOI?e##&+-- <<YNN		%,,U33	r?   
final_masktuple[Tensor, Tensor | None]c                   |j         \  }}}	}|o|d u }|sut          j        j        j                            ||||| j        r| j        nd|          }
|
                    g d          }
|
	                    ||	| j
        g          }
|
d fS t          j        ||d          }|| j        dz  z  }|5|j        t          j        k    r|                     ||j                  }||z   }t!          j        |d          }t!          j        || j        | j        	          }t          j        ||          }|                    g d          	                    ||	| j
        g          }
|
|r|nd fS )
Nr   )rx   	dropout_pr|   rq   T)transpose_yg      ?rP   rQ   )training)r$   r6   r7   r   
functionalscaled_dot_product_attentionr   r   ru   rt   r   matmulr+   r   r   rI   r^   softmax)r<   rd   re   rf   r   r}   r|   rn   _ro   attn_outputscoresweightsctxs                 r>   _attention_corez"MultiheadAttention._attention_core  s    ,-7(
A~q4*"4	 "	B +HH(.2mDdll' I    &//==K%--^T^< K $$]1aT:::Ft}c12F%#v{22!%!A!A"FL" "J  *,iR000GiNNNG-++C--55==^T^< K < ATAAr?   paddle.Tensor | Noneaverage_attn_weights*tuple[paddle.Tensor, paddle.Tensor | None]c	           
     4   |                                 dk    }	|	s|                    | j        rdnd          }|                    | j        rdnd          }|                    | j        rdnd          }|-|                                 dk    r|                    d          }| j        sE|                    g d          }|                    g d          }|                    g d          }|j        \  }
}}|j        d         }||j        |
|fk    sJ |                     |||          \  }}}|                     ||||
|          \  }}}|                     |||||j        |
||          }| 	                    ||||||          \  }}| 
                    |          }| j        s|                    g d          }|r||r|                    d          }|	s5|                    | j        rdnd          }||                    d          }||fS )	a  
        Forward pass of the MultiheadAttention layer.

        .. note::
            If ``need_weights`` is ``False``, this api will fallback to native math implementation,
            otherwise it will call ``paddle.compat.nn.functional.scaled_dot_product_attention`` to
            compute the attention score.

            To achieve better performance, explicitly set ``need_weights=False``,
            and set ``is_causal=True`` if the attn_mask is the causal mask.

        Parameters:
            query (Tensor): The query embeddings. Shape depends on `batch_first`.
                If `batch_first` is False, shape is `[target_seq_len, batch_size, embed_dim]`.
                If `batch_first` is True, shape is `[batch_size, target_seq_len, embed_dim]`.
            key (Tensor): The key embeddings. Shape depends on `batch_first`.
                If `batch_first` is False, shape is `[source_seq_len, batch_size, kdim]`.
                If `batch_first` is True, shape is `[batch_size, source_seq_len, kdim]`.
            value (Tensor): The value embeddings. Shape depends on `batch_first`.
                If `batch_first` is False, shape is `[source_seq_len, batch_size, vdim]`.
                If `batch_first` is True, shape is `[batch_size, source_seq_len, vdim]`.
            key_padding_mask (Tensor, optional): If specified, a mask indicating which
                elements within `key` to ignore for the purpose of attention (i.e. treat as "padding").
                Can be a boolean mask (True indicates padding) or a float mask.
                Shape is `[batch_size, source_seq_len]`. Default: None.
            need_weights (bool, optional): Indicate whether to return the attention
                weights. Default: True.
            attn_mask (Tensor, optional): 2D or 3D mask that prevents attention to certain positions.
                A 2D mask will be broadcasted for all batches while a 3D mask allows different masks
                for the entries in the batch. Shape is `[target_seq_len, source_seq_len]` or
                `[batch_size * num_heads, target_seq_len, source_seq_len]`. Default: None.
            average_attn_weights (bool, optional): If True, indicates that the returned
                `attn_weights` should be averaged across heads. Default: True.
            is_causal (bool, optional): If True, implies that a causal mask is applied to
                the attention implementation. If attn_mask is None and is_causal is True,
                a causal mask is automatically created and used in the attention computation.
                Default: False.

        Returns:
            tuple[Tensor, Tensor|None]:
                - **attn_output** (Tensor): The output of the attention mechanism.
                  Shape matches `query` (based on `batch_first`).
                - **attn_output_weights** (Tensor|None): The attention weights. Returns None if
                  `need_weights` is False. Shape is `[batch_size, target_seq_len, source_seq_len]`
                  if `average_attn_weights` is True.
                  If `average_attn_weights` is False, shape is
                  `[batch_size, num_heads, target_seq_len, source_seq_len]`.
        r#   r   r'   Nrr   )r'   r   rr   )rx   rz   ro   r{   r   rn   r|   r}   rQ   )r   r   r   ru   r$   rm   rw   r   r   r   r9   meansqueeze)r<   rX   rY   rZ   rz   r}   rx   r   r|   
is_batchedrn   ro   r   r{   rd   re   rf   r   r   attn_weightss                       r>   forwardzMultiheadAttention.forward  sr   v YY[[A%
 	AOO)9$@AAqAAE--T%5 <1==COO)9$@AAqAAE+0@0D0D0F0F!0K0K#3#=#=a#@#@  	/OOIII..E--			**COOIII..E(-%
NA!il'#)j:M-NNNNN##E3661a))!Q:~NN1a,,-) 3'!% - 	
 	

 %)$8$8q!Zy%
 %
!\ mmK00 	;%//			::K 	9L4# 9+00a088 	7%--43C.JaaKKK'+33A66L((r?   )	r   TFFNNFNN)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    )r@   rA   r   r	   r   rA   )rJ   rA   rK   rA   r   r	   r   rA   )r'   )r@   r   rN   r   r   r   )rX   r   rY   r   rZ   r   r   r[   )rd   r   re   r   rf   r   rn   r   ro   r   r   r[   )rx   ry   rz   ry   ro   r   r{   r   r   r	   rn   r   r|   r   r}   r   r   ry   )rd   r   re   r   rf   r   r   ry   r}   r   r|   r   r   r   )NTNTF)rX   rA   rY   rA   rZ   rA   rz   r   r}   r   rx   r   r   r   r|   r   r   r   )__name__
__module____qualname____doc__r)   rI   rM   rW   rm   rw   r   r   r   __classcell__)r=   s   @r>   r   r      sE       8 8| !#!#'"&y- y- y- y- y- y- y-vP P P P&   :: : : : :   2% % % %NG G G GR0B 0B 0B 0Bn 26!*.%)n) n) n) n) n) n) n) n) n)r?   r   )
__future__r   typingr   r6   paddle.nn.functionalr   r   r^   paddle.nn.initializerr   r   r   paddle._typingr	   r
   Layerr    r?   r>   <module>r      s    # " " " " "                                      = = = = = = = = 433333333V) V) V) V) V) V) V) V) V) V)r?   