
    ёio.                      S SK Jr  S SKrS SKrS SKJrJrJr  S SKr	S SK
r
S SKJr  SSKJr  SSKJr  SS	KJr  S
SKJrJr  S
SKJr  S
SKJr  S
SKJr  \(       a  S SKJr  S SK
Jr  S SKJ r J!r!  / r"\      S"S j5       r#\      S#S j5       r#S r#S$S jr$ " S S\5      r% " S S\5      r& " S S\5      r' " S S\5      r( " S S\5      r) " S  S!\5      r*g)%    )annotationsN)TYPE_CHECKINGLiteraloverload)convert_dtype   )tensor)	ParamAttr   )
functional   )DropoutLinear)	LayerList)Layer)	LayerNorm)Sequence)Tensor)	DTypeLikeParamAttrLikec                    g N 
param_attrns     [/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/nn/layer/transformer.py_convert_param_attr_to_listr   +   s         c                    g r   r   r   s     r   r   r   1   s     r   c                >   [        U [        [        45      (       a  [        U 5      U:X  d   SU S35       e/ nU  H~  n[        U[        5      (       aA  U(       a'  UR                  [        R                  " S5      5        MF  UR                  S5        MY  UR                  [        R                  " U5      5        M     U$ [        U [        5      (       aC  / nU (       a1  [        U5       Vs/ s H  n[        R                  " S5      PM     nnU$ S/U-  n U$ / n[        R                  " U 5      n[        U5       H[  n[        R                  " U5      nUR                  (       a   UR                  S-   [        U5      -   Ul        UR                  U5        M]     U$ s  snf )ag  
If `param_attr` is a list or tuple, convert every element in it to a
ParamAttr instance. Otherwise, repeat `param_attr` `n` times to
construct a list, and rename every one by appending a increasing index
suffix to avoid having same names when `param_attr` contains a name.

Parameters:
    param_attr (list|tuple|ParamAttr|bool|None): A list, tuple or something can be
        converted to a ParamAttr instance by `ParamAttr._to_attr`.
    n (int): The times to repeat to construct a list when `param_attr`
        is not a list or tuple.

Returns:
    list: A list composed of each including cell's `param_attr`.
zlength of param_attr should be z when it is a list/tupleNF_)
isinstancelisttuplelenboolappendr
   _to_attrrangecopydeepcopynamestr)r   r   param_attrsattriattr_is         r   r   r   8   sj     *tUm,,:!# 	
-aS0HI	
# D$%%&&y'9'9$'?@&&u-""9#5#5d#;< .  
J	%	%=B1XFX9--d3XKF  !'A+K  !!*-qA]]4(Fyy$kkC/#a&8v&	 
  Gs   ) Fc                    U be  U R                   U:w  aU  [        U R                   5      nUS:X  d  SU;   a  [        R                  " X5      S-
  S-  n U $ [        R                  " X5      n U $ )a  
Convert the attention mask to the target dtype we expect.

Parameters:
    attn_mask (Tensor, optional): A tensor used in multi-head attention
            to prevents attention to some unwanted positions, usually the
            paddings or the subsequent positions. It is a tensor with shape
            broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
            When the data type is bool, the unwanted positions have `False`
            values and the others have `True` values. When the data type is
            int, the unwanted positions have 0 values and the others have 1
            values. When the data type is float, the unwanted positions have
            `-INF` values and the others have 0 values. It can be None when
            nothing wanted or needed to be prevented attention to. Default None.
    dtype (VarType): The target type of `attn_mask` we expect.

Returns:
    Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`.
r'   intg      ?g    eA)dtyper   paddlecast)	attn_maskr5   attn_mask_dtypes      r   _convert_attention_maskr:   g   sh    ( E!9'	8f$(@Y6<CI  I5Ir   c                    ^  \ rS rSr% Sr\R                  " SSS/5      r\R                  " SSS/5      rS\	S'   S\	S	'   S\	S
'   S\	S'   S\	S'   S\	S'   S\	S'         S"                 S#U 4S jjjr
\ S$         S%S jj5       r\ S$         S&S jj5       rS'S jrS(S jr\ S)       S*S jj5       r\  S)       S+S jj5       rS\4S jr\    S,           S-S jj5       r\    S,           S.S jj5       r\    S,           S/S jj5       r\    S,           S0S jj5       r\    S,           S1S jj5       r\    S,           S2S jj5       rS3S  jrS!rU =r$ )4MultiHeadAttention   a  
Attention maps queries and a set of key-value pairs to outputs, and
Multi-Head Attention performs multiple parallel attention to jointly attending
to information from different representation subspaces.

Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
for more details.

Parameters:
    embed_dim (int): The expected feature size in the input and output.
    num_heads (int): The number of heads in multi-head attention.
    dropout (float, optional): The dropout probability used on attention
        weights to drop some attention targets. 0 for no dropout. Default 0
    kdim (int, optional): The feature size in key. If None, assumed equal to
        `embed_dim`. Default None.
    vdim (int, optional): The feature size in value. If None, assumed equal to
        `embed_dim`. Default None.
    need_weights (bool, optional): Indicate whether to return the attention
        weights. Default False.
    weight_attr(ParamAttr|None, optional):  To specify the weight parameter property.
        Default: None, which means the default weight parameter property is used.
        See usage for details in :code:`ParamAttr` .
    bias_attr (ParamAttr|bool|None, optional): To specify the bias parameter property.
        Default: None, which means the default bias parameter property is used.
        If it is set to False, this layer will not have trainable bias parameter.
        See usage for details in :code:`ParamAttr` .

Examples:

    .. code-block:: pycon

        >>> import paddle

        >>> # encoder input: [batch_size, sequence_length, d_model]
        >>> query = paddle.rand((2, 4, 128))
        >>> # self attention mask: [batch_size, num_heads, query_len, query_len]
        >>> attn_mask = paddle.rand((2, 2, 4, 4))
        >>> multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)
        >>> output = multi_head_attn(query, None, None, attn_mask=attn_mask)
        >>> print(output.shape)
        paddle.Size([2, 4, 128])
CachekvStaticCacher4   	embed_dimkdimvdim	num_headshead_dimfloatdropoutr'   need_weightsNc	                  > [         T	U ]  5         US:  d
   SU 35       eUS:  d
   SU 35       eXl        Ub  UOUU l        Ub  UOUU l        X l        X0l        X`l        X-  U l        U R                  U-  U R                  :X  d   S5       e[        XXxS9U l
        [        U R                  XUS9U l        [        U R                  XUS9U l        [        XXxS9U l        g )Nr   z6Expected embed_dim to be greater than 0, but received z6Expected num_heads to be greater than 0, but received z(embed_dim must be divisible by num_heads	bias_attr)super__init__rB   rC   rD   rE   rH   rI   rF   r   q_projk_projv_projout_proj)
selfrB   rE   rH   rC   rD   rI   weight_attrrL   	__class__s
            r   rN   MultiHeadAttention.__init__   s    	1} 	
DYKP	
} 1} 	
DYKP	
} # ,D)	 ,D)	"(!.}}y(DNN: 	
6	
: +
 IIy
 IIy
 +
r   c                    g r   r   rS   querykeyvaluecaches        r   _prepare_qkvMultiHeadAttention._prepare_qkv   s     ),r   c                    g r   r   rX   s        r   r]   r^      s     >Ar   c                &   U R                  U5      n[        R                  " USSU R                  U R                  /S9n[        R
                  " U/ SQS9n[        X@R                  5      (       a  UR                  UR                  pvOU R                  X#5      u  pg[        X@R                  5      (       aS  [        R                  " UR                  U/SS9n[        R                  " UR                  U/SS9nU R                  Xg5      nUc  XVU4$ XVXt4$ )a  
Prepares linear projected queries, keys and values for usage of subsequent
multiple parallel attention. If `cache` is not None, using cached results
to reduce redundant calculations.

Parameters:
    query (Tensor): The queries for multi-head attention. It is a
        tensor with shape `[batch_size, query_length, embed_dim]`. The
        data type should be float32 or float64.
    key (Tensor): The keys for multi-head attention. It is
        a tensor with shape `[batch_size, key_length, kdim]`. The
        data type should be float32 or float64. If None, use `query` as
        `key`.
    value (Tensor): The values for multi-head attention. It
        is a tensor with shape `[batch_size, value_length, vdim]`.
        The data type should be float32 or float64. If None, use `query` as
        `value`.
    cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache|None, optional):
        It is a namedtuple with `k` and `v` as fields, and stores tensors
        shaped `[batch_size, num_heads, length, embed_dim]` which are results
        of linear projection, reshape and transpose calculations in
        MultiHeadAttention. If is an instance of `Cache`, `k` and `v`
        fields reserve intermediate results of previous positions, which
        mostly used for decoder self attention. If it is an instance of
        `StaticCache`, `key` and `value` args would be ignored, `k` and
        `v` fields would be used as calculated results on `key` and
        `value`, which mostly used for decoder-encoder cross attention.
        It is only used for inference and should be None for training.
        Default None.

Returns:
    tuple: A tuple including linear projected keys and values. These two \
        tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \
        and `[batch_size, n_head, sequence_length, d_value]` separately, \
        and their data types are same as inputs.
r   xshaper   r   r   r   rb   permr   )axis)rO   r	   reshaperE   rF   	transposer#   rA   r?   r@   
compute_kvr>   concat)rS   rY   rZ   r[   r\   qr?   r@   s           r   r]   r^      s    J KKNNQq!T^^T]]&KLq|4e--..77EGGq??3.DAeZZ((uwwl3Auwwl3AJJq$E!May?a/??r   c                \   U R                  U5      nU R                  U5      n[        R                  " USSU R                  U R
                  /S9n[        R                  " U/ SQS9n[        R                  " USSU R                  U R
                  /S9n[        R                  " U/ SQS9nX44$ )a  
Applies linear projection on input keys and values, then splits heads
(reshape and transpose) to get keys and values from different representation
subspaces. The results are used as key-values pairs for subsequent multiple
parallel attention.

It is part of calculations in multi-head attention, and is provided as
a method to pre-compute and prefetch these results, thus we can use them
to construct cache for inference.

Parameters:
    key (Tensor): The keys for multi-head attention. It is a tensor
        with shape `[batch_size, sequence_length, kdim]`. The data type
        should be float32 or float64.
    value (Tensor): The values for multi-head attention. It is a tensor
        with shape `[batch_size, sequence_length, vdim]`. The data type
        should be float32 or float64.

Returns:
    Tuple. A tuple including transformed keys and values. Their shapes
    both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`,
    and their data types are same as inputs.
r   ra   rd   re   )rP   rQ   r	   rh   rE   rF   ri   )rS   rZ   r[   r?   r@   s        r   rj   MultiHeadAttention.compute_kv1  s    0 KKKKNNQq!T^^T]]&KLq|4NNQq!T^^T]]&KLq|4tr   c                    g r   r   rS   rZ   r[   types       r   	gen_cacheMultiHeadAttention.gen_cacheQ  s     r   c                    g r   r   rp   s       r   rr   rs   V  s     r   c                   U[         R                  :X  a$  U R                  X5      u  pEU R                  XE5      $ Uc  SU R                  SU R                  /n[
        R                  " U5      S   R                  5       US'   [
        R                  " USUR                  5      n[
        R                  " USUR                  5      nU R                  XE5      $ U R                  X5      $ )a  
Generates cache for `forward` usage in inference according to arguments.
The generated cache is an instance of `MultiHeadAttention.Cache` or an
instance of `MultiHeadAttention.StaticCache`.

`Cache` or `StaticCache` is namedtuple with `k` and `v` as fields,
and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]`
which are results of linear projection, reshape and transpose calculations
in MultiHeadAttention.

If the generated cache is an instance of `Cache`, `k` and `v` fields
reserve intermediate result tensors of previous positions, and the tensors
are incremental among decoding steps, which mostly are used for decoder
decoder self attention.

If the generated cache is an instance of `StaticCache`, `k` and `v` fields
would be used as calculated result tensors on keys an values in `forward`,
and the tensors keep unchanged among decoding steps, which are mostly used
for decoder-encoder cross attention.

The cache is generated as follows:

1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the
results to create an instance of `StaticCache`.

2. If `type` is `Cache` and `value` is None, generate empty tensors shaped
`[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results
to create an instance of `Cache`, where `batch_size` is from the first
dimension of `key`.

3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create
an instance of `Cache`.

Parameters:
    key (Tensor): The keys for multi-head attention. It is
        a tensor with shape `[batch_size, key_length, kdim]`. The
        data type should be float32 or float64. If `value` is None,
        it is only for batch size and data type reference.
    value (Tensor, optional): The values for multi-head attention. It
        is a tensor with shape `[batch_size, value_length, vdim]`.
        The data type should be float32 or float64. If None, `key` is only
        for batch size reference. Default None.
    type (type): It should be `MultiHeadAttention.StaticCache` or
        `MultiHeadAttention.Cache` to indicate the cache type to generate.

Returns:
    namedtuple: an instance of `Cache` or `StaticCache` accordingly.
r   )r<   rA   rj   rE   rF   r6   rc   itemfullr5   r>   )rS   rZ   r[   rq   r?   r@   
fill_shapes          r   rr   rs   ^  s    b %111??3.DA##A))]dnna?J"LL-a0557JqMJ3995AJ3995A::a## ::c))r   c                    g r   r   rS   rY   rZ   r[   r8   r\   s         r   forwardMultiHeadAttention.forward       r   c                    g r   r   r{   s         r   r|   r}     s     !$r   c                    g r   r   r{   s         r   r|   r}     s      #r   c                    g r   r   r{   s         r   r|   r}     s     &)r   c                    g r   r   r{   s         r   r|   r}     s     (+r   c                    g r   r   r{   s         r   r|   r}     s     .1r   c                0   Uc  UOUnUc  UOUnUc  U R                  XX55      u  pgnOU R                  XX55      u  pgp[        R                  " X`R                  S-  -  USS9n	Ub  [	        XIR
                  5      nX-   n	[        R                  " U	5      n
U R                  (       a+  [        R                  " U
U R                  U R                  SS9n
[        R                  " X5      n[        R                  " U/ SQS9n[        R                  " USSUR                  S	   UR                  S
   -  /S9nU R                  U5      nU/nU R                  (       a  UR!                  U
5        Ub  UR!                  U5        [#        U5      S:X  a  U$ [%        U5      $ )a  
Applies multi-head attention to map queries and a set of key-value pairs
to outputs.

Parameters:
    query (Tensor): The queries for multi-head attention. It is a
        tensor with shape `[batch_size, query_length, embed_dim]`. The
        data type should be float32 or float64.
    key (Tensor|None, optional): The keys for multi-head attention. It is
        a tensor with shape `[batch_size, key_length, kdim]`. The
        data type should be float32 or float64. If None, use `query` as
        `key`. Default None.
    value (Tensor|None, optional): The values for multi-head attention. It
        is a tensor with shape `[batch_size, value_length, vdim]`.
        The data type should be float32 or float64. If None, use `query` as
        `value`. Default None.
    attn_mask (Tensor|None, optional): A tensor used in multi-head attention
        to prevents attention to some unwanted positions, usually the
        paddings or the subsequent positions. It is a tensor with shape
        broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
        When the data type is bool, the unwanted positions have `False`
        values and the others have `True` values. When the data type is
        int, the unwanted positions have 0 values and the others have 1
        values. When the data type is float, the unwanted positions have
        `-INF` values and the others have 0 values. It can be None when
        nothing wanted or needed to be prevented attention to. Default None.
    cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache|None, optional):
        It is a namedtuple with `k` and `v` as fields, and stores tensors
        shaped `[batch_size, num_heads, length, embed_dim]` which are results
        of linear projection, reshape and transpose calculations in
        MultiHeadAttention. If it is an instance of `Cache`, `k` and `v`
        fields reserve intermediate results of previous positions, which
        mostly used for decoder self attention. If it is an instance of
        `StaticCache`, `key` and `value` args would be ignored, `k` and
        `v` fields would be used as calculated results on `key` and
        `value`, which mostly used for decoder-encoder cross attention.
        It is only used for inference and should be None for training.
        Default None.

Returns:
    Tensor|tuple. It is a tensor that has the same shape and data type
    as `query`, representing attention output. Or a tuple if
    `need_weights` is True or `cache` is not None. If `need_weights`
    is True, except for attention output, the tuple also includes
    the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`.
    If `cache` is not None, the tuple then includes the new cache
    having the same type as `cache`, and if it is `StaticCache`, it
    is same as the input `cache`, if it is `Cache`, the new cache
    reserves tensors concatenating raw tensors with intermediate
    results of current query.
g      T)rb   ytranspose_yupscale_in_train)trainingmoderd   )rf   r   r   r   ra   r   )r]   r6   matmulrF   r:   r5   FsoftmaxrH   r   r	   ri   rh   rc   rR   rI   r(   r&   r%   )rS   rY   rZ   r[   r8   r\   rl   r?   r@   productweightsoutoutss                r   r|   r}     sk   h {eE=''EAGA!!..u5HNA! --==$&'1$
  /	==II)G))G$<<ii'	G mmG' s6nns1a1		!1L*MN mmC uKK KK$i1ns5%+5r   )rH   rB   rF   rP   rC   rI   rE   rR   rO   rQ   rD   )g        NNFNN)rB   r4   rE   r4   rH   rG   rC   
int | NonerD   r   rI   r'   rT   ParamAttrLike | NonerL   r   returnNone.)
rY   r   rZ   r   r[   r   r\   r   r   ztuple[Tensor, Tensor, Tensor])
rY   r   rZ   r   r[   r   r\   zCache | StaticCacher   z2tuple[Tensor, Tensor, Tensor, Cache | StaticCache]r   )rZ   r   r[   r   r   tuple[Tensor, Tensor]..)rZ   r   r[   Tensor | Nonerq   ztype[Cache]r   r>   )rZ   r   r[   r   rq   ztype[StaticCache]r   rA   )....)rY   r   rZ   r   r[   r   r8   r   r\   r   r   r   )rY   r   rZ   r   r[   r   r8   r   r\   r   r   r   )rY   r   rZ   r   r[   r   r8   r   r\   r>   r   ztuple[Tensor, Cache])rY   r   rZ   r   r[   r   r8   r   r\   rA   r   ztuple[Tensor, StaticCache])rY   r   rZ   r   r[   r   r8   r   r\   r>   r   ztuple[Tensor, Tensor, Cache])rY   r   rZ   r   r[   r   r8   r   r\   rA   r   z"tuple[Tensor, Tensor, StaticCache])NNNN)__name__
__module____qualname____firstlineno____doc__collections
namedtupler>   rA   __annotations__rN   r   r]   rj   rr   r|   __static_attributes____classcell__rU   s   @r   r<   r<      s2   )V ""7S#J7E((c
CKN
I
INMN ",0*.+
+
 +
 	+

 +
 +
 +
 *+
 (+
 
+
 +
Z  ,, , 	,
 , 
', ,  &)AA A 	A
 #A 
<A A5@n@ KN"/=H	    #"%	   	
 
  $(e <*|  !"  	
   
   !"$$ $ 	$
 $ $ 
$ $  !"## # 	#
 # # 
# #  !" )) ) 	)
 ) ) 
$) )  !"++ + 	+
 + + 
&+ +  !" 11 1 	1
 1 1 
,1 1[6 [6r   r<   c                     ^  \ rS rSr% SrS\S'   S\S'           S                       SU 4S jjjr\  S       SS jj5       r\  S       SS	 jj5       rSS
 jrSS jr	Sr
U =r$ )TransformerEncoderLayeri6  a  
TransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
attention and feedforward network. Before and after each sub-layer, pre-process
and post-process would be applied on the input and output accordingly. If
`normalize_before` is True, pre-process is layer normalization and post-process
includes dropout, residual connection. Otherwise, no pre-process and post-process
includes dropout, residual connection, layer normalization.

Parameters:
    d_model (int): The expected feature size in the input and output.
    nhead (int): The number of heads in multi-head attention(MHA).
    dim_feedforward (int): The hidden layer size in the feedforward network(FFN).
    dropout (float, optional): The dropout probability used in pre-process
        and post-process of MHA and FFN sub-layer. Default 0.1
    activation (str, optional): The activation function in the feedforward
        network. Default relu.
    attn_dropout (float, optional): The dropout probability used
        in MHA to drop some attention target. If None, use the value of
        `dropout`. Default None
    act_dropout (float, optional): The dropout probability used after FFN
        activation.  If None, use the value of `dropout`. Default None
    normalize_before (bool, optional): Indicate whether to put layer normalization
        into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
        normalization and post-process includes dropout, residual connection.
        Otherwise, no pre-process and post-process includes dropout, residual
        connection, layer normalization. Default False
    weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property.
        If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for
        MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN.
        Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.
        Default: None, which means the default weight parameter property is used.
        See usage for details in :code:`ParamAttr` .
    bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property.
        If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for
        MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.
        Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.
        The `False` value means the corresponding layer would not have trainable
        bias parameter. See usage for details in :code:`ParamAttr` . Default: None,
        which means the default bias parameter property is used.
    layer_norm_eps (float, optional): the eps value in layer normalization components. Default=1e-5.


Examples:

    .. code-block:: pycon

        >>> import paddle
        >>> from paddle.nn import TransformerEncoderLayer

        >>> # encoder input: [batch_size, src_len, d_model]
        >>> enc_input = paddle.rand((2, 4, 128))
        >>> # self attention mask: [batch_size, n_head, src_len, src_len]
        >>> attn_mask = paddle.rand((2, 2, 4, 4))
        >>> encoder_layer = TransformerEncoderLayer(128, 2, 512)
        >>> enc_output = encoder_layer(enc_input, attn_mask)
        >>> print(enc_output.shape)
        paddle.Size([2, 4, 128])
r   
activationr'   normalize_beforec                  > [        5       U l        U R                  R                  S5        U R                  R                  SS 5        [        TU ]  5         US:  d
   SU 35       eUS:  d
   SU 35       eUS:  d
   SU 35       eUc  UOUnUc  UOUnXl        [        U	S5      n[        U
S5      n[        UUUUS   US   S9U l        [        XUS	   US	   S
9U l
        [        USS9U l        [        X1US	   US	   S
9U l        [        X5      U l        [        X5      U l        [        USS9U l        [        USS9U l        ['        [(        U5      U l        g )NrS   rU   r   4Expected d_model to be greater than 0, but received 2Expected nhead to be greater than 0, but received <Expected dim_feedforward to be greater than 0, but received r   rH   rT   rL   r   rK   r   r   )locals_configpoprM   rN   r   r   r<   	self_attnr   linear1r   rH   linear2r   norm1norm2dropout1dropout2getattrr   r   rS   d_modelnheaddim_feedforwardrH   r   attn_dropoutact_dropoutr   rT   rL   layer_norm_epsweight_attrs
bias_attrsrU   s                 r   rN    TransformerEncoderLayer.__init__u  s    x d+{ 	
B7)L	
{ qy 	
@H	
y " 	
+,.	
"
 #/"6wL!,!4g+ 02;B0A>
+ $Q m
 l1oA
 {1CDl1oA
 w7
w7
.@A.@A!!Z0r   c                    g r   r   rS   srcsrc_maskr\   s       r   r|   TransformerEncoderLayer.forward       r   c                    g r   r   r   s       r   r|   r     s     36r   c           	     t   [        X!R                  5      nUnU R                  (       a  U R                  U5      nUc  U R	                  XX5      nOU R	                  XXU5      u  pX@R                  U5      -   nU R                  (       d  U R                  U5      nUnU R                  (       a  U R                  U5      nU R                  U R                  U R                  U R                  U5      5      5      5      nX@R                  U5      -   nU R                  (       d  U R                  U5      nUc  U$ UW4$ )a  
Applies a Transformer encoder layer on the input.

Parameters:
    src (Tensor): The input of Transformer encoder layer. It is
        a tensor with shape `[batch_size, sequence_length, d_model]`.
        The data type should be float32 or float64.
    src_mask (Tensor|None, optional): A tensor used in multi-head attention
        to prevents attention to some unwanted positions, usually the
        paddings or the subsequent positions. It is a tensor with shape
        broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
        When the data type is bool, the unwanted positions have `False`
        values and the others have `True` values. When the data type is
        int, the unwanted positions have 0 values and the others have 1
        values. When the data type is float, the unwanted positions have
        `-INF` values and the others have 0 values. It can be None when
        nothing wanted or needed to be prevented attention to. Default None.
    cache (MultiHeadAttention.Cache, optional): It is an instance of `MultiHeadAttention.Cache`.
        See `TransformerEncoderLayer.gen_cache` for more details. It is
        only used for inference and should be None for training. Default
        None.

Returns:
    Tensor|tuple: It is a tensor that has the same shape and data type \
        as `enc_input`, representing the output of Transformer encoder \
        layer. Or a tuple if `cache` is not None, except for encoder \
        layer output, the tuple includes the new cache which is same \
        as input `cache` argument but `incremental_cache` has an \
        incremental length. See `MultiHeadAttention.gen_cache` and \
        `MultiHeadAttention.forward` for more details.
)r:   r5   r   r   r   r   r   r   rH   r   r   r   )rS   r   r   r\   residualincremental_caches         r   r|   r     s   @ +8YY?  **S/C=..39C%)^^#&"C s++$$**S/C  **S/Cll4<<S8I(JKLs++$$**S/CmsA#/@)AAr   c                `    U R                   R                  XR                   R                  S9nU$ )ab  
Generates cache for `forward` usage. The generated cache is an
instance of `MultiHeadAttention.Cache`.

Parameters:
    src (Tensor): The input of Transformer encoder. It is a tensor
        with shape `[batch_size, source_length, d_model]`. The data
        type should be float32 or float64.

Returns:
    incremental_cache: It is an instance of `MultiHeadAttention.Cache` \
        produced by `self_attn.gen_cache`, it reserves two tensors
        shaped `[batch_size, nhead, 0, d_model // nhead]`. See \
        `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
        for more details.
rq   )r   rr   r>   )rS   r   r   s      r   rr   !TransformerEncoderLayer.gen_cache  s3    " !NN44nn** 5 
 ! r   )r   r   rH   r   r   r   r   r   r   r   r   皙?reluNNFNNgh㈵>r   r4   r   r4   r   r4   rH   rG   r   r.   r   float | Noner   r   r   r'   rT   .ParamAttrLike | Sequence[ParamAttrLike] | NonerL   r   r   rG   r   r   r   r   r   r   r   r\   r   r   r   )r   r   r   r   r\   MultiHeadAttention.Cacher   z'tuple[Tensor, MultiHeadAttention.Cache]NN)r   r   r   r   r   r   r   r   r   r   rN   r   r|   rr   r   r   r   s   @r   r   r   6  sI   9v   %)$(!&FJDH $8181 81 	81
 81 81 #81 "81 81 D81 B81 81 
81 81t  #&	   	
 
   #&*-	66  6 (	6
 
16 68Bt! !r   r   c                     ^  \ rS rSr% SrS\S'   S\S'    S       SU 4S jjjr\  S       SS jj5       r\  S       SS	 jj5       rSS
 jrSS jr	Sr
U =r$ )TransformerEncoderi  a  
TransformerEncoder is a stack of N encoder layers.

Parameters:
    encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It
        would be used as the first layer, and the other layers would be created
        according to the configurations of it.
    num_layers (int): The number of encoder layers to be stacked.
    norm (LayerNorm|None, optional): the layer normalization component. If provided,
        apply layer normalization on the output of last encoder layer.

Examples:

    .. code-block:: pycon

        >>> import paddle
        >>> from paddle.nn import (
        ...     TransformerEncoderLayer,
        ...     TransformerEncoder,
        ... )

        >>> # encoder input: [batch_size, src_len, d_model]
        >>> enc_input = paddle.rand((2, 4, 128))
        >>> # self attention mask: [batch_size, n_head, src_len, src_len]
        >>> attn_mask = paddle.rand((2, 2, 4, 4))
        >>> encoder_layer = TransformerEncoderLayer(128, 2, 512)
        >>> encoder = TransformerEncoder(encoder_layer, 2)
        >>> enc_output = encoder(enc_input, attn_mask)
        >>> print(enc_output.shape)
        paddle.Size([2, 4, 128])
r4   
num_layersLayerNorm | Nonenormc           
        > [         TU ]  5         [        [        U5       Vs/ s H&  nUS:X  a  UO[	        U5      " S0 UR
                  D6PM(     sn5      U l        X l        X0l        g s  snf Nr   r   	rM   rN   r   r*   rq   r   layersr   r   )rS   encoder_layerr   r   r1   rU   s        r   rN   TransformerEncoder.__init__4  x     	 z* +A Av "m,E}/D/DEF +	
 %	   -A(c                    g r   r   r   s       r   r|   TransformerEncoder.forwardH  r   r   c                    g r   r   r   s       r   r|   r   P  s     9<r   c                
   [        X!R                  5      nUn/ n[        U R                  5       H,  u  pgUc  U" XBS9nM  U" XBX6   S9u  pHUR	                  U5        M.     U R
                  b  U R                  U5      nUc  U$ XE4$ )a<  
Applies a stack of N Transformer encoder layers on inputs. If `norm` is
provided, also applies layer normalization on the output of last encoder
layer.

Parameters:
    src (Tensor): The input of Transformer encoder. It is a tensor
        with shape `[batch_size, sequence_length, d_model]`. The data
        type should be float32 or float64.
    src_mask (Tensor, optional): A tensor used in multi-head attention
        to prevents attention to some unwanted positions, usually the
        paddings or the subsequent positions. It is a tensor with shape
        broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
        When the data type is bool, the unwanted positions have `False`
        values and the others have `True` values. When the data type is
        int, the unwanted positions have 0 values and the others have 1
        values. When the data type is float, the unwanted positions have
        `-INF` values and the others have 0 values. It can be None when
        nothing wanted or needed to be prevented attention to. Default None.
    cache (list, optional): It is a list, and each element in the list
        is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`.
        See `TransformerEncoder.gen_cache` for more details. It is only
        used for inference and should be None for training. Default None.

Returns:
    Tensor|tuple: It is a tensor that has the same shape and data type \
        as `src`, representing the output of Transformer encoder. \
        Or a tuple if `cache` is not None, except for encoder output, \
        the tuple includes the new cache which is same as input `cache` \
        argument but `incremental_cache` in it has an incremental length. \
        See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
        for more details.
r   )r   r\   r:   r5   	enumerater   r(   r   )	rS   r   r   r\   output
new_cachesr1   mod	new_caches	            r   r|   r   X  s    D +8YY?
,FA}V7$'UX%! !!), - 99 YYv&Fv@V,@@r   c                d    U R                    Vs/ s H  o"R                  U5      PM     nnU$ s  snf )af  
Generates cache for `forward` usage. The generated cache is a list, and
each element in it is `incremental_cache` produced by
`TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache`
for more details.

Parameters:
    src (Tensor): The input of Transformer encoder. It is a tensor
        with shape `[batch_size, source_length, d_model]`. The data type
        should be float32 or float64.

Returns:
    list: It is a list, and each element in the list is `incremental_cache`
    produced by `TransformerEncoderLayer.gen_cache`. See
    `TransformerEncoderLayer.gen_cache` for more details.
)r   rr   )rS   r   layerr\   s       r   rr   TransformerEncoder.gen_cache  s.    " 48;;?;%%;? @s   -r   r   r   r   )r   r   r   r4   r   r   r   r   r   r   )N.)r   r   r   r   r\   list[MultiHeadAttention.Cache]r   z-tuple[Tensor, list[MultiHeadAttention.Cache]]r   )r   r   r   r   r   r   s   @r   r   r     s    @ O
 "&	.  	
 
 (  #&	   	
 
   #'03	<<  < .	<
 
7< <2Ah r   r   c                    ^  \ rS rSr% SrS\S'   S\S'           S                       SU 4S jjjr\   S           SS jj5       r\   S           SS	 jj5       rSS
 jr    SS jr	Sr
U =r$ )TransformerDecoderLayeri  a#  
TransformerDecoderLayer is composed of three sub-layers which are decoder
self (multi-head) attention, decoder-encoder cross attention and feedforward
network. Before and after each sub-layer, pre-process and post-process would
be applied on the input and output accordingly. If `normalize_before` is True,
pre-process is layer normalization and post-process includes dropout, residual
connection. Otherwise, no pre-process and post-process includes dropout, residual
connection, layer normalization.

Parameters:
    d_model (int): The expected feature size in the input and output.
    nhead (int): The number of heads in multi-head attention(MHA).
    dim_feedforward (int): The hidden layer size in the feedforward network(FFN).
    dropout (float, optional): The dropout probability used in pre-process
        and post-process of MHA and FFN sub-layer. Default 0.1
    activation (str, optional): The activation function in the feedforward
        network. Default relu.
    attn_dropout (float, optional): The dropout probability used
        in MHA to drop some attention target. If None, use the value of
        `dropout`. Default None
    act_dropout (float, optional): The dropout probability used after FFN
        activation.  If None, use the value of `dropout`. Default None
    normalize_before (bool, optional): Indicate whether to put layer normalization
        into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
        normalization and post-process includes dropout, residual connection.
        Otherwise, no pre-process and post-process includes dropout, residual
        connection, layer normalization. Default False
    weight_attr (ParamAttr|list|tuple|None, optional): To specify the weight parameter property.
        If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for
        self attention, `weight_attr[1]` would be used as `weight_attr` for
        cross attention, and `weight_attr[2]` would be used as `weight_attr`
        for linear in FFN. Otherwise, the three sub-layers all uses it as
        `weight_attr` to create parameters. Default: None, which means the
        default weight parameter property is used. See usage for details
        in :ref:`api_paddle_base_param_attr_ParamAttr` .
    bias_attr (ParamAttr|list|tuple|bool|None, optional): To specify the bias parameter property.
        If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for
        self attention, `bias_attr[1]` would be used as `bias_attr` for
        cross attention, and `bias_attr[2]` would be used as `bias_attr`
        for linear in FFN. Otherwise, the three sub-layers all uses it as
        `bias_attr` to create parameters. The `False` value means the
        corresponding layer would not have trainable bias parameter. See
        usage for details in :code:`ParamAttr` . Default: None,which means
        the default bias parameter property is used.
    layer_norm_eps (float, optional): the eps value in layer normalization components. Default=1e-5.

Examples:

    .. code-block:: pycon

        >>> import paddle
        >>> from paddle.nn import TransformerDecoderLayer

        >>> # decoder input: [batch_size, tgt_len, d_model]
        >>> dec_input = paddle.rand((2, 4, 128))
        >>> # encoder output: [batch_size, src_len, d_model]
        >>> enc_output = paddle.rand((2, 6, 128))
        >>> # self attention mask: [batch_size, n_head, tgt_len, tgt_len]
        >>> self_attn_mask = paddle.rand((2, 2, 4, 4))
        >>> # cross attention mask: [batch_size, n_head, tgt_len, src_len]
        >>> cross_attn_mask = paddle.rand((2, 2, 4, 6))
        >>> decoder_layer = TransformerDecoderLayer(128, 2, 512)
        >>> output = decoder_layer(dec_input, enc_output, self_attn_mask, cross_attn_mask)
        >>> print(output.shape)
        paddle.Size([2, 4, 128])
r'   r   r   r   c                   > [        5       U l        U R                  R                  S5        U R                  R                  SS 5        [        TU ]  5         US:  d
   SU 35       eUS:  d
   SU 35       eUS:  d
   SU 35       eUc  UOUnUc  UOUnXl        [        U	S5      n[        U
S5      n[        UUUUS   US   S9U l        [        UUUUS	   US	   S9U l	        [        XUS
   US
   S9U l        [        USS9U l        [        X1US
   US
   S9U l        [        X5      U l        [        X5      U l        [        X5      U l        [        USS9U l        [        USS9U l        [        USS9U l        [-        [.        U5      U l        g )NrS   rU   r   r   r   r   r   r   r   r   rK   r   r   )r   r   r   rM   rN   r   r   r<   r   
cross_attnr   r   r   rH   r   r   r   r   norm3r   r   dropout3r   r   r   r   s                 r   rN    TransformerDecoderLayer.__init__  s    x d+{ 	
B7)L	
{ qy 	
@H	
y " 	
+,.	
"
 #/"6wL!,!4g+ 02;B0A>
+ $Q m
 - $Q m
 l1oA
 {1CDl1oA
 w7
w7
w7
.@A.@A.@A!!Z0r   c                    g r   r   rS   tgtmemorytgt_maskmemory_maskr\   s         r   r|   TransformerDecoderLayer.forward+  r~   r   c                    g r   r   r   s         r   r|   r  5  s     r   c           	        [        X1R                  5      n[        XBR                  5      nUnU R                  (       a  U R                  U5      nUc  U R	                  XXS5      nOU R	                  XXUS   5      u  pX`R                  U5      -   nU R                  (       d  U R                  U5      nUnU R                  (       a  U R                  U5      nUc  U R                  XX$S5      nOU R                  XX$US   5      u  pX`R                  U5      -   nU R                  (       d  U R                  U5      nUnU R                  (       a  U R                  U5      nU R                  U R                  U R                  U R                  U5      5      5      5      nX`R                  U5      -   nU R                  (       d  U R                  U5      nUc  U$ UWW44$ )a
  
Applies a Transformer decoder layer on the input.

Parameters:
    tgt (Tensor): The input of Transformer decoder layer. It is a tensor
        with shape `[batch_size, target_length, d_model]`. The data type
        should be float32 or float64.
    memory (Tensor): The output of Transformer encoder. It is a tensor
        with shape `[batch_size, source_length, d_model]`. The data type
        should be float32 or float64.
    tgt_mask (Tensor, optional): A tensor used in self attention
        to prevents attention to some unwanted positions, usually the
        the subsequent positions. It is a tensor with shape broadcasted
        to `[batch_size, n_head, target_length, target_length]`.
        When the data type is bool, the unwanted positions have `False`
        values and the others have `True` values. When the data type is
        int, the unwanted positions have 0 values and the others have 1
        values. When the data type is float, the unwanted positions have
        `-INF` values and the others have 0 values. It can be None when
        nothing wanted or needed to be prevented attention to. Default None.
    memory_mask (Tensor, optional): A tensor used in decoder-encoder
        cross attention to prevents attention to some unwanted positions,
        usually the paddings. It is a tensor with shape broadcasted to
        `[batch_size, n_head, target_length, source_length]`. When the
        data type is bool, the unwanted positions have `False` values
        and the others have `True` values. When the data type is int,
        the unwanted positions have 0 values and the others have 1
        values. When the data type is float, the unwanted positions have
        `-INF` values and the others have 0 values. It can be None when
        nothing wanted or needed to be prevented attention to. Default None.
    cache (tuple, optional): It is a tuple( :code:`(incremental_cache, static_cache)` ),
        `incremental_cache` is an instance of `MultiHeadAttention.Cache`,
        `static_cache` is an instance of `MultiHeadAttention.StaticCache.
        See `TransformerDecoderLayer.gen_cache` for more details. It is
        only used for inference and should be None for training. Default
        None.

Returns:
    Tensor|tuple: It is a tensor that has the same shape and data type \
        as `tgt`, representing the output of Transformer decoder layer. \
        Or a tuple if `cache` is not None, except for decoder layer output, \
        the tuple includes the new cache which is same as input `cache` \
        argument but `incremental_cache` in it has an incremental length. \
        See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
        for more details.
Nr   r   )r:   r5   r   r   r   r   r   r   r   r   r   rH   r   r   r   )	rS   r   r  r  r  r\   r   r   static_caches	            r   r|   r  C  s   ^ +8YY?-k<<H  **S/C=..3$?C%)^^#q&"C s++$$**S/C  **S/C=//#vDIC $V%(!C s++$$**S/C  **S/Cll4<<S8I(JKLs++$$**S/C=C	
'*->,M&N	
r   c                    U R                   R                  XR                   R                  S9nU R                  R                  XU R                  R                  S9nX#4$ )a  
Generates cache for `forward` usage. The generated cache is a tuple
composed of an instance of `MultiHeadAttention.Cache` and an instance
of `MultiHeadAttention.StaticCache`.

Parameters:
    memory (Tensor): The output of Transformer encoder. It is a tensor
        with shape `[batch_size, source_length, d_model]`. The data type
        should be float32 or float64.

Returns:
    tuple: It is a tuple( :code:`(incremental_cache, static_cache)` ). \
        `incremental_cache` is an instance of `MultiHeadAttention.Cache` \
        produced by `self_attn.gen_cache(memory, MultiHeadAttention.Cache)`, \
        it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. \
        `static_cache` is an instance of `MultiHeadAttention.StaticCache` \
        produced by `cross_attn.gen_cache(memory, MultiHeadAttention.StaticCache)`, \
        it reserves two tensors shaped `[batch_size, nhead, source_length, d_model // nhead]`.
        See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
        for more details.
r   )r   rr   r>   r   rA   )rS   r  r   r  s       r   rr   !TransformerDecoderLayer.gen_cache  s`    0 !NN44-- 5 
 00!<!< 1 
 !..r   )r   r   r   rH   r   r   r   r   r   r   r   r   r   r   r   r   ...r   r   r  r   r  r   r  r   r\   r   r   r   )r   r   r  r   r  r   r  r   r\   ?tuple[MultiHeadAttention.Cache, MultiHeadAttention.StaticCache]r   zNtuple[Tensor, tuple[MultiHeadAttention.Cache, MultiHeadAttention.StaticCache]]NNN)r  r   r   r  r   r   s   @r   r   r     s   AF   %)$(!&FJDH $A1A1 A1 	A1
 A1 A1 #A1 "A1 A1 DA1 BA1 A1 
A1 A1F 
 #&%(   	
 #  
  
 #&%(    	
 #

 U
n//	H/ /r   r   c                  B  ^  \ rS rSr% SrS\S'   S\S'    S       SU 4S jjjr\   S           SS jj5       r\   S           SS	 jj5       rSS
 jr\ S     SS jj5       r	\ S     SS jj5       r	\ S     SS jj5       r	SS jr	Sr
U =r$ )TransformerDecoderi  a  
TransformerDecoder is a stack of N decoder layers.

Parameters:
    decoder_layer (Layer): an instance of the `TransformerDecoderLayer`. It
        would be used as the first layer, and the other layers would be created
        according to the configurations of it.
    num_layers (int): The number of decoder layers to be stacked.
    norm (LayerNorm|None, optional): the layer normalization component. If provided,
        apply layer normalization on the output of last encoder layer.

Examples:

    .. code-block:: pycon

        >>> import paddle
        >>> from paddle.nn import (
        ...     TransformerDecoderLayer,
        ...     TransformerDecoder,
        ... )

        >>> # decoder input: [batch_size, tgt_len, d_model]
        >>> dec_input = paddle.rand((2, 4, 128))
        >>> # encoder output: [batch_size, src_len, d_model]
        >>> enc_output = paddle.rand((2, 6, 128))
        >>> # self attention mask: [batch_size, n_head, tgt_len, tgt_len]
        >>> self_attn_mask = paddle.rand((2, 2, 4, 4))
        >>> # cross attention mask: [batch_size, n_head, tgt_len, src_len]
        >>> cross_attn_mask = paddle.rand((2, 2, 4, 6))
        >>> decoder_layer = TransformerDecoderLayer(128, 2, 512)
        >>> decoder = TransformerDecoder(decoder_layer, 2)
        >>> output = decoder(dec_input, enc_output, self_attn_mask, cross_attn_mask)
        >>> print(output.shape)
        paddle.Size([2, 4, 128])
r4   r   r   r   c           
        > [         TU ]  5         [        [        U5       Vs/ s H&  nUS:X  a  UO[	        U5      " S0 UR
                  D6PM(     sn5      U l        X l        X0l        g s  snf r   r   )rS   decoder_layerr   r   r1   rU   s        r   rN   TransformerDecoder.__init__  r   r   c                    g r   r   r   s         r   r|   TransformerDecoder.forward  r~   r   c                    g r   r   r   s         r   r|   r    s     r   c           	     B   [        X1R                  5      n[        XBR                  5      nUn/ n[        U R                  5       H3  u  pUc  U	" UUUUSS9nM  U	" UUUUXX   S9u  pjUR	                  U
5        M5     U R
                  b  U R                  U5      nUc  U$ Xg4$ )a
  
Applies a stack of N Transformer decoder layers on inputs. If `norm` is
provided, also applies layer normalization on the output of last decoder
layer.

Parameters:
    tgt (Tensor): The input of Transformer decoder. It is a tensor
        with shape `[batch_size, target_length, d_model]`. The data type
        should be float32 or float64.
    memory (Tensor): The output of Transformer encoder. It is a tensor
        with shape `[batch_size, source_length, d_model]`. The data type
        should be float32 or float64.
    tgt_mask (Tensor|None, optional): A tensor used in self attention
        to prevents attention to some unwanted positions, usually the
        the subsequent positions. It is a tensor with shape broadcasted
        to `[batch_size, n_head, target_length, target_length]`. When
        the data type is bool, the unwanted positions have `False`
        values and the others have `True` values. When the data type is
        int, the unwanted positions have 0 values and the others have 1
        values. When the data type is float, the unwanted positions have
        `-INF` values and the others have 0 values. It can be None when
        nothing wanted or needed to be prevented attention to. Default None.
    memory_mask (Tensor|None, optional): A tensor used in decoder-encoder
        cross attention to prevents attention to some unwanted positions,
        usually the paddings. It is a tensor with shape broadcasted to
        `[batch_size, n_head, target_length, source_length]`. When the
        data type is bool, the unwanted positions have `False` values
        and the others have `True` values. When the data type is int,
        the unwanted positions have 0 values and the others have 1
        values. When the data type is float, the unwanted positions have
        `-INF` values and the others have 0 values. It can be None when
        nothing wanted or needed to be prevented attention to. Default None.
    cache (list|tuple, optional): It is a list, and each element in the list
        is a tuple( :code:`(incremental_cache, static_cache)` ). See
        `TransformerDecoder.gen_cache` for more details. It is only
        used for inference and should be None for training. Default None.

Returns:
    Tensor|tuple: It is a tensor that has the same shape and data type \
        as `tgt`, representing the output of Transformer decoder. \
        Or a tuple if `cache` is not None, except for decoder output, \
        the tuple includes the new cache which is same as input `cache` \
        argument but `incremental_cache` in it has an incremental length. \
        See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
        for more details.
N)r  r  r\   r   )rS   r   r  r  r  r\   r   r   r1   r   r   s              r   r|   r    s    ^ +8YY?-k<<H
,FA}% + %(% +(%! !!),# -& 99 YYv&Fv@V,@@r   c                    g r   r   rS   r  do_zips      r   rr   TransformerDecoder.gen_cache\       r   c                    g r   r   r  s      r   rr   r  g  s     r   c                    g r   r   r  s      r   rr   r  o  r  r   c                    U R                    Vs/ s H  o3R                  U5      PM     nnU(       a  [        [        U6 5      nU$ s  snf )a  
Generates cache for `forward` usage. The generated cache is a list, and
each element in it is a tuple( :code:`(incremental_cache, static_cache)` )
produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache`
for more details. If `do_zip` is True, apply `zip` on these tuples to get
a list with two elements.


Parameters:
    memory (Tensor): The output of Transformer encoder. It is a tensor
        with shape `[batch_size, source_length, d_model]`. The data type
        should be float32 or float64.
    do_zip (bool, optional): Indicate whether to apply `zip` on the tuples.
        If True, return a list with two elements. Default False

Returns:
    list: It is a list, and each element in the list is a tuple produced \
        by `TransformerDecoderLayer.gen_cache(memory)`. See `TransformerDecoderLayer.gen_cache` \
        for more details. If `do_zip` is True, apply `zip` on these tuples \
        and return a list with two elements.
)r   rr   r$   zip)rS   r  r  r   r\   s        r   rr   r  z  s?    , 7;kkBkU(kBe%E Cs   Ar   r   )r  r   r   r4   r   r   r   r   r
  r  )r   r   r  r   r  r   r  r   r\   zISequence[tuple[MultiHeadAttention.Cache, MultiHeadAttention.StaticCache]]r   zTtuple[Tensor, list[tuple[MultiHeadAttention.Cache, MultiHeadAttention.StaticCache]]]r  r   )r  r   r  zLiteral[False]r   list[tuple[MultiHeadAttention.Cache, MultiHeadAttention.StaticCache]] | list[tuple[MultiHeadAttention.Cache, ...] | tuple[MultiHeadAttention.StaticCache, ...]])r  r   r  zLiteral[True]r   zWlist[tuple[MultiHeadAttention.Cache, ...] | tuple[MultiHeadAttention.StaticCache, ...]])r  r   r  r'   r   r   )Fr   r   s   @r   r  r    s   "H O
 "&	.  	
 
 ( 
 #&%(   	
 #  
  
 #&%(    	
 #

 JAX 7:&4	
  69&3
  -0&*	
  r   r  c                     ^  \ rS rSr% SrS\S'   S\S'   S\S'   S\S'                 S                             SU 4S	 jjjr   S           SS
 jjrSS jrSr	U =r
$ )Transformeri  a  
A Transformer model composed of an instance of `TransformerEncoder` and an
instance of `TransformerDecoder`. While the embedding layer and output layer
are not included.

Please refer to `Attention is all you need <http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf>`_ ,
and see `TransformerEncoder` and `TransformerDecoder` for more details.

Users can configure the model architecture with corresponding parameters.
Note the usage of `normalize_before` representing where to apply layer
normalization (in pre-process or post-process of multi-head attention or FFN),
and some transformer like models are different on this, such as
`BERT <https://arxiv.org/abs/1810.04805>`_ and `GPT2 <https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf>`_ .
The default architecture here places layer normalization in post-process and
applies another layer normalization on the output of last encoder/decoder layer.

Parameters:
    d_model (int, optional): The expected feature size in the encoder/decoder input
        and output. Default 512
    nhead (int, optional): The number of heads in multi-head attention(MHA). Default 8
    num_encoder_layers (int, optional): The number of layers in encoder. Default 6
    num_decoder_layers (int, optional): The number of layers in decoder. Default 6
    dim_feedforward (int, optional): The hidden layer size in the feedforward network(FFN). Default 2048
    dropout (float, optional): The dropout probability used in pre-process
        and post-process of MHA and FFN sub-layer. Default 0.1
    activation (str, optional): The activation function in the feedforward
        network. Default relu.
    attn_dropout (float, optional): The dropout probability used
        in MHA to drop some attention target. If None, use the value of
        `dropout`. Default None
    act_dropout (float, optional): The dropout probability used after FFN
        activation.  If None, use the value of `dropout`. Default None
    normalize_before (bool, optional): Indicate whether to put layer normalization
        into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
        normalization and post-process includes dropout, residual connection.
        Otherwise, no pre-process and post-process includes dropout, residual
        connection, layer normalization. Default False
    weight_attr(ParamAttr|list|tuple|None, optional): To specify the weight parameter property.
        If it is a list/tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3,
        `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]`
        would be used as `weight_attr` for cross attention of `TransformerDecoder`,
        and `weight_attr[2]` would be used as `weight_attr` for linear in FFN.
        If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention
        and cross attention and `weight_attr[1]` would be used as `weight_attr` for
        linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr`
        for self attention, cross attention and linear in FFN. Otherwise,
        the three sub-layers all uses it as `weight_attr` to create parameters.
        Default: None, which means the default weight parameter property is used.
        See usage for details
        in :code:`ParamAttr` .
    bias_attr (ParamAttr|list|tuple|bool|None, optional): To specify the bias parameter property.
        If it is a list/tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3,
        `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]`
        would be used as `bias_attr` for cross attention of `TransformerDecoder`,
        and `bias_attr[2]` would be used as `bias_attr` for linear in FFN.
        If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention
        and cross attention and `bias_attr[1]` would be used as `bias_attr` for
        linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr`
        for self attention, cross attention and linear in FFN. Otherwise,
        the three sub-layers all uses it as `bias_attr` to create parameters.
        The `False` value means the corresponding layer would not have trainable
        bias parameter. See usage for details in :code:`ParamAttr` .
        Default: None,which means the default bias parameter property is used.
    custom_encoder (Layer|None, optional): If custom encoder is provided, use it as the encoder.
        Default None
    custom_decoder (Layer|None, optional): If custom decoder is provided, use it as the decoder.
        Default None

Examples:

    .. code-block:: pycon

        >>> import paddle
        >>> from paddle.nn import Transformer

        >>> # src: [batch_size, tgt_len, d_model]
        >>> enc_input = paddle.rand((2, 4, 128))
        >>> # tgt: [batch_size, src_len, d_model]
        >>> dec_input = paddle.rand((2, 6, 128))
        >>> # src_mask: [batch_size, n_head, src_len, src_len]
        >>> enc_self_attn_mask = paddle.rand((2, 2, 4, 4))
        >>> # tgt_mask: [batch_size, n_head, tgt_len, tgt_len]
        >>> dec_self_attn_mask = paddle.rand((2, 2, 6, 6))
        >>> # memory_mask: [batch_size, n_head, tgt_len, src_len]
        >>> cross_attn_mask = paddle.rand((2, 2, 6, 4))
        >>> transformer = Transformer(128, 2, 4, 4, 512)
        >>> output = transformer(
        ...     enc_input,
        ...     dec_input,
        ...     enc_self_attn_mask,
        ...     dec_self_attn_mask,
        ...     cross_attn_mask,
        ... )
        >>> print(output.shape)
        paddle.Size([2, 6, 128])
r   encoderdecoderr4   r   r   c                  > [         TU ]  5         US:  d
   SU 35       eUS:  d
   SU 35       eUS:  d
   SU 35       e[        U[        [        45      (       ai  [        U5      S:X  a  US   /S-  nUS   /S-  nOK[        U5      S:X  a  UnUS   US   US   /nO+[        U5      S:X  a  US   US   /nUnO[        S	5      eUnUn[        U[        [        45      (       ai  [        U5      S:X  a  US   /S-  nUS   /S-  nOK[        U5      S:X  a  UnUS   US   US   /nO+[        U5      S:X  a  US   US   /nUnO[        S
5      eUnUnUb  Xl        O1[        UUUUUUU	U
UU5
      n[        U5      n[        UUU5      U l        Ub  Xl        O1[        UUUUUUU	U
UU5
      n[        U5      n[        UUU5      U l        Xl        X l        g )Nr   r   r   r   r   r   r   rv   zAlength of bias_attr should be 1 or 2 or 3 when it is a list/tuplezClength of weight_attr should be 1 or 2 or 3 when it is a list/tuple)rM   rN   r#   r$   r%   r&   AssertionErrorr#  r   r   r   r$  r   r  r   r   )rS   r   r   num_encoder_layersnum_decoder_layersr   rH   r   r   r   r   rT   rL   custom_encodercustom_decoderencoder_bias_attrdecoder_bias_attrencoder_weight_attrdecoder_weight_attrr   encoder_normr  decoder_normrU   s                          r   rN   Transformer.__init__  s   " 	{ 	
B7)L	
{ qy 	
@H	
y " 	
+,.	
"
 i$//9~"%.q\NQ$6!%.q\NQ$6!Y1$$-!%.q\9Q<2$O!Y1$%.q\9R=$A!$-!$W  !* )kD%=11;1$'21~&6&:#'21~&6&:#[!Q&&1#NNO'#
 [!Q&'21~{2&G#&1#$Y  #."-%)L3 #!M %W-L-1<DL %)L3 #!M %W-L-1<DL 
r   c                    [        X1R                  5      nU R                  XS9n[        XBR                  5      n[        XVR                  5      nU R                  X&XES9nU$ )a]  
Applies a Transformer model on the inputs.

Parameters:
    src (Tensor): The input of Transformer encoder. It is a tensor
        with shape `[batch_size, source_length, d_model]`. The data type
        should be float32 or float64.
    tgt (Tensor): The input of Transformer decoder. It is a tensor
        with shape `[batch_size, target_length, d_model]`. The data type
        should be float32 or float64.
    memory (Tensor): The output of Transformer encoder. It is a tensor
        with shape `[batch_size, source_length, d_model]`. The data type
        should be float32 or float64.
    src_mask (Tensor|None, optional): A tensor used in multi-head attention
        to prevents attention to some unwanted positions, usually the
        paddings or the subsequent positions. It is a tensor with shape
        broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
        When the data type is bool, the unwanted positions have `False`
        values and the others have `True` values. When the data type is
        int, the unwanted positions have 0 values and the others have 1
        values. When the data type is float, the unwanted positions have
        `-INF` values and the others have 0 values. It can be None when
        nothing wanted or needed to be prevented attention to. Default None.
    tgt_mask (Tensor|None, optional): A tensor used in self attention
        to prevents attention to some unwanted positions, usually the
        the subsequent positions. It is a tensor with shape broadcasted
        to `[batch_size, n_head, target_length, target_length]`. When
        the data type is bool, the unwanted positions have `False`
        values and the others have `True` values. When the data type is
        int, the unwanted positions have 0 values and the others have 1
        values. When the data type is float, the unwanted positions have
        `-INF` values and the others have 0 values. It can be None when
        nothing wanted or needed to be prevented attention to. Default None.
    memory_mask (Tensor|None, optional): A tensor used in decoder-encoder
        cross attention to prevents attention to some unwanted positions,
        usually the paddings. It is a tensor with shape broadcasted to
        `[batch_size, n_head, target_length, source_length]`. When the
        data type is bool, the unwanted positions have `False` values
        and the others have `True` values. When the data type is int,
        the unwanted positions have 0 values and the others have 1
        values. When the data type is float, the unwanted positions have
        `-INF` values and the others have 0 values. It can be None when
        nothing wanted or needed to be prevented attention to. Default None.

Returns:
    Tensor: It is a tensor that has the same shape and data type \
        as `tgt`, representing the output of Transformer decoder.
r   )r  r  )r:   r5   r#  r$  )rS   r   r   r   r  r  r  r   s           r   r|   Transformer.forwardn  s^    p +8YY?c5*8YY?-k<<H(  
 r   c                    [         R                  R                  [         R                  " X/[        R
                  * [         R                  " 5       S9S5      $ )a&  
Generate a square mask for the sequence. The mask ensures that the
predictions for position i can depend only on the known outputs at
positions less than i.

Parameters:
    length (int|Tensor): The length of sequence.

Returns:
    Tensor, generated square mask according to the given length. The shape is [length, length].

Examples:
    .. code-block:: python

        >>> import paddle
        >>> from paddle.nn.layer.transformer import Transformer
        >>> length = 5
        >>> d_model, n_head, dim_feedforward = 8, 4, 64
        >>> transformer_paddle = Transformer(
        ...     d_model, n_head, dim_feedforward=dim_feedforward)
        >>> mask = transformer_paddle.generate_square_subsequent_mask(length)
        >>> print(mask)
        Tensor(shape=[5, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[ 0.  , -inf., -inf., -inf., -inf.],
         [ 0.  ,  0.  , -inf., -inf., -inf.],
         [ 0.  ,  0.  ,  0.  , -inf., -inf.],
         [ 0.  ,  0.  ,  0.  ,  0.  , -inf.],
         [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ]])

)rc   
fill_valuer5   r   )r6   r	   triurx   npinfget_default_dtype)rS   lengths     r   generate_square_subsequent_mask+Transformer.generate_square_subsequent_mask  sG    > }}!!KK&FF7..0
 
 	
r   )r   r$  r#  r   )i         r>  i   r   r   NNFNNNN)r   r4   r   r4   r'  r4   r(  r4   r   r4   rH   rG   r   r.   r   r   r   r   r   r'   rT   r   rL   r   r)  Layer | Noner*  r?  r   r   r  )r   r   r   r   r   r   r  r   r  r   r   r   )r:  zint | Tensorr   r   )r   r   r   r   r   r   rN   r|   r;  r   r   r   s   @r   r"  r"    sZ   _B NNLJ "#"## %)$(!&FJDH'+'+oo o  	o
  o o o o #o "o o Do Bo %o %o  
!o oj #'"&%)@@ @  	@
  @ #@ 
@D&
 &
r   r"  )r   z)Sequence[Literal[False]] | Literal[False]r   r4   r   z
list[bool])r   z.Sequence[ParamAttrLike] | ParamAttrLike | Noner   r4   r   zlist[ParamAttr])r8   r   r5   r   r   r   )+
__future__r   r   r+   typingr   r   r   numpyr7  r6   paddle.base.data_feederr    r	   	frameworkr
   r   r   commonr   r   	containerr   r   r   r   r   collections.abcr   r   paddle._typingr   r   __all__r   r:   r<   r   r   r   r  r"  r   r   r   <module>rK     s   #   3 3   1  "  #    (7
 
9>A 

 
>
  
,^:o6 o6dW!e W!tN NbW/e W/tX Xv@
% @
r   