
    Αir                       S SK Jr  S SKrS SKJr  S SKJr  S SKJr  S SK	r	S SK	J
r
  S SKJr  SS	KJr  SS
KJr  \(       a  S SK	Jr  S SKJrJrJrJrJr  / SQrSS jr\" SS5      r " S S\
R6                  5      r " S S\
R6                  5      r " S S\
R6                  5      r " S S\
R>                  5      r " S S\
R6                  5      r  " S S\
R6                  5      r!\r"\r#\r$g)    )annotationsN)repeat)sqrt)TYPE_CHECKING)nn)ForbidKeywordsDecorator   )
functional)MultiheadAttention)Tensor)	DTypeLike	PlaceLikeSize1Size2Size3)
UnfoldLinearSoftmax	AvgPool1D	AvgPool2D	AvgPool3D	AvgPool1d	AvgPool2d	AvgPool3dr   c                    ^  U 4S jnXl         U$ )Nc                   > [        U [        R                  R                  5      (       a  [	        U 5      $ [	        [        U T5      5      $ N)
isinstancecollectionsabcIterabletupler   )xns    Y/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/compat/nn/__init__.pyparse_ntuple.<locals>.parse7   s4    a11228OVAq\""    )__name__)r$   namer&   s   `  r%   _ntupler+   6   s    #
 NLr(   _singlec                     ^  \ rS rSr% Sr/ SQrS\S'   S\S'   S\S'   S\S	'   S\S
'   \" SS1SSS9    S           SU 4S jjj5       rSS jr	SS jr
SrU =r$ )r   C   at
  
This operation applies a 1D average pooling over an input signal composed
of several input planes, based on the input, output_size, return_mask parameters.
Input(X) and output(Out) are in NCL format, where N is batch
size, C is the number of channels, L is the length of the feature.
The output tensor shape will be [N, C, output_size].

The output value of the layer with input size (N, C, L),
output (N, C, :math:`L_{out}`) and kernel_size ksize can be precisely described as
For average pool1d:

..  math::

    Output(N_i, C_i, l) = \frac{Input[N_i, C_i, stride \times l:stride \times l+k]}{ksize}

Parameters:
    kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
        it must contain an integer.
    stride(int|list|tuple|None, optional): The pool stride size. If pool stride size is a tuple or list,
        it must contain an integer. Default None, then stride will be equal to the kernel_size.
    padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms.
        1. A string in ['valid', 'same'].
        2. An int, which means the feature map is zero padded by size of `padding` on every sides.
        3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides.
        4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
        5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
        The default value is 0.
    ceil_mode(bool, optional): ${ceil_mode_comment}Whether to use the ceil function to calculate output height
        and width. If it is set to False, the floor function will be used. The default value is False.
    count_include_pad(bool, optional): Whether to include padding points in average pooling mode, default is `False`.

Shape:
    - x(Tensor): The input tensor of avg pool1d operator, which is a 3-D tensor.
      The data type can be float32, float64.
    - output(Tensor): The output tensor of avg pool1d  operator, which is a 3-D tensor.
      The data type is same as input x.

Returns:
    A callable object of AvgPool1D.

Examples:

    .. code-block:: pycon

        >>> import paddle
        >>> import paddle.compat.nn as nn

        >>> data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1)
        >>> AvgPool1D = nn.AvgPool1D(kernel_size=2, stride=2, padding=0)
        >>> pool_out = AvgPool1D(data)
        >>> print(pool_out.shape)
        paddle.Size([1, 3, 16])

)kernel_sizestridepadding	ceil_modecount_include_padr   r/   r0   r1   boolr2   r3   	exclusiver*   zpaddle.compat.nn.AvgPool1Dzpaddle.nn.AvgPool1Dillegal_keys	func_namecorrect_namec                   > [         TU ]  5         [        U5      U l        [        Ub  UOU5      U l        [        U5      U l        X@l        XPl        g r   )super__init__r,   r/   r0   r1   r2   r3   )selfr/   r0   r1   r2   r3   	__class__s         r%   r<   AvgPool1D.__init__   sG     	";/(:fLw'"!2r(   c                    [         R                  R                  UU R                  U R                  U R
                  U R                  (       + U R                  5      $ r   )r   r
   
avg_pool1dr/   r0   r1   r3   r2   r=   inputs     r%   forwardAvgPool1D.forward   sF    }}''KKLL&&&NN
 	
r(   c                T    SU R                    SU R                   SU R                   3$ Nzkernel_size=z	, stride=z
, padding=r/   r0   r1   r=   s    r%   
extra_reprAvgPool1D.extra_repr   -    d../yZPTP\P\~^^r(   )r2   r3   r/   r1   r0   )Nr   FT)r/   r   r0   zSize1 | Noner1   r   r2   r4   r3   r4   returnNonerC   r   rM   r   rM   strr)   
__module____qualname____firstlineno____doc____constants____annotations__r   r<   rD   rJ   __static_attributes____classcell__r>   s   @r%   r   r   C   s    5nM MNO!6*.*  $"&33 3 	3
 3  3 
3
3
_ _r(   r   c                     ^  \ rS rSr% Sr/ SQrS\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   \" 1 SkSSS9     S           SU 4S jjj5       rSS jr	SS jr
SrU =r$ )r      a@  
This operation applies 2D average pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCHW format, where N is batch size, C is the number of channels,
H is the height of the feature, and W is the width of the feature.

Example:
    Input:
        X shape: :math:`(N, C, :math:`H_{in}`, :math:`W_{in}`)`
    Attr:
        kernel_size: ksize

    Output:
        Out shape: :math:`(N, C, :math:`H_{out}`, :math:`W_{out}`)`

    ..  math::

        Output(N_i, C_j, h, w)  = \frac{\sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
            Input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)}{ksize[0] * ksize[1]}


Parameters:
    kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
        it must contain two integers, (pool_size_Height, pool_size_Width).
        Otherwise, the pool kernel size will be a square of an int.
    stride(int|list|tuple|None, optional): The pool stride size. If pool stride size is a tuple or list,
        it must contain two integers, (pool_stride_Height, pool_stride_Width).
        Otherwise, the pool stride size will be a square of an int.
        Default None, then stride will be equal to the kernel_size.
    padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms.
        1. A string in ['valid', 'same'].
        2. An int, which means the feature map is zero padded by size of `padding` on every sides.
        3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_weight] whose value means the padding size of each dimension.
        4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
        5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
        The default value is 0.
    ceil_mode(bool, optional): When True, will use `ceil` instead of `floor` to compute the output shape.
    count_include_pad(bool, optional): Whether to include padding points in average pooling
        mode, default is `False`.
    divisor_override(float, optional): If specified, it will be used as divisor, otherwise kernel_size will be
        used. Default None.

Shape:
    - x(Tensor): The input tensor of avg pool2d operator, which is a 4-D tensor.
      The data type can be float32, float64.
    - output(Tensor): The output tensor of avg pool2d  operator, which is a 4-D tensor.
      The data type is same as input x.

Returns:
    A callable object of AvgPool2D.

Examples:
    .. code-block:: pycon

        >>> import paddle
        >>> import paddle.compat.nn as nn

        >>> # max pool2d
        >>> input = paddle.uniform([1, 3, 32, 32], dtype="float32", min=-1, max=1)
        >>> AvgPool2D = nn.AvgPool2D(kernel_size=2, stride=2, padding=0)
        >>> output = AvgPool2D(input)
        >>> print(output.shape)
        paddle.Size([1, 3, 16, 16])

r/   r0   r1   r2   r3   divisor_overrider   r/   r0   r1   r4   r2   r3   
int | Noner_   >   r*   r5   data_formatzpaddle.compat.nn.AvgPool2Dzpaddle.nn.AvgPool2Dr6   c                v   > [         TU ]  5         Xl        Ub  UOUU l        X0l        X@l        XPl        X`l        g r   r;   r<   r/   r0   r1   r2   r3   r_   r=   r/   r0   r1   r2   r3   r_   r>   s          r%   r<   AvgPool2D.__init__   :     	&!'!3f+"!2 0r(   c           	         [         R                  R                  UU R                  U R                  U R
                  U R                  U R                  (       + U R                  5      $ r   )	r   r
   
avg_pool2dr/   r0   r1   r2   r3   r_   rB   s     r%   rD   AvgPool2D.forward  O    }}''KKLLNN&&&!!
 	
r(   c                T    SU R                    SU R                   SU R                   3$ rG   rH   rI   s    r%   rJ   AvgPool2D.extra_repr  rL   r(   r2   r3   r_   r/   r1   r0   Nr   FTN)r/   r   r0   zSize2 | Noner1   r   r2   r4   r3   r4   r_   r`   rO   rP   rR   r[   s   @r%   r   r      s    @DM MNO  9.*  $"&'+11 1 	1
 1  1 %1
1"	
_ _r(   r   c                     ^  \ rS rSr% Sr/ SQrS\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   \" 1 SkSSS9     S             SU 4S jjj5       rSS jr	SS jr
U 4S jrSrU =r$ )r   i#  a
  

This operation applies 3D max pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCDHW format, where N is batch size, C is the number of channels,
H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.

Parameters:
    kernel_size(int|list|tuple): The pool kernel size. If pool kernel size
        is a tuple or list, it must contain three integers,
        (kernel_size_Depth, kernel_size_Height, kernel_size_Width).
        Otherwise, the pool kernel size will be the cube of an int.
    stride(int|list|tuple|None, optional): The pool stride size. If pool stride size is a tuple or list,
        it must contain three integers, [stride_Depth, stride_Height, stride_Width).
        Otherwise, the pool stride size will be a cube of an int.
        Default None, then stride will be equal to the kernel_size.
    padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms.

        1. A string in ['valid', 'same'].
        2. An int, which means the feature map is zero padded by size of `padding` on every sides.
        3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension.
        4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
        5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).

        The default value is 0.
    ceil_mode(bool, optional): ${ceil_mode_comment}
    count_include_pad(bool, optional): Whether to include padding points in average pooling mode, default is True.
    divisor_override(int|float, optional): if specified, it will be used as divisor, otherwise kernel_size will
        be used. Default None.

Returns:
    A callable object of AvgPool3D.

Shape:
    - x(Tensor): The input tensor of avg pool3d operator, which is a 5-D tensor.
      The data type can be float16, float32, float64.
    - output(Tensor): The output tensor of avg pool3d  operator, which is a 5-D tensor.
      The data type is same as input x.

Examples:
    .. code-block:: pycon

        >>> import paddle
        >>> import paddle.compat.nn as nn

        >>> # avg pool3d
        >>> input = paddle.uniform([1, 2, 3, 32, 32], dtype="float32", min=-1, max=1)
        >>> AvgPool3D = nn.AvgPool3D(kernel_size=2, stride=2, padding=0)
        >>> output = AvgPool3D(input)
        >>> print(output.shape)
        paddle.Size([1, 2, 1, 16, 16])

r^   r   r/   r0   r1   r4   r2   r3   r`   r_   >   r*   r5   ra   zpaddle.compat.nn.AvgPool3Dzpaddle.nn.AvgPool3Dr6   c                v   > [         TU ]  5         Xl        Ub  UOUU l        X0l        X@l        XPl        X`l        g r   rc   rd   s          r%   r<   AvgPool3D.__init__i  rf   r(   c           	         [         R                  R                  UU R                  U R                  U R
                  U R                  U R                  (       + U R                  5      $ r   )	r   r
   
avg_pool3dr/   r0   r1   r2   r3   r_   rB   s     r%   rD   AvgPool3D.forward  rj   r(   c                T    SU R                    SU R                   SU R                   3$ rG   rH   rI   s    r%   rJ   AvgPool3D.extra_repr  rL   r(   c                   > [         TU ]  U5        U R                  R                  SS5        U R                  R                  SS5        U R                  R                  SS5        g )Nr1   r   r2   Fr3   T)r;   __setstate____dict__
setdefault)r=   stater>   s     r%   rx   AvgPool3D.__setstate__  sM    U#  A.  e4  !4d;r(   rm   rn   )r/   r   r0   zSize3 | Noner1   r   r2   r4   r3   r4   r_   r`   rM   rN   rO   rP   )r)   rS   rT   rU   rV   rW   rX   r   r<   rD   rJ   rx   rY   rZ   r[   s   @r%   r   r   #  s    4lM MNO  9.*  $"&'+11 1 	1
 1  1 %1 
1
1"	
_< <r(   r   c                     ^  \ rS rSr% SrS\S'   S\S'   S\S'   S\S'   \" 1 SkS	S
S9   S         SU 4S jjj5       rSS jrSr	U =r
$ )r   i  ag  
A compatible version of paddle.nn.Unfold:

The keyword arguments are in non-plural forms, example: `kernel_size` instead of `kernel_sizes`. `padding` restricts the size of the input to be 1(int) or 2, Size4 is not allowed.

All the input parameters allow `Tensor` or `pir.Value` as inputs, and will be converted to lists. Other aspects are the same. To use a more input-flexible version of Unfold, please refer to `paddle.nn.Unfold`.

Args:
    kernel_size(int|list|tuple|Tensor): The size of convolution kernel, should be [k_h, k_w]
        or an integer k treated as [k, k].
    stride(int|list|tuple|Tensor, optional): The strides, should be [stride_h, stride_w]
        or an integer stride treated as [sride, stride]. For default, strides will be [1, 1].
    padding(int|list|tuple|Tensor, optional): The paddings of each dimension, should be
        a single integer or [padding_h, padding_w]. If [padding_h, padding_w] was given, it will expanded to
        [padding_h, padding_w, padding_h, padding_w]. If an integer padding was given,
        [padding, padding, padding, padding] will be used. By default, paddings will be 0.
    dilation(int|list|tuple|Tensor, optional): The dilations of convolution kernel, should be
        [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation].
        For default, it will be [1, 1].

Examples:
    .. code-block:: pycon

        >>> import paddle
        >>> x = paddle.randn((100, 3, 224, 224))
        >>> unfold = paddle.compat.nn.Unfold(kernel_size=[3, 3])
        >>> result = unfold(x)
        >>> print(result.shape)
        paddle.Size([100, 27, 49284])
r   kernel_sizes	dilationspaddingsstrides>   r   r   r   r~   zpaddle.compat.nn.Unfoldzpaddle.nn.Unfoldr6   c                &   > [         TU ]  XX45        g r   )r;   r<   )r=   r/   dilationr1   r0   r>   s        r%   r<   Unfold.__init__  s     	@r(   c           	         S n[         R                  R                  UU" U R                  5      U" U R                  5      U" U R
                  5      U" U R                  5      S9$ )Nc                    [        U [        R                  R                  [        R                  45      (       a  U R                  5       n U $ r   )r   paddlepirValuer   tolist)r#   s    r%   to_list_if_necessary,Unfold.forward.<locals>.to_list_if_necessary  s2    !fjj..>??HHJHr(   )r~   r   r   r   )r   r
   unfoldr~   r   r   r   )r=   rC   r   s      r%   rD   Unfold.forward  sZ    	
 }}##-d.?.?@(6)$--8*4>>: $ 
 	
r(    )r	   r   r	   )
r/   r   r   r   r1   r   r0   r   rM   rN   rO   )r)   rS   rT   rU   rV   rX   r   r<   rD   rY   rZ   r[   s   @r%   r   r     s    > ONI+' AA A 	A
 A 
A
A
 
r(   r   c                     ^  \ rS rSr% SrSS/rS\S'   S\S'   S\S'   \" 1 SkS	S
S9   S           SU 4S jjj5       rSS jr	SS jr
SS jrSrU =r$ )r   i  a  

Python compatible fully-connected linear transformation layer. For each input :math:`X` ,
the equation is:

.. math::

    Out = XW^T + b

where :math:`W` is the weight and :math:`b` is the bias.

Linear layer takes only one multi-dimensional tensor as input with the
shape :math:`[*, in\_features]` , where :math:`*` means any
number of additional dimensions. It multiplies input tensor with the transpose
of weight (a 2-D tensor of shape :math:`[out\_features, in\_features]` ) and
produces an output tensor of shape :math:`[*, out\_features]` .
If ``bias`` is not False, the bias (a 1-D tensor of
shape :math:`[out\_features]` ) will be created and added to the output. At the
end of the initialization, ``reset_parameters`` will be called to initialize
the ``weight`` and ``bias`` (if available) randomly.

Parameters:
    in_features (int):
        The number of input units.
    out_features (int):
        The number of output units.
    bias (bool): If True, the bias (a 1-D tensor of shape :math:`[out\_features]` ) will be created and
        added to the output. Default: True.
    device (PlaceLike): The device of the parameters created. Default: None,
        representing the default paddle device.
    dtype (DTypeLike): The dtype of the parameters created. Default: None, and is set by
        the default dtype of Linear (float32).

Variables:
    weight (paddle.Tensor): learnable parameters of the module of shape :math:`[out\_features, in\_features]`.
        The values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where :math:`k` is :math:`\frac{1}{in\_features}`.
    bias (paddle.Tensor): learnable parameters of the module of shape :math:`[out\_features]`. If ``bias`` is True,
        the values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where :math:`k` is :math:`\frac{1}{in\_features}`.

Shape:
    - input: Multi-dimensional tensor with shape :math:`[*, in\_features]` . Its data types are float16, float32, float64 ,The default is float32 .
    - output: Multi-dimensional tensor with shape :math:`[*, out\_features]` . The data type is the same as the input .

Examples:
    .. code-block:: python

        >>> import paddle
        >>> paddle.seed(100)

        >>> # Define the linear layer.
        >>> linear = paddle.compat.nn.Linear(2, 4, bias=True)
        >>> print(linear.weight)
        Parameter containing:
        Tensor(shape=[4, 2], dtype=float32, place=Place(cpu), stop_gradient=False,
               [[-0.49191639,  0.28120756],
                [-0.17887023,  0.40572405],
                [ 0.35139430,  0.45717543],
                [-0.06135514, -0.21088189]])

        >>> print(linear.bias)
        Parameter containing:
        Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=False,
               [ 0.49166456, -0.06108528, -0.14973064,  0.31168410])

        >>> x = paddle.arange(6, dtype="float32").reshape([3, 2])
        >>> y = linear(x)
        >>> print(y)
        Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=False,
               [[ 0.77287209,  0.34463876,  0.30744481,  0.10080221],
                [ 0.35145447,  0.79834640,  1.92458415, -0.44367185],
                [-0.06996319,  1.25205410,  3.54172373, -0.98814595]])
in_featuresout_featuresintr   weight>   r*   	bias_attrweight_attrzpaddle.compat.nn.Linearzpaddle.nn.Linearr6   c                J  > [         TU ]  5         Uc  U R                  R                  5       OUU l        Xl        X l        U R                  X!/S U R                  SUS9U l        S U l	        U(       a#  U R                  U/S U R                  SUS9U l	        U R                  5         g )NF)shapeattrdtypeis_biasdeviceT)r;   r<   _helperget_default_dtype_dtyper   r   create_parameterr   biasreset_parameters)r=   r   r   r   r   r   r>   s         r%   r<   Linear.__init__%  s     	05DLL**,5 	 '(++-++ , 
 	--#nkk . DI 	r(   c                f    [         R                  R                  XR                  U R                  S9$ )N)rC   r   r   )r
   linear__wrapped__r   r   rB   s     r%   rD   Linear.forwardK  s-      ,,$)) - 
 	
r(   c                X    SU R                    SU R                   SU R                  SL 3$ )z0
Return the extra representation of the module.
zin_features=z, out_features=z, bias=N)r   r   r   rI   s    r%   rJ   Linear.extra_reprP  s:     d../t?P?P>QQXY]YbYbjnYnXoppr(   c                8   [         R                  R                  U R                  [	        S5      S9  U R
                  b\  U R                  R                  S   nUS:  a  S[	        U5      -  OSn[         R                  R                  U R
                  U* U5        gg)zG
Resets parameters based on their initialization used in ``__init__``.
   )aNr	   r   )r   initkaiming_uniform_r   r   r   r   uniform_)r=   fan_inbounds      r%   r   Linear.reset_parametersV  sz    
 	  Q 899  [[&&q)F(.
AV$EGGTYY6 !r(   )r   r   r   r   r   )TNN)r   r   r   r   r   r4   r   zPlaceLike | Noner   zDTypeLike | NonerM   rN   rO   rP   )rM   rN   )r)   rS   rT   rU   rV   rW   rX   r   r<   rD   rJ   r   rY   rZ   r[   s   @r%   r   r     s    GR #N3MN9+' #'"&     	 
 !     
 
 B

q7 7r(   r   c                  `   ^  \ rS rSrSr\" S1SSS9SSU 4S jjj5       rSS jrSS	 jrS
r	U =r
$ )r   ie  a  
Softmax Activation.

This operator implements the softmax layer. The calculation process is as follows:

1. The dimension :attr:`dim` of ``input`` will be permuted to the last.

2. Then ``input`` will be logically flattened to a 2-D matrix. The matrix's second
dimension(row length) is the same as the dimension :attr:`dim` of ``input``,
and the first dimension(column length) is the product of all other dimensions
of ``input``. For each row of the matrix, the softmax operator squashes the
K-dimensional(K is the width of the matrix, which is also the size of ``input``'s
dimension :attr:`dim`) vector of arbitrary real values to a K-dimensional
vector of real values in the range [0, 1] that add up to 1.

3. After the softmax operation is completed, the inverse operations of steps 1 and 2
are performed to restore the two-dimensional matrix to the same dimension as the ``input`` .

It computes the exponential of the given dimension and the sum of exponential
values of all the other dimensions in the K-dimensional vector input.
Then the ratio of the exponential of the given dimension and the sum of
exponential values of all the other dimensions is the output of the softmax
operator.

For each row :math:`i` and each column :math:`j` in the matrix, we have:

.. math::

    Softmax[i, j] = \frac{\exp(x[i, j])}{\sum_j(exp(x[i, j])}

Example:

.. code-block:: text

    Case 1:
      Input:
        x.shape = [2, 3, 4]
        x.data = [[[2.0, 3.0, 4.0, 5.0],
                   [3.0, 4.0, 5.0, 6.0],
                   [7.0, 8.0, 8.0, 9.0]],
                  [[1.0, 2.0, 3.0, 4.0],
                   [5.0, 6.0, 7.0, 8.0],
                   [6.0, 7.0, 8.0, 9.0]]]

      Attrs:
        dim = -1

      Output:
        out.shape = [2, 3, 4]
        out.data = [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
                     [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
                     [0.07232949, 0.19661193, 0.19661193, 0.53444665]],
                    [[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
                     [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
                     [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]

    Case 2:
      Input:
        x.shape = [2, 3, 4]
        x.data = [[[2.0, 3.0, 4.0, 5.0],
                   [3.0, 4.0, 5.0, 6.0],
                   [7.0, 8.0, 8.0, 9.0]],
                  [[1.0, 2.0, 3.0, 4.0],
                   [5.0, 6.0, 7.0, 8.0],
                   [6.0, 7.0, 8.0, 9.0]]]
      Attrs:
        dim = 1

      Output:
        out.shape = [2, 3, 4]
        out.data = [[[0.00657326, 0.00657326, 0.01714783, 0.01714783],
                     [0.01786798, 0.01786798, 0.04661262, 0.04661262],
                     [0.97555875, 0.97555875, 0.93623955, 0.93623955]],
                    [[0.00490169, 0.00490169, 0.00490169, 0.00490169],
                     [0.26762315, 0.26762315, 0.26762315, 0.26762315],
                     [0.72747516, 0.72747516, 0.72747516, 0.72747516]]]

Parameters:
    dim (int, optional): The dim along which to perform log_softmax
        calculations. It should be in range [-D, D), where D is the
        dimensions of ``input`` . If ``dim`` < 0, it works the same way as
        :math:`dim + D` . Default is None.

Shape:
    - input: Tensor with any shape.
    - output: Tensor with the same shape as input.

Examples:
    .. code-block:: python

        >>> import paddle

        >>> x = paddle.to_tensor([[[2.0, 3.0, 4.0, 5.0],
        ...                        [3.0, 4.0, 5.0, 6.0],
        ...                        [7.0, 8.0, 8.0, 9.0]],
        ...                       [[1.0, 2.0, 3.0, 4.0],
        ...                        [5.0, 6.0, 7.0, 8.0],
        ...                        [6.0, 7.0, 8.0, 9.0]]], dtype='float32')
        >>> m = paddle.compat.nn.Softmax()
        >>> out = m(x)
        >>> print(out)
        Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[[0.73105854, 0.73105854, 0.73105854, 0.73105854],
          [0.11920292, 0.11920292, 0.11920292, 0.11920292],
          [0.73105854, 0.73105854, 0.50000000, 0.50000000]],
         [[0.26894143, 0.26894143, 0.26894143, 0.26894143],
          [0.88079703, 0.88079703, 0.88079703, 0.88079703],
          [0.26894143, 0.26894143, 0.50000000, 0.50000000]]])

axiszpaddle.compat.nn.Softmaxzpaddle.nn.Softmaxr6   c                <   > [         TU ]  5         Xl        S U l        g r   )r;   r<   _dimr   )r=   dimr>   s     r%   r<   Softmax.__init__  s     		r(   c                B    [         R                  " XR                  5      $ r   )r
   softmaxr   rB   s     r%   rD   Softmax.forward  s    !!%33r(   c                     SU R                    3$ )Nzdim=)r   rI   s    r%   rJ   Softmax.extra_repr  s    dhhZ  r(   )r   r   r   )r   r`   rM   rN   rO   rP   )r)   rS   rT   rU   rV   r   r<   rD   rJ   rY   rZ   r[   s   @r%   r   r   e  sB    m^ X,(
 

4! !r(   r   )r&   )%
__future__r   r   	itertoolsr   mathr   typingr   r   r   paddle.utils.decorator_utilsr    r
   transformerr   r   paddle._typingr   r   r   r   r   __all__r+   r,   Layerr   r   r   r   r   r   r   r   r   r   r(   r%   <module>r      s    #        @  +  !Y
e_ e_Pu_ u_pn< n<b?
RYY ?
DL7RXX L7^~!bhh ~!B 			r(   