
    {-jj                       d dl mZ d dlZd dlZd dlmZ d dlZd dlmZ	 d dl
mZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZmZmZm Z  erd dlm!Z! g Z"d)dZ#d)dZ$d)dZ%da&d Z'd Z(d*dZ)d Z* G d d          Z+ G d de+          Z,d Z- G d d          Z. G d de.          Z/ G d  d!e.          Z0da1d" Z2da3d# Z4 G d$ d%e.          Z5ej6        d+d&            Z7d' Z8d( Z9e.Z:e/Z;e0Z<e5Z=dS ),    )annotationsN)TYPE_CHECKING)_C_ops)core	frameworkunique_name)check_variable_and_dtype)DataType)Variable
check_typedefault_main_program)get_complete_pp_mesh)LayerHelperin_dynamic_modein_dynamic_or_pir_modein_pir_mode)Tensorc                   t                      rt          j        | |          S t          di t	                      }t          | dg dd           t          |dt          d           |.t          j	        d
                    |j        dg                    }|                    | j        || j        d	          }|                    dd| id|id
|i           |S )a  

    Limits the L2 norm of the input :math:`x` within :math:`max\_norm`.
    If the L2 norm of :math:`x` is less than or equal to :math:`max\_norm`, :math:`out` will be
    the same as :math:`x`. If the L2 norm of :math:`x` is greater than :math:`max\_norm`, :math:`x` will
    be linearly scaled to make the L2 norm of :math:`out` equal to :math:`max\_norm`, as
    shown in the following formula:

    .. math::

        out = \frac{max\_norm * x}{norm(x)}

    where :math:`norm(x)` represents the L2 norm of :math:`x`.

    Args:
        x(Tensor): The input of clip_by_norm and data type is float32.
            The number of dimensions must be between [1, 9].
        max_norm(float): The maximum norm value.
        name(str, optional): For detailed information, please refer
            to :ref:`api_guide_Name`. Usually name is no need to set and
            None by default.

    Returns:
        Tensor: The output of clip_by_norm with shape as input.
            The data type is float32.

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.nn import clip

            >>> input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')
            >>> reward = clip.clip_by_norm(x=input, max_norm=1.0)
            >>> print(reward)
            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
            [[0.50000000, 0.50000000],
             [0.50000000, 0.50000000]])
    clip_by_normX)float16float32uint16max_normN.tmpF)typenamedtypepersistableOutr   inputsattrsoutputs)r   )r   r   r   r   localsr	   r   floatr   generate_with_ignorable_keyjoinr   create_variabler   r   	append_op)xr   r   helperouts        N/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/nn/clip.pyr   r   +   s
   T  0"1h///4468844F	3000.   xen===|6HHfk5)**
 
 
 
 V$ag5 !  C Qx8$	     J    c                    t                      rt          j        |           S t          di t	                      }|                    | j                  }|                    dd| ii d|i           |S )a  
    Merge by adding duplicated rows in the input SelectedRows object.

    Args:
        x(Tensor): The input selected rows to be merge.
        name(basestring|None): Name of the output.

    Returns:
        Tensor, merged output.

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> import paddle.base as base

            >>> b = paddle.static.default_main_program().global_block()
            >>> var = b.create_var(
            ...     name="X", dtype="float32", persistable=True,
            ...     type=base.core.VarDesc.VarType.SELECTED_ROWS)
            >>> y = paddle.nn.clip.merge_selected_rows(var)
    merge_selected_rowsr   r   r!   r"   )r2   )r   r   r2   r   r&   "create_variable_for_type_inferencer   r+   r,   r   r-   r.   s       r/   r2   r2   q   s    0  -)!,,,;;&((;;F

3
3!'
3
B
BC
"Qx	     Jr0   c                t   t                      rt          j        |           S t          | dt          d           | j        t          j        j        j	        k    rt          d          t          di t                      }|                    | j                  }|                    dd| id|ii            |S )	aC  
    Get tensor data from input with SelectedRows type, and outputs a Tensor.

    .. code-block:: text

        input x is SelectedRows:
           x.rows = [0, 5, 5, 4, 19]
           x.height = 20
           x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]]

        Output is DenseTensor:
           out.shape = [5, 2]
           out.data = [[1, 1],
                       [2, 2],
                       [2, 2],
                       [3, 3],
                       [6, 6]]

    Args:
        x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64.
        name(str, optional): The default value is None.  Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name` .

    Returns:
        Variable: DenseTensor transformed from SelectedRows. The data type is same with input.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> import paddle.base as base
            >>> from paddle.base import core
            >>> paddle.enable_static()
            >>> scope = core.Scope()
            >>> block = paddle.static.default_main_program().global_block()
            >>> x_rows = [0, 5, 5, 4, 19]
            >>> height = 20
            >>> x = scope.var('X').get_selected_rows()
            >>> x.set_rows(x_rows)
            >>> x.set_height(height)
            >>> x = block.create_var(name="X", dtype="float32", persistable=True, type=base.core.VarDesc.VarType.SELECTED_ROWS)
            >>> z = paddle.nn.clip.get_tensor_from_selected_rows(x)
    r,   get_tensor_from_selected_rowszGThe type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS.r3   r   r!   r   r#   r%   r$   )r7   )r   r   r7   r   r   r   r   VarDescVarTypeSELECTED_ROWS	TypeErrorr   r&   r4   r   r+   r5   s       r/   r7   r7      s    X }} 73A666q#x!@AAAv%333U
 
 	
 EEFHHEEF

3
3!'
3
B
BC
,Qx	     Jr0   Fc                     t          |           dk    sJ t          |           dk    r.t          | d         t                    sJ t          }| d         a|S t          S )N   r   )len
isinstancebool'_clip_by_global_norm_using_mp_type_flagargs	old_values     r/   "_clip_by_global_norm_using_mp_typerF      sY    t99>>>>
4yyA~~$q'4(((((;	26q'/66r0   c                   | j         t          j        j        j        k    s| j         t          j        j        j        k    r7t                      r)|                     t          j        j        j                  S | j         t          j
        k    s| j         t          j        k    r-t                      r|                     t          j                  S | S N)r   r   r9   r:   FP16BF16rF   astypeFP32r
   FLOAT16BFLOAT16FLOAT32r,   s    r/   _cast_to_mp_type_if_enabledrQ      s    	4<',,,7dl*///
,
.
. 0xx,1222	8###qw(2C'C'C
,
.
. (Dxx()))r0   gradr   
clip_inputc                    |                                  r|                                 sdS |                                 s|                                 rt          | j                  dk    rdS dS )NFr   T)_is_initializedis_distis_denser?   shape)rR   rS   s     r/   _can_inplace_clip_gradrY      sk    !! )C)C)E)E u 	 $--// s4:!/C/Ct5r0   c                :   t          |           } t                      rt          j        |           S d}t	          | dg d|           t          |fi t                      }|                    | j                  }d| i}d|i}|	                    |||           |S )z1
    Return the squared L2 norm of a tensor.
    squared_l2_normr,   )r   float64r   r   r   r!   r   r#   r%   )
rQ   r   r   r[   r	   r   r&   r4   r   r+   )r,   op_typer-   r.   r#   r%   s         r/   _squared_l2_normr_      s    
 	$A&&A )%a(((G	3;;;W   --FHH--F

3
3AG
<
<C1XFclG
'&'BBBJr0   c                      e Zd Zd Zd ZdS )BaseErrorClipAttrc                    t           rH   NotImplementedErrorselfs    r/   __str__zBaseErrorClipAttr.__str__      !!r0   c                    t           rH   rc   )rf   block	grad_names      r/   _append_clip_opz!BaseErrorClipAttr._append_clip_op  rh   r0   N)__name__
__module____qualname__rg   rl    r0   r/   ra   ra     s2        " " "" " " " "r0   ra   c                  &    e Zd ZdZddZd Zd ZdS )ErrorClipByValuea1  
    Clip tensor values to the range [min, max].

    Given a tensor ``t`` (see Examples below), this operation clips its value \
    to ``min`` and ``max`` inplace.

    - Any values less than min are set to min.
    - Any values greater than max are set to max.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. if not set by user, \
        will be set to ``-max`` by framework.

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> paddle.enable_static()
            >>> BATCH_SIZE = 128
            >>> CLIP_MAX = 2e-6
            >>> CLIP_MIN = -1e-6
            >>> prog = paddle.static.Program()
            >>> with paddle.static.program_guard(main_program=prog):
            ...     image = paddle.static.data(name='x', shape=[None, 784], dtype='float32')
            ...     hidden1 = paddle.static.nn.fc(image, size=128, activation='relu')
            ...     hidden2 = paddle.static.nn.fc(hidden1, size=64, activation='relu')
            ...     predict = paddle.static.nn.fc(hidden2, size=10, activation='softmax')
            ...     label = paddle.static.data(name='y', shape=[1], dtype='int64')
            ...     cost = paddle.nn.functional.cross_entropy(input=predict, label=label)
            ...     avg_cost = paddle.mean(cost)
            >>> prog_clip = prog.clone()
            >>> prog_clip.block(0).var(hidden1.name)._set_error_clip(
            ...     paddle.nn.clip.ErrorClipByValue(
            ...         max=CLIP_MAX, min=CLIP_MIN))
    Nc                j    t          |          }|| }nt          |          }|| _        || _        d S rH   )r'   maxmin)rf   rt   ru   s      r/   __init__zErrorClipByValue.__init__E  s8    Cjj;$CC**Cr0   c                *    d| j         dd| j        dS )NzByValue, min=f, max=ru   rt   re   s    r/   rg   zErrorClipByValue.__str__N  s"    =tx=======r0   c                *   |j                                         }|                    d           |                    d|g           |                    d|g           |                    d| j                   |                    d| j                   d S )Nclipr   r!   ru   rt   )descr+   set_type	set_input
set_output	_set_attrru   rt   )rf   rj   rk   clip_op_descs       r/   rl   z ErrorClipByValue._append_clip_opQ  s    z++--f%%%sYK000	{333udh///udh/////r0   rH   )rm   rn   ro   __doc__rv   rg   rl   rp   r0   r/   rr   rr     sQ        $ $L   > > >0 0 0 0 0r0   rr   c                   || j                             | j                                         dz
            }fd|                                D             D ]l}|                     |                   }t          |dd           }|$t          |t                    st          d          ||	                    | |           md S )Nr>   c                    g | ]}|v |	S rp   rp   ).0ngrad_to_vars     r/   
<listcomp>z'error_clip_callback.<locals>.<listcomp>^  s#    MMMA<L<L1<L<L<Lr0   
error_clipzIVariable's error_clip should be an instance of BaseErrorClipAttr or None.)
r}   opop_sizeoutput_arg_names_var_recursivegetattrr@   ra   r<   rl   )rj   contextop_descgrad_nfwd_varr   r   s         @r/   error_clip_callbackr   Z  s    KjmmEJ..001455GMMMMg6688MMM 
6 
6&&{6':;;WlD99
*Z9J"K"K[   !&&uf555
6 
6r0   c                  t     e Zd Z fdZd Z ej                    d             Zd Zd Z	dd	Z
d
 Zd Z xZS )ClipGradBasec                H    t                                                       d S rH   )superrv   )rf   	__class__s    r/   rv   zClipGradBase.__init__l  s    r0   c                    t           rH   rc   re   s    r/   rg   zClipGradBase.__str__o  rh   r0   c                    t           rH   rc   rf   params_gradss     r/   _dygraph_clipzClipGradBase._dygraph_clipr  s    !!r0   c                    t           rH   rc   r   s     r/   	_pir_clipzClipGradBase._pir_clipv  rh   r0   c                    t           rH   rc   r   s     r/   _static_clipzClipGradBase._static_clipy  rh   r0   r   list[tuple[Tensor, Tensor]]returnc                   t                      r|                     |          S t                      r|                     |          S |D ],\  }}t	          |dd           t          j        d            n-|                     |          S )Ngradient_clip_attrz'set_gradient_clip' will be ineffective, because you have set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' is redundant and you can remove it.)r   r   r   r   r   warningswarnr   )rf   r   pgs       r/   __call__zClipGradBase.__call__|  s      	3%%l333]] 	3>>,///$  112D99EM>  
 E F $$\222r0   c                    t           rH   rc   rf   r   paramrR   s       r/   _process_contextzClipGradBase._process_context  rh   r0   c                    t           rH   rc   )rf   r   rR   s      r/   _create_operatorszClipGradBase._create_operators  rh   r0   )r   r   r   r   )rm   rn   ro   rv   rg   imperative_baseno_gradr   r   r   r   r   r   __classcell__r   s   @r/   r   r   k  s            " " " _" " "" " "" " "3 3 3 3$" " "" " " " " " "r0   r   c                       e Zd ZU dZded<   ded<   dd fd	ZddZ ej                    d             Z	d Z
d Zd Z xZS )ClipGradByValuea  
    Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].

    - Any values less than min are set to ``min``.

    - Any values greater than max are set to ``max``.

    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    Note:
        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max``
            automatically. In this case, ``max`` must be greater than :math:`0`.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
            >>> out = linear(x)
            >>> loss = paddle.mean(out)
            >>> loss.backward()

            >>> clip = paddle.nn.ClipGradByValue(min=-1, max=1)
            >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            >>> sdg.step()
    r'   rt   ru   Nfloat | Noner   Nonec                    t                                                       ||dk    sJ | }t          |          | _        t          |          | _        d S )N        )r   rv   r'   rt   ru   )rf   rt   ru   r   s      r/   rv   zClipGradByValue.__init__  sP    ;9999$C::::r0   strc                *    d| j         dd| j        dS )NzClip Gradient By Value, min = rx   ry   rz   re   s    r/   rg   zClipGradByValue.__str__  s"    NNNN$(NNNNr0   c                    g }|D ]k\  }}|t          |dd          du r|                    ||f           3t          j        || j        | j                  }|                    ||f           l|S )N	need_clipTFr,   ru   rt   )r   appendpaddler|   ru   rt   rf   r   params_and_gradsr   r   new_grads         r/   r   zClipGradByValue._dygraph_clip  s      	3 	3DAqyq+t,,55 ''A///{QDH$(CCCH##QM2222r0   c                   g }i }t          j        d          5  |D ]\  }}|t          |dd          du r|                    ||f           3|j        j                            ||g          5  t          j        || j	        | j
                  }d d d            n# 1 swxY w Y   |                    ||f           |j        ||j        <   	 d d d            n# 1 swxY w Y   t          ||           |S )Ngradient_clipr   TFr   )r   
name_scoper   r   rj   program_optimized_guardr   r|   ru   rt   r   _correct_clip_op_role_varrf   r   r   param_new_grad_name_dictr   r   r   s          r/   r   zClipGradByValue._static_clip  s   #% !/22 	A 	A$ 
A 
A191k400E99$++QF333W_55q!f== L L%{QDH$(KKKHL L L L L L L L L L L L L L L ''H6663;=(00
A	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	""24LMMMs6   AC0"BCB""C%B"&*CC!$C!c                    d S rH   rp   r   s       r/   r   z ClipGradByValue._process_context      r0   c                L    t          j        || j        | j                  }||fS )Nr   )r   r|   ru   rt   rf   r   rR   r   s       r/   r   z!ClipGradByValue._create_operators  s&    ;48BBBhr0   rH   )rt   r'   ru   r   r   r   r   r   )rm   rn   ro   r   __annotations__rv   rg   r   r   r   r   r   r   r   r   s   @r/   r   r     s         % %N JJJJJJ      O O O O _
  
  
      $        r0   r   c                       e Zd ZU dZded<   d fdZddZd	 Z ej	                    d
             Z
d Zd Zd Zd Z xZS )ClipGradByNorma  
    Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .

    - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.

    - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.

    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::
        Out =
        \left\{
            \begin{array}{ccl}
                X & & if (norm(X) \leq clip\_norm) \\
                \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\
        \end{array}
        \right.


    where :math:`norm(X)` represents the L2 norm of :math:`X`.

    .. math::
        norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}

    Note:
        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        clip_norm(float): The maximum norm value.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
            >>> out = linear(x)
            >>> loss = paddle.mean(out)
            >>> loss.backward()

            >>> clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
            >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            >>> sdg.step()
    r'   	clip_normr   r   c                p    t                                                       t          |          | _        d S rH   )r   rv   r'   r   )rf   r   r   s     r/   rv   zClipGradByNorm.__init__+  s,    y))r0   r   c                    d| j         dS )Nz!Gradient Clip By Norm, clip_norm=rx   r   re   s    r/   rg   zClipGradByNorm.__str__/  s    E4>EEEEr0   c                    g }|D ]`\  }}|t          |dd          du r|                    ||f           3t          || j                  }|                    ||f           a|S )Nr   TFr,   r   )r   r   r   r   r   s         r/   _clip_gradientszClipGradByNorm._clip_gradients2  s      	3 	3DAqyq+t,,55 ''A///#a$.AAAH##QM2222r0   c                ,    |                      |          S rH   r   r   s     r/   r   zClipGradByNorm._dygraph_clip>  s    ##L111r0   c                ,    |                      |          S rH   r   r   s     r/   r   zClipGradByNorm._pir_clipB  s    ##L111r0   c                   g }t          j        d          5  i }|D ]\  }}|t          |dd          du r|                    ||f           3|j        j                            ||g          5  t          || j                  }d d d            n# 1 swxY w Y   |j	        ||j	        <   |                    ||f           	 d d d            n# 1 swxY w Y   t          ||           |S )Nr   r   TFr   )r   r   r   r   rj   r   r   r   r   r   r   r   s          r/   r   zClipGradByNorm._static_clipE  s   !/22 	7 	7')$$ 
7 
7191k400E99$++QF333W_55q!f== J J+a$.IIIHJ J J J J J J J J J J J J J J3;=(0 ''H6666
7	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	""24LMMMs6   AC0BCBCB*CCCc                    d S rH   rp   r   s       r/   r   zClipGradByNorm._process_contextW  r   r0   c                6    t          || j                  }||fS )Nr   )r   r   r   s       r/   r   z ClipGradByNorm._create_operatorsZ  s     $@@@hr0   )r   r'   r   r   r   )rm   rn   ro   r   r   rv   rg   r   r   r   r   r   r   r   r   r   r   s   @r/   r   r     s         4 4l * * * * * *F F F F
  
  
  _2 2 22 2 2     $        r0   r   c                     t          |           dk    rt          S t          |           dk    rt          | d         t                    sJ t          }| d         a|S Nr   r>   )r?   &_allow_pure_fp16_global_norm_clip_flagr@   rA   rC   s     r/   !_allow_pure_fp16_global_norm_clipr   b  R    
4yyA~~554yyA~~*T!Wd";";~~;:	15a.r0   c                     t          |           dk    rt          S t          |           dk    rt          | d         t                    sJ t          }| d         a|S r   )r?   &_allow_pure_bf16_global_norm_clip_flagr@   rA   rC   s     r/   !_allow_pure_bf16_global_norm_clipr   p  r   r0   c                       e Zd ZU dZded<   ded<   ded<   	 	 dd fdZddZ ej                    d             Z	d Z
d Zd Zd Z xZS )ClipGradByGlobalNorman  
    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
    :math:`t\_list` , and limit it to ``clip_norm`` .

    - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.

    - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.

    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::

        t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}

    where:

    .. math::

        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}

    Note:
        ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        clip_norm (float): The maximum norm value.
        group_name (str, optional): The group name for this clip. Default value is ``default_group``.
        auto_skip_clip (bool, optional): skip clipping gradient. Default value is ``False``.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
            >>> out = linear(x)
            >>> loss = paddle.mean(out)
            >>> loss.backward()

            >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            >>> sdg.step()
    r'   r   r   
group_namerA   auto_skip_clipdefault_groupFr   r   c                    t                                                       t          |          | _        || _        t          |t                    sJ || _        d | _        d| _	        d S )NF)
r   rv   r'   r   r   r@   rA   r   _async_add_nshould_comm_on_shard_dim)rf   r   r   r   r   s       r/   rv   zClipGradByGlobalNorm.__init__  sg     	y))$.$/////, !(-%%%r0   c                    d| j         dS )Nz)Gradient Clip By GlobalNorm, global_norm=rx   r   re   s    r/   rg   zClipGradByGlobalNorm.__str__  s    M4>MMMMr0   c                <   g }g }g }g }d}t          |          dk    r-t          |d                   dk    r|d         d         j        }nd }|D ]\  }}	|		t          |dd          du r|	}
t                      r8|	                                r$t          |	          }
|
                                }
n=|	j        t          j	        j
        j        k    rt          |	          }
t          |
          }
t          |
          }||	j        |k    r}d}t          |	j                  }t          |	j        j                  t          |j                  k     rd}t#          j        |||j                  }t#          j        |||j                  }|j        t*          j        k    s|j        t*          j        k    r|                    |           ||j        t*          j        k    r|                    |           |                    |           t          |          t          |          z   t          |          z   dk    r|S d }t          |          dk    rdnd}g }t          |          dk    r3 ||          }|                    |                    |                     t          |          dk    rO ||          }|dk    r|                    |           n(|                    |                    |                     t          |          dk    r  ||          }|                    |            ||          }|r|t#          j                    }|rd|j        v r|                    d          d	k    r|                    d                              d          }|                                 }t#          j!        |t"          j"        j#        |
           t#          j$        ||j        |j                  }| j%        rZtM          | d          rJt*          j'        !                    |                                 | j(                  )                                 | j%        rZtM          | d          rJt*          j'        !                    |                                 | j*                  )                                 | j%        rHtM          | d          r8t*          j'        !                    || j+                  )                                 t+          j,        |          }t+          j-        d	g|| j.                  }d}| j/        s-d}t+          j0        |t+          j1        ||                    }n||k    rd}t+          j0        ||          }|D ] \  }}	|		t          |dd          du r|                    ||	f           4|r|j        |	j        k    r|                    |	j                  n|}|j        |	j        k    r	t          |	j        j                  t          |j        j                  k     rL|j        }d}|D ]}|2                                sd} n|r|                                 }ntg          d          t          |	j                  }t          |	j        j                  t          |j                  k     rt#          j        |||j                  }t*          j'                            ||	j        |j                  }ti          |	|          r.|	5                    |           |                    ||	f           t+          j6        |	|          }|                    ||f           |                    ||	f           |S )NTr   r   Fc                N    t          j        |                                           S rH   r   stacksumvar_lists    r/   async_add_nz7ClipGradByGlobalNorm._dygraph_clip.<locals>.async_add_n	      <))--///r0   r\   r   ppr>   )r   groupsharding_group)r   mp_group
fsdp_grouprX   r   
fill_valuer,   yzLReshard a sharded tensor from a local mesh to a global mesh is not supported)7r?   process_meshr   r   is_selected_rowsr2   _get_tensor_from_selected_rowsr   r   r9   r:   r;   r7   r_   r   setprocess_idsdistreshard
placementsr   r   r   bfloat16r   r   rK   get_mesh	dim_namesget_dim_sizeget_submesh_with_dim	get_group_local_value
all_reduceReduceOpSUMshard_tensorr   hasattrdistributedr   waitr   r  sqrtfullr   r   dividemaximumis_replicatedrd   rY   	multiply_multiply)rf   r   r   sum_square_listsum_square_list_fp16sum_square_list_fp32flag_auto_hybrid_ppsrc_meshr   r   
merge_grad
sum_squarepp_meshr   	sum_dtypeglobal_norm_varglobal_norm_var_fp16global_norm_var_fp32global_norm_var_fp64g_meshpp_groupglobal_norm_var_localmax_global_normr   clip_varrS   r  is_replicate	placementr   s                                 r/   r   z"ClipGradByGlobalNorm._dygraph_clip  s/   !!"|q  Sa%9%9A%=%=#Aq)6HHH  (	3 (	3DAqyq+t,,55J   GQ%7%7%9%9 G033
'FFHH

4</===033
::FF
)*55J #((B(B&+#.q~>>q~122S9L5M5MMM*.'!%"GZ-B" "J "\**? 

  FN22#v66$++J7777!V^33$++J7777&&z2222   &''(&''(  
  	0 	0 	0 "%_!5!5!9!9IIy	#$$q((#.;/C#D#D ""#7#>#>y#I#IJJJ#$$q((#.;/C#D#D I%%&&';<<<<&&';'B'B9'M'MNNN!###.;#?#? ""#7888%+o66  	8#7]__FF,,,''--11 "66t<<FFtLL )8(D(D(F(F%)}("    #'"3)#0#.# # ( 	WT;K-L-L 	)),,..d6I *  dfff( 	WT:-F-F 	)),,..dm *  dfff( 	WT<-H-H 	))t *  dfff +o66 +#Y4>
 
 
 	" 		KI}!.?oFFF  HH ..I}/JJJH  5	0 5	0DAqyq+t,,55 ''A/// .0  ~00 OOAG,,,! 
 *an<< 1>566"/;: :   &0%:
'+)3 & &I#,#:#:#<#< &/4 %& ( )3)@)@)B)BJJ"5 n# #  #7q~"F"Fq~9::S#/> >   *. *GZ5J* *J &,%7%?%?&
8M& &
 *!Z88 ;KK
+++$++QF3333%q*==H$++QM:::: ''A////r0   c                   g }g }g }g }g }g }g }g }	g }
g }d}t                      }d }|D ]\  }}|                                r|                    |                                j                   d|                                j        j        v r||                                j        }|                                j        }t          |j                  t          |j                  k     r|}t          |j                  t          |j                  k    sJ t          |          dk    rddlm} d}|J |D ]\  }}|	t          |dd          du r|}t                      r2|                                rt          |          }t          |          }t          |          }|r|                                j        |k    rqt          j                            || ||                                j        |                                j        |                                j                            }| j        r|j        d         r|j        t.          j        k    s|j        t.          j        k    r|                    |           S|j        t.          j        k    r|                    |           |                    |           |j        r|j        t.          j        k    s|j        t.          j        k    r|                    |           |j        t.          j        k    r|                    |           
|                    |           !|j        t.          j        k    s|j        t.          j        k    r|
                    |           b|j        t.          j        k    r|                    |           |	                    |           t          |          t          |          z   t          |          z   t          |          z   t          |          z   t          |          z   t          |	          z   t          |
          z   t          |          z   dk    r|S d }t          |          t          |          z   t          |	          z   dk    rd	nd
}g }g }g }t          |          dk    r3 ||          }|                    |                    |                     t          |          dk    r3 ||          }|                    |                    |                     t          |
          dk    r3 ||
          }|                    |                    |                     t          |          dk    rO ||          }|d
k    r|                    |           n(|                    |                    |                     t          |          dk    rO ||          }|d
k    r|                    |           n(|                    |                    |                     t          |          dk    rO ||          }|d
k    r|                    |           n(|                    |                    |                     t          |          dk    r  ||          }|                    |           t          |          dk    r  ||          }|                    |           t          |	          dk    r  ||          }|                    |           d }t          |          dk    r ||          }t          |          dk    r ||          }n&| j        r| j        rt          j        dg|d          }| j        r| j        r~t          j         !                    || j"        j#        tH          j%        j&                  }t          j         !                    || j'        j#        tH          j%        j&                  }||}n||z   }t          |          dk    r ||          }n&| j        r| j(        rt          j        dg|d          }| j        rK| j(        rDt          j         !                    || j"        j#        tH          j%        j&                  }||}n||z   }t          j)        |          }t          j        dg|j        | j*                  } d}!| j+        s-d}!t          j,        | t          j-        ||                     }"n|| k    rd}!t          j,        | |          }"|D ]f\  }}|	t          |dd          du r|                    ||f           4|!r|"j        |j        k    r|"                    |j                  n|"}#|r|#                                j        |                                j        k    rt          j                            |#|                                j         ||#                                j        |#                                j        |#                                j                            }#t          j.        ||#          }$|                    ||$f           O|                    ||f           h|S )NFr   r>   )to_placementsTr   	no_fusionc                N    t          j        |                                           S rH   r   r   s    r/   r   z3ClipGradByGlobalNorm._pir_clip.<locals>.async_add_n  r   r0   r\   r   r   r  r  )/r	  is_dist_dense_tensor_typeadd	dist_attrr  r
  r?   /paddle.distributed.auto_parallel.placement_typer8  r   r   is_selected_row_typer2   r7   r_   r   r  r  dims_mappingpartial_dimsr   optimize_attrr   r
   rM   rN   r   rO   is_distributedrK   has_dist_paramr  r   r  r   idr  r  r  r   has_not_dist_paramr  r   r   r  r  r"  )%rf   r   r   no_fusion_sum_squareno_fusion_sum_square_fp16no_fusion_sum_square_fp32sum_square_distsum_square_dist_fp16sum_square_dist_fp32sum_square_not_distsum_square_not_dist_fp16sum_square_not_dist_fp32auto_parallel_pp	pp_meshespp_stage0_meshr   r   p_meshr8  r(  r)  r   r+  no_fusion_global_normglobal_norm_distglobal_norm_not_distr-  r.  r/  r,  global_norm_dist_varglobal_norm_not_dist_varr3  r   r4  rS   r   s%                                        r/   r   zClipGradByGlobalNorm._pir_clip  s     "$&!$&! !! !#% #%  EE	  	 	DAq**,, akkmm89992>>>%-)*)C!"!;~9::S".> >   .4N"6#566#*6; ;         y>>A       $!---  9	; 9	;DAqyq+t,,55J}} G!7!7!9!9 G033
::FF
)*55J ((**7>II#/77"!M",,..;",,..;",,..;  
 1 ;?;/ ;
 $(888!'8+<<<-44Z@@@@%)999-44Z@@@@(//
;;;;! ;$(888!'8+<<<(//
;;;;%)999(//
;;;;#**:6666 $(888!'8+<<<,33J????%)999,33J????'..z:::: $%%+,,-+,,- /""# &''	(
 &''( %&&' *++, *++, 	 	  	0 	0 	0
 '((/""#%&&'   I
  	 !#!())A--#.;/H#I#I !(()=)D)DY)O)OPPP#$$q((#.;/C#D#D ##$8$?$?	$J$JKKK'((1,,#.;/G#H#H  ''(<(C(CI(N(NOOO())A--#.;/H#I#I I%%%,,-ABBBB%,,(//	::   #$$q((#.;/C#D#D I%% ''(<==== ''(<(C(CI(N(NOOO'((1,,#.;/G#H#H I%%$++,@AAAA$++(//	::   #$$q((#.;/C#D#D !(()=>>>!###.;#?#? ##$8999"##a''#.;#?#?  ''(<===$%%)))k*?@@O  1$$#.;/?#@#@  * 	t/B 	#);cs$ $ $  ( 
	IT-@ 
	I#)=#;#;$d&9&<dm>O$ $  $*=#;#;$dm&68I$ $  &"6"14H"H#$$q(('2{3G'H'H$$* 	t/F 	'-{cs( ( ($ ( 		MT-D 		M'-}'?'?(#&!( ($
 &":"14L"L +o66 +#_2t~
 
 
 	" 		KI}!.?oFFF  HH ..I}/JJJH  	0 	0DAqyq+t,,55 ''A/// 0  ~00 OOAG,,,!  %",,..;{{}}12 2 "(!3!;!;"2%&0022?&0022?&0022? " "J "?1j99 ''H6666 ''A////r0   c                    g }g }g }g }g } fd}t          j        d          5  |D ]X\  }}	|		t          |dd          du r|	}
|j        j                            ||	g          5  |	j        t          j        j	        j
        k    rt          |	          }
t          |
          }
t          |
          }|j        t          j        j	        j        k    r|                    |           n|j        t          j        j	        j        k    r|                    |           nJ|j        t          j        j	        j        k    r|                    |           n|                    |           d d d            n# 1 swxY w Y   Zt'          |          dk    r"t'          |          dk    rt)          d          t'          |          t'          |          z   t'          |          z   dk    rAt'          |          t'          |          z   t'          |          z   dk    r|cd d d            S |j        j                            ||	g          5  t'          |          dk    rdnd	}g }t'          |          dk    r[ ||          }|s|st+                      s)|                    |                    |                     n|                    |           t'          |          dk    r[ ||          }|s|st/                      s)|                    |                    |                     n|                    |           t'          |          dk    rO ||          }|d	k    r|                    |           n(|                    |                    |                     t'          |          dk    r  ||          }|                    |           t'          |          d
k    r ||          n|d         }t1          j        |          }t1          j        d
g|j         j                  }t1          j        |t1          j        ||                    }d d d            n# 1 swxY w Y   i }|D ]\  }}	|		t          |dd          du r|                    ||	f           4|j        j                            ||	g          5  t=          |	          }|j        t          j        j	        j        k    r5|j        t          j        j	        j        k    r|                    d          }nV|j        t          j        j	        j        k    r5|j        t          j        j	        j        k    r|                    d          }n|}t?                                                       }|!                    d||dd|i           ||	ur*|!                    dd|id|	i|j        |	j        d           d d d            n# 1 swxY w Y   |	j"        ||j"        <   |                    ||	f           	 d d d            n# 1 swxY w Y   tG          ||           |S )Nc                    j         r&t          j        |                                           S t          j        |           S rH   )r   r   r   r   add_n)r   rf   s    r/   _add_nz1ClipGradByGlobalNorm._static_clip.<locals>._add_n  s:      .|H--11333|H---r0   r   r   TFr   z1FP16 and BF16 are not supported at the same time.r\   r   r>   rP   r  r  r   r  elementwise_mulr   Yr!   r]   castr   )in_dtype	out_dtyper8   )$r   r   r   rj   r   r   r   r   r9   r:   r;   r2   r7   r_   r   rI   r   rJ   rL   r?   rd   r   rK   r   r   r  r  r   r  r  rQ   r   current_blockr+   r   r   )rf   r   r   r#  r$  sum_square_list_bf16r%  r\  r   r   r(  r)  r+  r,  r-  global_norm_var_bf16r.  global_norm_var_other_dtyper3  	scale_varr   new_gscale_inputrj   s   `                       r/   r   z!ClipGradByGlobalNorm._static_clip  s\   !!!	. 	. 	. 	. 	. !/22 M	0 M	0$ ; ;191k400E99
W_55q!f== ; ;v!5!CCC%8%;%;
%B:%N%N
!1*!=!=J!'4<+?+DDD,33J????#)T\-A-FFF,33J????#)T\-A-FFF,33J????'..z:::; ; ; ; ; ; ; ; ; ; ; ; ; ; ; '((1,,5I1J1JQ1N1N)G   O$$*++,*++,  
 O$$*++,*++,  
 $MM	0 M	0 M	0 M	0 M	0 M	0 M	0 M	0P 111a&99 7 7),_)=)=)A)AIIy	"$+,,q00+162F+G+G(,	E*	E  ABB	E
 (..077	BB    (../CDDD+,,q00+162F+G+G(,	E*	E  ABB	E
 (..077	BB    (../CDDD+,,q00+162F+G+G( I--'../CDDDD'..077	BB   ''!++28&2I2I/#**+FGGG ?++a// F?+++(+  
 #)+"@"@"@"(+#)/#~# # #
 #M%n/JJJ  	i7 7 7 7 7 7 7 7 7 7 7 7 7 7 7p (*$$ ,0 ,0191k400E99$++QF333W_55q!f== " "7::E t|';'@@@%Ot|/C/HHH&/&6&6y&A&At|';'@@@%Ot|/C/HHH&/&6&6z&B&B&/
 122@@BBEOO.%*==!& $   
 A~~!'$'<%*AJ,1K-.W# #	 (   5" " " " " " " " " " " " " " "H 456(0 ''A////Y,0CM	0 M	0 M	0 M	0 M	0 M	0 M	0 M	0 M	0 M	0 M	0 M	0 M	0 M	0 M	0^ 	""24LMMMs   AX?(DE5)X?5E99X?<E9=B!X?+!X?HQ*X?*Q.	.X?1Q.	2AX?D%W?3X??XX?X+X??YYc                h   | j         |vrJg || j         <   | j        || j         dz   <   t          j        dg|j        | j                  || j         dz   <   n(| j        || j         dz            k    st          d          |}|j        t          j        j	        j
        k    rt          |          }t          |          }n@t                      r2|                                rt          |          }t          |          }t          |          }|| j                                      |           || _        d S )N_clip_valuer>   r  _clipz>All parameters' 'clip_norm' of a same group should be the same)r   r   r   r  r   
ValueErrorr   r   r9   r:   r;   r2   r7   r   r?  r_   r   r   )rf   r   r   rR   r(  local_norm_vars         r/   r   z%ClipGradByGlobalNorm._process_context?  s6   ?'))')GDO$7;~GDOm3417c2 2 2GDOg-.. >WT_}-L%MMM T   
9,:::,T22J6zBBJJ]] 	Ct88:: 	C,T22J6zBBJ)*55 ''777r0   c                    d }| j         dz   }|| j        vr || j        | j                            }t          j        |          }| j        | j         dz            }t          j        |t          j        ||                    }|j        dk    sJ || j        |<   t                      r$t          j        || j        |                   }||fS |j	        
                    d|| j        |         dd	|i
           ||fS )Nc                N    t          j        |                                           S rH   r   r   s    r/   r   z;ClipGradByGlobalNorm._create_operators.<locals>.async_add_nZ  r   r0   _scalerP   rl  r  )r>   r]  r^  r!   r]   )r   r   r   r  r  r  rX   r   r"  rj   r+   )rf   r   rR   r   group_scale_namegroup_norm_varr4  group_scale_vars           r/   r   z&ClipGradByGlobalNorm._create_operatorsY  s1   	0 	0 	0  ?X54<//([do)FGGN#[>:::N|DOg$=>H$m.8~>>>  O #(D0000-<DL)*== 	?46F)GHHD$; 	"DL1A$BCCDM 	 	
 	
 	
 d{r0   )r   F)r   r'   r   r   r   rA   r   r   r   )rm   rn   ro   r   r   rv   rg   r   r   r   r   r   r   r   r   r   s   @r/   r   r   {  s        2 2h OOO
 *$	. . . . . . .&N N N N _H  H  H TI  I  I V]  ]  ] ~  4      r0   r   c                   t          j        d           t          | t                    st	          d          t          j                                        d          j        D ]E}d|	                                v r-d|
                    d          v rt          j        d            nF|'                    d                                          }t          d |D                       rfd	|D             }t          d
 |D                       st	          d          |D ]}t          j        |           |_        dS )a  
    Warning:

        This API must be used after building network, and before ``minimize`` ,
        and it may be removed in future releases, so it is not recommended.
        It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
        this is a better method to clip gradient. There are three clipping strategies:
         :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
         :ref:`api_paddle_nn_ClipGradByValue` .

    To specify parameters that require gradient clip.

    Args:
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default value: None, and there is no
            gradient clipping.
        param_list (list(Variable), optional): Parameters that require gradient clip.
                It can be a list of parameter or a list of parameter's name.
                Default None, meaning that all parameters in the program will be included.
        program (Program, optional): The program where parameters are located.
                Default None, meaning that using :ref:`api_paddle_static_default_main_program` .

    Returns:
        None

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> paddle.enable_static()

            >>> def network():
            ...     image = paddle.static.data(name='image', shape=[
            ...                        None, 28], dtype='float32')
            ...     param_attr1 = paddle.ParamAttr("fc1_param")
            ...     fc1 = paddle.static.nn.fc(image, size=10, weight_attr=param_attr1)
            ...     param_attr2 = paddle.ParamAttr("fc2_param")
            ...     fc2 = paddle.static.nn.fc(fc1, size=10, weight_attr=param_attr2)
            ...     loss = paddle.mean(fc2)
            ...     return loss


            >>> # network 1: clip all parameter gradient
            >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
            ...     loss = network()
            ...     paddle.nn.clip.set_gradient_clip(
            ...         paddle.nn.ClipGradByGlobalNorm(clip_norm=2.0))
            ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
            ...     sgd.minimize(loss)

            >>> # network 2: clip parameter gradient by name
            >>> with paddle.static.program_guard(base.Program(), paddle.static.Program()):
            ...     loss = network()
            ...     paddle.nn.clip.set_gradient_clip(
            ...         paddle.nn.ClipGradByValue(min=-1.0, max=1.0),
            ...         param_list=["fc1_param", "fc2_param"])
            ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
            ...     sgd.minimize(loss)

            >>> # network 3: clip parameter gradient by value
            >>> with paddle.static.program_guard(base.Program(), paddle.static.Program()):
            ...     loss = network()
            ...     param_var1 = paddle.static.default_main_program().global_block().var("fc1_param")
            ...     param_var2 = paddle.static.default_main_program().global_block().var("fc2_param")
            ...     paddle.nn.clip.set_gradient_clip(
            ...         paddle.nn.ClipGradByValue(min=-1.0, max=1.0),
            ...         param_list=[param_var1, param_var2])
            ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
            ...     sgd.minimize(loss)

            >>> # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
            >>> with paddle.static.program_guard(base.Program(), paddle.static.Program()):
            ...     loss = network()
            ...     clip1 = paddle.nn.ClipGradByValue(min=-1.0, max=1.0)
            ...     clip2 = paddle.nn.ClipGradByNorm(clip_norm=1.0)
            ...     # Set the gradient clipping strategy: clip1
            ...     paddle.nn.clip.set_gradient_clip(clip1)
            ...     # Set the gradient clipping strategy: clip2
            ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
            ...     sgd.minimize(loss)
            ...     # 'set_gradient_clip' will not take effect when setting has a conflict,
            ...     # and the gradient clipping strategy will be 'clip2'


    zCaution! 'set_gradient_clip' is not recommended and may be deprecated in future! We recommend a new strategy: set 'grad_clip' when initializing the 'optimizer'. This method can reduce the mistakes, please refer to documentation of 'optimizer'.z<'clip' should be an instance of ClipGradBase's derived classNr   op_namescope	optimizerz'minimize' has been invoked before, this will make 'set_gradient_clip' be ineffective! Please invoke 'set_gradient_clip' before 'minimize'.c              3  @   K   | ]}t          |t                    V  d S rH   )r@   r   r   elems     r/   	<genexpr>z$set_gradient_clip.<locals>.<genexpr>  s,      
8
8T:dC  
8
8
8
8
8
8r0   c                `    g | ]*}                     d                               |          +S )r   )rj   var)r   rz  r   s     r/   r   z%set_gradient_clip.<locals>.<listcomp>  s3    HHHTgmmA&&**400HHHr0   c              3  J   K   | ]}t          |t          j                  V  d S rH   )r@   r   	Parameterry  s     r/   r{  z$set_gradient_clip.<locals>.<genexpr>  s/      LLz$	 344LLLLLLr0   zK'param_list' should be a list of Parameter or basestring(parameter's name).)r   r   r@   r   r<   r   r   rj   ops	all_attrsattrall_parametersallcopydeepcopyr   )r|   
param_listr   r   r   s     `  r/   set_gradient_clipr  w  s   t M	1   dL)) 
J
 
 	
 022mmA"  R\\^^++rww@
 @
 1
 1
 MW   E]]1%%4466


8
8Z
8
8
888 IHHHHZHHH
LLLLLLL 
Y
 
 	
  7 7#'=#6#6  7 7r0   c                H   i }| D ]\  }}||j         j                            ||g          5  t          j        d          5  t          |dd           }|| cd d d            cd d d            c S t          |t                    st          d          |	                    |||           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   g }i }| D ]\  }}||j         j                            ||g          5  t          j        d          5  |
                    ||          \  }}|j        ||j        <   |                    ||g           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   t          ||           |S )Nr   r   z8clip attribute should be an instance of GradientClipBase)r   r   rR   )r   rR   )rj   r   r   r   r   r   r@   r   r<   r   r   r   r   r   )	param_gradsr   r   r   	clip_attrresr   r   r   s	            r/   append_gradient_clip_opsr    sC   G I I19GO,,aV44	I 	I 11	I 	I  #7>>I "	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I i66 N   &&waa&HHH	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I C! 	* 	*19GO,,aV44	* 	* 11	* 	* (999JJOE83;=$UZ0JJx()))	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* c#;<<<Jsx   CB;C3<B;/C;B??CB?CC	C	F"AE/#F/E33F6E37FF
	F
	c                   g }t          |          dk    rd S | D ]\  }}||j        j        }||v r|                    |           |j        j                                        j        D ]}|                    d          rkd|                    d          v rT|                    d          r?|                    d          d         }||v r |||         g}|	                    d|           d S )Nr   rv  r   op_role_var)
r?   rj   idxr   r   global_blockr  has_attrr  r   )	r   r   block_id_listr   rR   block_idr   
param_namecorrect_p_gs	            r/   r   r     s$   M
#$$))# = =t<;?}$$X&&&+%22448 	= 	=BN++=#rww~'>'>>>GGM** ?  WW]33A6
!999"0<#K LL<<<	== =r0   rH   )rR   r   rS   r   )NN)>
__future__r   r  r   typingr   r   paddle.autogradautogradr   paddle.distributedr  r  r   paddle.baser   r   r   paddle.base.data_feederr	   paddle.base.libpaddler
   paddle.common_ops_importr   r   r   "paddle.distributed.utils.moe_utilsr   paddle.frameworkr   r   r   r   r   __all__r   r2   r7   rB   rF   rQ   rY   r_   ra   rr   r   r   r   r   r   r   r   r   r   dygraph_not_supportr  r  r   GradientClipBaseGradientClipByValueGradientClipByNormGradientClipByGlobalNormrp   r0   r/   <module>r     s   # " " " " "                ) ) ) ) ) ) ! ! ! ! ! !       4 4 4 4 4 4 4 4 4 4 < < < < < < * * * * * * O O O O O O O O O O C C C C C C             
C C C CL# # # #L< < < <~ +0 '	7 	7 	7  	 	 	 	  ." " " " " " " "90 90 90 90 90( 90 90 90x6 6 6"'" '" '" '" '" '" '" '"TZ Z Z Z Zl Z Z Zzj j j j j\ j j jZ */ &   */ &  y y y y y< y y yx }7 }7 }7 }7@! ! !N= = =4   % # /   r0   