
    x-j%                     f    d dl Z d dlmZmZmZ d dlmZ d dlmZm	Z	 d dl
mZ  G d de          ZdS )    N)_C_ops_legacy_C_opspir)	framework)in_dynamic_modein_pir_mode)	Optimizerc                   H     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 d fd		Zd
 Zd Z xZS )LarsMomentumOptimizera  
    Momentum optimizer with LARS support

    The update equations are as follows:

    .. math::

        & local\_learning\_rate = learning\_rate * lars\_coeff * \\
          \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}

        & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon)

        & param = param - velocity

    Parameters:
        learning_rate (float|Variable): The learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element. \
            momentum (float): momentum factor
        lars_coeff (float): Defines how much we trust the layer to change its weights.
        lars_weight_decay (float): Weight decay coefficient for decaying using LARS.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static graph mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_paddle_regularizer_L1Decay` , :ref:`api_paddle_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_paddle_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
        exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None.
        epsilon (float, optional): Epsilon to avoid Division by Zero when calculate local lr. Default is 0.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating.
        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \
            before updating. Often choose to be `1.0/batch_size`.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> import numpy as np

            >>> paddle.enable_static()
            >>> np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
            >>> inp = paddle.static.data(
            ...     name="inp", shape=[2, 2], dtype='float32')
            >>> out = paddle.static.nn.fc(inp, size=3)
            >>> out = paddle.sum(out)
            >>> optimizer = paddle.incubate.optimizer.LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9)
            >>> optimizer.minimize(out)

            >>> exe = paddle.static.Executor(paddle.CPUPlace())
            >>> exe.run(paddle.static.default_startup_program())
            >>> exe.run(
            ...     feed={"inp": np_inp},
            ...     fetch_list=[out.name])
    velocityMbP?Mb@?Nr   F      ?c                 ^   |J |J t                                          |||||           d| _        || _        t	          |          | _        t	          |          | _        t	          |
          | _        |	g | _        n|	| _        || _	        t	          |          | _
        i | _        d S )N)learning_rate
parametersweight_decay	grad_clipnamelars_momentum)super__init__type	_momentumfloat_lars_coeff_lars_weight_decay_epsilon_exclude_from_weight_decay_multi_precision_rescale_grad_master_weights)selfr   momentum
lars_coefflars_weight_decayparameter_listregularizationr   r   exclude_from_weight_decayepsilonmulti_precisionrescale_grad	__class__s                g/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/incubate/optimizer/lars_momentum.pyr   zLarsMomentumOptimizer.__init__Z   s     (((###'%' 	 	
 	
 	
 $	! ,,"'(9":":g$,.0D++.GD+ /"<00!    c                    t          |t          j        t          j        f          st	          d          |D ]}| j        rK|                     |j                  r1|                     |          }| 	                    | j
        |           T|                     |j                  r| j        st          j        d           | 	                    | j
        |           d S )Nblock is not instance of Block.zAccumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Lars optimizer.)
isinstancer   Blockr   	TypeErrorr    _is_dtype_fp16_or_bf16dtype_create_master_weight_add_accumulator_velocity_acc_strwarningswarn)r#   blockr   pmaster_ps        r.   _create_accumulatorsz*LarsMomentumOptimizer._create_accumulators   s    %)/39!=>> 	?=>>> 	= 	=A$ )D)DQW)M)M 55a88%%d&<hGGG++AG44- X   !!$"8!<<<<	= 	=r/   c                 ^   t          |t          j        t          j        f          st	          d          | j        }|d         j        }t          | j                  dk    r| j        D ]
}||v rd} n| 	                    | j
        |d                   }|                     |          }| j        o|                     |d         j                  }|r| j        |d         j                 nd }	| j        | j        |g|| j        | j        d}
|d         |d         ||d}|d         |d}|r
|	|d<   |	|d	<   t)                      rWt+          j        |d         g|d         g|g|g|d         g|gd
| j        d| j        d|gd|d| j        d| j                  \  }}d S t/                      rft          |	t          j                  r|	g}	t3          j        |d         g|d         g|g|g|	| j        | j        |g| j        || j                  \  }}}d S |                    | j        |||
d          }|S )Nr1   r   g        )mur%   r&   r+   r*   r,      )ParamGradVelocityLearningRate)ParamOutVelocityOutMasterParamMasterParamOutrA   r%   r&   r+   r*   r,   T)r   inputsoutputsattrsstop_gradient)r2   r   r3   r   r4   r   r   lenr   _get_accumulator_masterr9   _create_param_lrr    r5   r6   r"   r   r   r   r!   r   r   r   r   Valuer   lars_momentum_	append_opr   )r#   r<   param_and_gradr   
param_namer   velocity_acclrfind_mastermaster_weightrM   rK   rL   tmptmp2_momentum_ops                    r.   _append_optimize_opz)LarsMomentumOptimizer._append_optimize_op   s   %)/39!=>> 	?=>>>!4#A&+
t.//!337  :%%),&E & 33"N1$5
 
 "">22+ 
0K0K1#1
 1

 D !2!788 	 .*"4!5*} .
 
 $A&"1%$	
 
  .a0NN 	6$1F=!(5G$% 0	%3"#"#"# ##$!"% IC( ]] 	-33 0!.+"#"# #$" GAq! 4  //Y" *  K r/   )
r   r   NNNNNr   Fr   )	__name__
__module____qualname____doc__r9   r   r?   r_   __classcell__)r-   s   @r.   r   r      s        < <| #  "&#" #" #" #" #" #"J= = =$_ _ _ _ _ _ _r/   r   )r:   paddler   r   r   paddle.baser   paddle.frameworkr   r   paddle.optimizerr	   r    r/   r.   <module>rj      s     - - - - - - - - - - ! ! ! ! ! !        ' & & & & &W W W W WI W W W W Wr/   