
    ёib                       S SK Jr  S SKrS SKJr  S SKrS SKJrJr  S SKJ	r	  S SK
Jr  SSKJrJr  S	S
KJr  \(       a5  S SKJr  S SKJr  S SKJr  S SKJr  S SK
Jr  S	SKJr  S	SKJr   " S S\5      r/ r " S S\5      rg)    )annotationsN)TYPE_CHECKING)_C_opspir)in_dynamic_or_pir_mode)L2Decay   )core	framework   )	Optimizer)Sequence)NotRequired)Tensor)GradientClipBase)WeightDecayRegularizer)LRScheduler)_ParameterConfigc                  H    \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S	'   S
rg)_MomentumParameterConfig(   zNotRequired[float]momentumzNotRequired[bool]use_nesterovrescale_gradzNotRequired[str]regularization_methodregularization_coeff N)__name__
__module____qualname____firstlineno____annotations____static_attributes__r       Y/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/optimizer/momentum.pyr   r   (   s     $$''((//00r$   r   c                     ^  \ rS rSrSrSr          S                     SU 4S jjjrS rS rSU 4S jjr	S r
S	 rS
 rS rSrU =r$ )Momentum3   a  

Simple Momentum optimizer with velocity state

This optimizer has a flag for Nestrov Momentum.

The update equations are as follows:

.. math::

    & velocity = mu * velocity + gradient

    & if (use\_nesterov):

    &\quad   param = param - (gradient + mu * velocity) * learning\_rate

    & else:

    &\quad   param = param - learning\_rate * velocity

Parameters:

    learning_rate (float|Tensor|LRScheduler, optional): The learning rate used to update ``Parameter``.
        It can be a float value, a ``Tensor`` with a float type or a LRScheduler. The default value is 0.001.
    momentum (float): Momentum factor. The default value is 0.9.
    parameters (list|tuple|None, optional): List|Tuple of ``Tensor`` to update to minimize ``loss``. \
        This parameter is required in dygraph mode. And you can specify different options for \
        different parameter groups such as the learning rate, weight decay, etc, \
        then the parameters are list of dict. Note that the learning_rate in parameter groups \
        represents the scale of base learning_rate. \
        The default value is None in static graph mode, at this time all parameters will be updated.
    use_nesterov(bool, optional): Enables Nesterov momentum. The default value is False.
    weight_decay (int|float|WeightDecayRegularizer|None, optional): The strategy of regularization. \
        It can be a int or float value as coeff of L2 regularization or \
        :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
        If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
        the regularization setting here in optimizer will be ignored for this parameter. \
        Otherwise, the regularization setting here in optimizer will take effect. \
        Default None, meaning there is no regularization.
    grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of
        some derived class of ``GradientClipBase`` . There are three clipping strategies
        ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
        :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
    multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
    rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
        Often choose to be ``1.0/batch_size``.
    use_multi_tensor (bool, optional): Whether to use multi-tensor strategy to update all parameters at once . Default is false.
    name (str|None, optional): The default value is None. Normally there is no need for user
            to set this property. For more information, please refer to
            :ref:`api_guide_Name` .

Examples:
    .. code-block:: python

        >>> import paddle

        >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
        >>> linear = paddle.nn.Linear(10, 10)
        >>> inp = paddle.to_tensor(inp)
        >>> out = linear(inp)
        >>> loss = paddle.mean(out)
        >>> momentum = paddle.optimizer.Momentum(
        ...     learning_rate=0.1,
        ...     parameters=linear.parameters(),
        ...     weight_decay=0.01
        ... )
        >>> back = out.backward()
        >>> momentum.step()
        >>> momentum.clear_grad()

        >>> # Note that the learning_rate of linear_2 is 0.01.
        >>> linear_1 = paddle.nn.Linear(10, 10)
        >>> linear_2 = paddle.nn.Linear(10, 10)
        >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
        >>> out = linear_1(inp)
        >>> out = linear_2(out)
        >>> loss = paddle.mean(out)
        >>> momentum = paddle.optimizer.Momentum(
        ...     learning_rate=0.1,
        ...     parameters=[{ # type: ignore
        ...         'params': linear_1.parameters()
        ...     }, {
        ...         'params': linear_2.parameters(),
        ...         'weight_decay': 0.001,
        ...         'learning_rate': 0.1
        ...     }],
        ...     weight_decay=0.01,
        ...     momentum=0.9
        ... )
        >>> out.backward()
        >>> momentum.step()
        >>> momentum.clear_grad()

velocityc                  > Uc  [        S5      eUc  [        S5      e[        U[        5      (       a  [        U5      nS n[        U[        5      (       a_  [        US   [
        5      (       aG  U HA  nSU;   a  US   OUnU R                  U5      u  pXS'   XS'   U" U5      (       a  S OUnUUS'   MC     U" U5      (       a  S OUn[        TU ]!  UUUUU
S9  S	U l	        X l
        [        U5      U l        U R                  U5      u  U l        U l        Xpl        Xl        0 U l        UUUU R                  U R                  S
.U l        Xl        U R&                  (       ay  U R)                  5       U l        U R)                  5       U l        U R)                  5       U l        S U R.                  S'   U R)                  5       U l        U R)                  5       U l        g g )Nzlearning_rate is not setzmomentum is not setc                .    [        U [        [        45      $ N)
isinstancer   float)regulars    r%   <lambda>#Momentum.__init__.<locals>.<lambda>   s    Jw%8H$Ir$   r   weight_decayr   r   )learning_rate
parametersr2   	grad_clipnamer   )r   r   r   r   r   FP32_DenseTensor)
ValueErrorr-   intr.   listdict_update_regularizationsuper__init__type	_momentumbool_use_nesterov_regularization_method_regularization_coeff_multi_precision_rescale_grad_master_weights_default_dict_use_multi_tensor_create_multi_tensor_dict_param_dict_velocity_dict_master_weight_dict_regularization_method_dict_regularization_coeff_dict)selfr3   r   r4   r   r2   r5   multi_precisionr   use_multi_tensorr6   	predicateparam_groupdecay
reg_method	reg_coeff
py_regular	__class__s                    r%   r>   Momentum.__init__   s     788233lC(( .LI	j$''*Q-..#-K *[8 $N3) 
 -1,G,G,N)J;E 78:C 67)25)9)9uJ2<K/ $. '|44T,
'!# 	 	
 	!!,/ ''5	
'& /)! !((%)%@%@$($>$>
 "2!!#==?D"&"@"@"BD'+'E'E'GD$;?D$$%78/3/M/M/OD,.2.L.L.ND+ "r$   c                    SnSn[        U[        5      (       a  SnUR                  n[        U[        5      (       a  SnUnX#4$ )N         l2_decay)r-   r   _coeffr.   )rP   r2   rV   rW   s       r%   r<   Momentum._update_regularization   sI    
	lG,,#J$++IlE**#J$I$$r$   c                .   [        U[        R                  [        R                  R                  45      (       d   e[        U[
        5      (       a  U R                  U5      nU GH-  nUR                  U R                  ;   a  M   U R                  (       at  U R                  UR                  5      (       aT  U R                  U5      nU R                  U R                  U5        U R                  R                  UR                  5        M  U R                  UR                  5      (       a'  U R                  (       d  [         R"                  " S5        U R                  U R                  U5        U R                  R                  UR                  5        GM0     g)z,
if framework.in_dynamic_mode():
    return
zAccumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Momentum optimizer.N)r-   r   Blockpaddler   r;   _update_param_groupr6   _already_create_accumulatorrE   _is_dtype_fp16_or_bf16dtype_create_master_weight_add_accumulator_velocity_acc_straddwarningswarn)rP   blockr4   pmaster_ps        r%   _create_accumulatorsMomentum._create_accumulators   s$   
 %)//6::3C3C!DEEEEj$''11*=JAvv999$$)D)DQWW)M)M55a8%%d&<&<hG0044QVV<++AGG44--\ !!$"8"8!<,,008# r$   c                   > [        US5      (       a!  [        UR                  [        5      (       a  U$ [        TU ]  XU5      $ )z`Create and add backward regularization Operators

Function helper of append_regularization_ops.
regularizer)hasattrr-   rt   r   r=   _create_regularization_of_grad)rP   paramgradregularizationrY   s       r%   rv   'Momentum._create_regularization_of_grad  sG     5-((Zw.
 .
 Kw5
 	
r$   c                   [        U[        R                  [        R                  45      (       d  [	        S5      e[        U[
        5      (       a  U R                  U5      nU R                  U R                  US   5      nU R                  U5      nUS   nU R                  nU R                  n[        US5      (       aI  [        UR                  [        5      (       a  SnUR                  R                  nOUR                  b  SnSnU R                   =(       a    U R#                  US   R$                  5      nU(       a  U R&                  US   R(                     OS n	[+        5       (       am  [        U[
        5      (       a  U R-                  US   5        [.        R0                  " US   US   UUU	U R2                  U R4                  UUUU R6                  5      $ U R2                  U R4                  UUUU R6                  S	.n
US   /US   /U/U/S
.nUS   /U/S.nU(       a  XS'   XS'   UR9                  U R:                  UUU
SS9nU$ )Nzblock is not instance of Block.r   rt   r^   r\   r]   r2   r   )mur   r   r   rQ   r   ParamGradVelocityLearningRateParamOutVelocityOutMasterParamMasterParamOutTr?   inputsoutputsattrsstop_gradient)r-   r   rb   r   	TypeErrorr;   rd   _get_accumulator_masterrj   _create_param_lrrC   rD   ru   rt   r   r_   rE   rf   rg   rG   r6   r   r<   r   	momentum_r@   rB   rF   	append_opr?   )rP   rn   param_and_gradvelocity_acclrrw   r   r   find_mastermaster_weightr   r   r   momentum_ops                 r%   _append_optimize_opMomentum._append_optimize_op  sj   %)//399!=>>=>>nd++!55nEN33""N1$5
 "">2 q! $ ; ;#995-((%++W55(2%','8'8'?'?$"".(*%'*$++ 
0K0K1##1

    !2!7!78 	 "##.$//++N>,JK##q!q!""%$""  nn $ 2 2)>(<#. $ 2 2E )+,'*+)N!#	F ,A./ ,~G
 (5}%,9()  //YY" * K r$   c                   U R                  X5        U GHB  nU R                  U R                  U5      nU R                  nU R                  n[        US5      (       aI  [        UR                  [        5      (       a  SnUR                  R                  nOUR                  b  SnSnUR                  [        R                  :X  a  U R                  S   U   R                  U5        U R                  S   U   R                  U5        U R                   S   U   R                  U5        U R"                  S   U   R                  U5        GM7  U R%                  UR                  5      (       a  U R                  S   U   R                  U5        U R                  S   U   R                  U5        U R&                  (       a9  U R(                  S   U   R                  U R*                  UR,                     5        OSU R(                  S   U'   U R                   S   U   R                  U5        U R"                  S   U   R                  U5        GM:  [/        S5      e   g)	a  
All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, bf16, float32).
This function will be overridden in the corresponding optimizer file.

Args:
    target_block: the block in which the loss tensor is present
    parameters: list of parameter tensors for the optimizer
rt   r^   Nr\   r]   r7   FP16_DenseTensorz^Now multi_tensor_momentum only support fp32, fp16 or bf16 parameters and grad is DENSE_TENSOR.)rq   r   rj   rC   rD   ru   r-   rt   r   r_   rg   rc   float32rK   appendrL   rN   rO   rf   rE   rM   rG   r6   r8   )rP   target_blockr4   param_group_idxrw   r   r   r   s           r%   _multi_tensor_initMomentum._multi_tensor_initl  sH    	!!,;E77&&L %)$?$?!#'#=#= um,,e//99,6)+0+<+<+C+C(&&2,.)+.({{fnn,  !34_ELL ##$67HOO  001CD#&.///0BC#&-.,,U[[99  !34_ELL ##$67HOO  ((,,-?@'fT11%**=>  ,,-?@' 001CD#&.///0BC#&-. t e  r$   c                   [        U[        R                  5      (       d   e/ / S.n/ / S.n[        U[        5      (       GaV  U GHM  nUS   c  M  US   R                  SL d  M   US   R
                  [        R                  :X  as  US   R                  [        R                  R                  R                  :X  a>  US   R                  US   5        U R                  U5      nUS   R                  U5        M  U R                  US   R
                  5      (       d  M  US   R                  [        R                  R                  R                  :X  d  GM  US   R                  US   5        U R                  U5      nUS   R                  U5        GMP     GOUS    GH  nUS   c  M  US   R                  SL d  M   0 nXhS'   UR!                  UR#                  5        V	V
s0 s H  u  pU	S:w  d  M  X_M     sn
n	5        U R%                  U5      nUS   R
                  [        R                  :X  at  US   R                  [        R                  R                  R                  :X  a?  US   R                  US   5        U R                  U5      nUS   R                  U5        GM  U R                  US   R
                  5      (       d  GM.  US   R                  [        R                  R                  R                  :X  d  GMf  US   R                  US   5        U R                  U5      nUS   R                  U5        GM     SS/nU GH  n['        U R(                  U   U   5      S:  d  M%  U R*                  =(       a    US:H  nU R,                  U   nUb  X   OSn[/        5       (       GaO  U R1                  S	5      nU(       aY  [        U[        R2                  R4                  [        R6                  R8                  45      (       a  U R;                  S	S
5        M  M  [        U[        R2                  R4                  [        R6                  R8                  45      (       a  U R;                  S	S5        [<        R>                  " U R(                  U   U   XL   U R@                  U   U   X\   UU RB                  U RD                  U RF                  U   U   U RH                  U   U   UU RJ                  5      u      nGM  U R(                  U   U   XL   U R@                  U   U   X\   S.nU R(                  U   U   U R@                  U   U   S.nU RB                  U RD                  U RF                  U   U   U RH                  U   U   S.nU(       a/  U R,                  U   U   US'   U R,                  U   U   US'   UUS'   URM                  SUUUS
S9  GM     gs  sn
n	f )z=
For Multi Tensor, append optimize merged_operator to block.
)r7   r   r   Nr   Fr7   r   params	found_infTr}   r   )r|   r   r   r   r   r   rQ   merged_momentumr   )'r-   r   rb   r:   r   rg   rc   r   r?   r
   VarDescVarTypeDENSE_TENSORr   r   rf   updateitemsrd   lenrK   rE   rM   r   _get_auxiliary_vareagerr   r   Value_set_auxiliary_varr   merged_momentum_rL   r@   rB   rN   rO   rF   r   )rP   r   parameters_and_gradsr   	grad_dictlr_dictr   r   param_grad_dictkvmulti_tensor_listkeyr   r   r   _r   r   r   s                       r%    _append_optimize_multi_tensor_op)Momentum._append_optimize_multi_tensor_op  s    ,	8888)+D	')rB*D11"6!!$,!!$22e;&q)//6>>A*1-22<<//<<= ""45<<^A=NO!22>B 23::2>33N14E4K4KLL*1-22<<//<<= ""45<<^A=NO!22>B 23::2>' #7* #7x"@!!$,!!$22e;&(O0>H-#** )=(B(B(D(D H} !AD(D &*%=%=o%NN&q)//6>>A*1-22<<//<<= ""45<<^A=NO!22>B 23::2>33N14E4K4KLL*1-22<<//<<= ""45<<^A=NO!22>B 23::2>; #A> 01CD$C4##C(9:Q>))Gc5G.G  !% 8 8 = %0 "2  *++ $ 7 7 DI %%

(9(96::;K;K'L  !33KF
 &%

(9(96::;K;K'L  !33KG"("9"9 ,,S1/B%N //4_E#L) NN .. <<SA / !;;C@ / ( ..#1a& "&!1!1#!6!G )$($7$7$<_$M(/	F %)$4$4S$9/$J'+':':3'?+(G #nn(,(:(:151Q1Q2)2+ 150O0O1)1+	E #040H0H0M+1}- 594L4L5)5+ 01 4?/0 **.% '#&* + S %3s   !W1Wc                   UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  S5      nU$ )Nr   r   r   r   r   r   )getrH   r@   rB   rF   rC   rD   )rP   r4   s     r%   rd   Momentum._update_param_groupA  s    #**:6
 (^^D..~>
 (^^D..~>
 '1nn#T%7%78O%P'
# &0^^"D$6$67M$N&
"  ^^H-
r$   )rH   rM   rG   r@   rE   rK   rD   rO   rC   rN   rF   rI   rB   rL   r?   )
gMbP?g?NFNNFg      ?FN)r3   zfloat | Tensor | LRSchedulerr   r.   r4   z<Sequence[Tensor] | Sequence[_MomentumParameterConfig] | Noner   rA   r2   z%float | WeightDecayRegularizer | Noner5   zGradientClipBase | NonerQ   rA   r   r.   rR   rA   r6   z
str | NonereturnNoner,   )r   r   r    r!   __doc__rj   r>   r<   rq   rv   r   r   r   rd   r#   __classcell__)rY   s   @r%   r'   r'   3   s    ]~ # 7< ">B-1 %!!&FO3FO FO
 IFO FO <FO +FO FO FO FO FO 
FO FOP
%9:
Un>@Sj r$   r'   )
__future__r   rl   typingr   rc   r   r   paddle.frameworkr   paddle.regularizerr   baser
   r   	optimizerr   collections.abcr   typing_extensionsr   r   paddle.nn.clipr   r   r   r   r   r   __all__r'   r   r$   r%   <module>r      s\    #      3 & "  (-/9+1#3 1 _y _r$   