
    ёi>                        S SK Jr  S SKrS SKJr  S SKJrJr  SSKJ	r	J
r
  SSKJr  SSKJr  S	S
KJr  \(       a5  S SKJr  S SKJr  S SKJr  S SKJr  S SKJr  S	SKJr  S	SKJr   " S S\5      r/ r " S S\5      rg)    )annotationsN)TYPE_CHECKING)_C_opspir   )core	framework)no_grad)
name_scope   )	Optimizer)Sequence)NotRequired)Tensor)GradientClipBase)WeightDecayRegularizer)LRScheduler)_ParameterConfigc                  4    \ rS rSr% S\S'   S\S'   S\S'   Srg)_AdamaxParameterConfig&   zNotRequired[float | Tensor]beta1beta2epsilon N)__name__
__module____qualname____firstlineno____annotations____static_attributes__r       W/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/optimizer/adamax.pyr   r   &   s    ****,,r"   r   c                     ^  \ rS rSr% SrS\S'   SrSrSr        S                 SU 4S jjjr	S	 r
S
 rS rS rS rSrU =r$ )Adamax/   a  
The Adamax optimizer is implemented based on the Adamax Optimization
in Section 7 of `Adam paper <https://arxiv.org/abs/1412.6980>`_.
The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm,
which makes the learning rate update algorithm more stable and simple.

The parameter ``param_out`` update rule with gradient ``grad``:

.. math::

    t & = t + 1

    moment\_out & = {\beta}_1 * moment + (1 - {\beta}_1) * grad

    inf\_norm\_out & = max({\beta}_2 * inf\_norm + \epsilon, |grad|)

    learning\_rate & = \frac{learning\_rate}{1 - {\beta}_1^t}

    param\_out & = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out}

Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_

The original paper does not have an ``epsilon`` attribute,
it is added here for numerical stability to prevent the division by 0 error.

Args:
    learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
        It can be a float value or a LRScheduler. The default value is 0.001.
    beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
        It should be a float number or a 0-D Tensor with shape [] and data type as float32.
        The default value is 0.9.
    beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
        It should be a float number or a 0-D Tensor with shape [] and data type as float32.
        The default value is 0.999.
    epsilon (float|Tensor, optional): A small float value for numerical stability.
        The default value is 1e-08.
    parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
        This parameter is required in dygraph mode. And you can specify different options for
        different parameter groups such as the learning rate, weight decay, etc,
        then the parameters are list of dict. Note that the learning_rate in parameter groups
        represents the scale of base learning_rate.
        The default value is None in static graph mode, at this time all parameters will be updated.
    weight_decay (int|float|WeightDecayRegularizer|None, optional): The strategy of regularization.
        It can be a int or float value as coeff of L2 regularization or
        :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
        If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already,
        the regularization setting here in optimizer will be ignored for this parameter.
        Otherwise, the regularization setting here in optimizer will take effect.
        Default None, meaning there is no regularization.
    grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of
        some derived class of ``GradientClipBase`` . There are three clipping strategies
        ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
        :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
    name (str|None, optional): Normally there is no need for user to set this property.
        For more information, please refer to :ref:`api_guide_Name`.
        The default value is None.

**Notes**:
    **Currently, Adamax doesn't support sparse parameter optimization.**

Examples:
    .. code-block:: python

        >>> import paddle

        >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
        >>> linear = paddle.nn.Linear(10, 10)
        >>> inp = paddle.to_tensor(inp)
        >>> out = linear(inp)
        >>> loss = paddle.mean(out)

        >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
        >>> beta2 = paddle.to_tensor([0.99], dtype="float32")

        >>> adamax = paddle.optimizer.Adamax(
        ...     learning_rate=0.1,
        ...     parameters=linear.parameters(),
        ...     beta1=beta1,
        ...     beta2=beta2,
        ...     weight_decay=0.01
        ... )
        >>> out.backward()
        >>> adamax.step()
        >>> adamax.clear_grad()


        >>> # Note that the learning_rate of linear_2 is 0.01.
        >>> linear_1 = paddle.nn.Linear(10, 10)
        >>> linear_2 = paddle.nn.Linear(10, 10)
        >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
        >>> out = linear_1(inp)
        >>> out = linear_2(out)
        >>> loss = paddle.mean(out)
        >>> adamax = paddle.optimizer.Adamax(
        ...     learning_rate=0.1,
        ...     parameters=[{  # type: ignore
        ...         'params': linear_1.parameters()
        ...     }, {
        ...         'params': linear_2.parameters(),
        ...         'weight_decay': 0.001,
        ...         'learning_rate': 0.1,
        ...         'beta1': 0.8
        ...     }],
        ...     weight_decay=0.01,
        ...     beta1=0.9
        ... )
        >>> out.backward()
        >>> adamax.step()
        >>> adamax.clear_grad()
strtypemomentinf_normbeta1_pow_accc	                >  > Uc   eUc   eUc   eUc   eSUs=::  a  S:  d  O  [        S5      eSUs=::  a  S:  d  O  [        S5      eSU::  d  [        S5      e[        T	U ]	  UUUUUS9  SU l        X l        X0l        X@l        SU l        0 U l        UUUS	.U l	        g )
Nr   r   z.Invalid value of beta1, expect beta1 in [0,1).z.Invalid value of beta2, expect beta2 in [0,1).z.Invalid value of epsilon, expect epsilon >= 0.)learning_rate
parametersweight_decay	grad_clipnameadamaxF)r   r   r   )

ValueErrorsuper__init__r(   _beta1_beta2_epsilon_multi_precision_master_weights_default_dict)
selfr-   r   r   r   r.   r/   r0   r1   	__class__s
            r#   r5   Adamax.__init__   s     (((      """E~A~MNNE~A~MNNG|MNN'!% 	 	
 	 %! 
r"   c                F   UR                   nU R                  U5      (       a$  [        R                  R                  R
                  nU R                  U R                  XS9  U R                  U R                  XS9  U R                  U R                  UU R                  S/S9  g )N)dtyper   )r1   param
fill_valueshape)r@   _is_dtype_fp16_or_bf16r   VarDescVarTypeFP32_add_accumulator_moment_acc_str_inf_norm_acc_str_beta1_pow_acc_strr6   )r<   p	acc_dtypes      r#   _add_moments_powsAdamax._add_moments_pows   s    GG	&&y11,,11Id22AGd44aI(({{#	 	 	
r"   c                   [        U[        5      (       a  U R                  U5      nU GH  nUR                  U R                  ;   a  M   U R
                  (       ai  U R                  UR                  5      (       aI  U R                  U5      nU R                  U5        U R                  R                  UR                  5        M  U R                  UR                  5      (       a'  U R
                  (       d  [        R                  " S5        U R                  U5        U R                  R                  UR                  5        GM     g )NzAccumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Adam optimizer.)
isinstancedict_update_param_groupr1   _already_create_accumulatorr9   rD   r@   _create_master_weightrN   addwarningswarn)r<   blockr.   rL   master_ps        r#   _create_accumulatorsAdamax._create_accumulators   s    j$''11*=J Avv999$$)D)DQWW)M)M55a8&&x00044QVV<++AGG44--X ""1%,,008# r"   c                   [        U[        R                  [        R                  45      (       d   e[        U[        5      (       a  U R                  U5      nU R                  U R                  US   5      nU R                  U R                  US   5      nU R                  =(       a    U R                  US   R                  5      nU(       a  U R                  US   R                     OS nU R                  U R                  US   5      n[        R                  " 5       (       aT  [         R"                  " US   US   U R%                  U5      UUUUU R&                  U R(                  U R*                  U5        g US   US   U R%                  U5      UUUS.nUS   UUS.n	U(       a  XhS'   XiS'   U R&                  U R(                  U R*                  US.n
UR-                  U R.                  UU	U
SS	9nU$ )
Nr   r   )ParamGradLearningRateMomentInfNormBeta1Pow)ParamOut	MomentOut
InfNormOutMasterParamMasterParamOut)r   r   r   multi_precisionTr(   inputsoutputsattrsstop_gradient)rQ   r	   Blockr   rR   rS   _get_accumulator_masterrI   rJ   r9   rD   r@   r:   r1   rK   in_dynamic_or_pir_moder   adamax__create_param_lrr6   r7   r8   	append_opr(   )r<   rY   param_and_gradr)   r*   find_mastermaster_weightr+   rk   rl   rm   	adamax_ops               r#   _append_optimize_opAdamax._append_optimize_op   s   %)//399!=>>>>nd++!55nEN--  ."3
 //""N1$5
 ++ 
0K0K1##1

    !2!7!78 	 44##^A%6
 ++--NNq!q!%%n5" (*&q) $ 5 5n E #)F +1-#&G
 (5}%,9()==#.	E YY" ( I r"   c                   [        U[        R                  [        R                  45      (       d   e[        U[        5      (       Ga  U GH  u  p4Ub  UR
                  SL a  M  [        R                  " 5       (       ae  U R                  U R                  U5      n[        5          [        R                  " XPR                  SS5      nUR                  US5        SSS5        M  [        R                  " 5       (       ap  UR                  R                   R#                  X4/5         U R                  U R                  U5      n[        R$                  " XPR                  SS5        SSS5        GM#  UR                  R                   R#                  X4/5         ['        S5         U R                  U R                  U5      nUR)                  SSU0SU0SU R                  0SS	9  SSS5        SSS5        GM     gUS
    GHc  u  p4Ub  UR
                  SL a  M  [        R                  " 5       (       a  U R                  U R                  U5      nUR+                  SU R,                  S   5      U l        [        5          [        R                  " XPR                  SS5      nUR                  US5        SSS5        M  UR                  R                   R#                  X4/5         ['        S5         U R                  U R                  U5      nUR+                  SU R,                  S   5      U l        UR)                  SSU0SU0SU R                  0SS	9  SSS5        SSS5        GMf     g! , (       d  f       GM)  = f! , (       d  f       GM<  = f! , (       d  f       GN= f! , (       d  f       GMa  = f! , (       d  f       GM  = f! , (       d  f       Nz= f! , (       d  f       GM  = f)zUpdate Beta1 Power accumulatorNTg        Fr2   scaleXOutrj   paramsr   )rQ   r	   ro   r   listrn   in_dygraph_moderp   rK   r
   r   r|   r6   copy_in_pir_moderY   program_optimized_guardscale_r   rt   getr;   )r<   rY   parameters_and_gradsrA   gradr+   tmps          r#   _finish_updateAdamax._finish_update>  s1   %)//399!=>>>>*D113<5#6#6$#>,,..$($@$@//%M !$ll);;T &++C7	 #
 **,,,,==umL(,(D(D 33U) m[[#tL	 ML ++<<e]K"8,(,(D(D 33U) !($'#7%*M$:#*DKK"8*. (  - LK)  4D  4H=<5#6#6$#>,,..$($@$@//%M #7":":!3!3G!<#DK !$ll);;T &++C7	 # ++<<e]K"8,(,(D(D 33U) ';&>&>#T%7%7%@' !($'#7%*M$:#*DKK"8*. (  - LK#  >7 # ML -, LK0 # -, LKsn   $5M##?M6NA N	N85N.O)A$OO#
M3	6
N		
NN
N+	.
N>	
OO
O"	c                    UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  S5      nU$ )Nr   r   r   r   )r   r;   r6   r7   r8   )r<   r.   s     r#   rS   Adamax._update_param_group  sm     nnWd.@.@.IJ nnWd.@.@.IJ"y$2D2DY2OP^^H-
r"   )r6   r7   r;   r8   r:   r9   r(   )gMbP?g?g+?g:0yE>NNNN)r-   zfloat | LRSchedulerr   float | Tensorr   r   r   r   r.   z:Sequence[Tensor] | Sequence[_AdamaxParameterConfig] | Noner/   z%float | WeightDecayRegularizer | Noner0   zGradientClipBase | Noner1   z
str | NonereturnNone)r   r   r   r   __doc__r    rI   rJ   rK   r5   rN   r[   ry   r   rS   r!   __classcell__)r=   s   @r#   r%   r%   /   s    m^ IO"( .3 # %"& >B-1)
*)
 )
 	)

  )
 G)
 <)
 +)
 )
 
)
 )
V
90GRFP r"   r%   ) 
__future__r   rW   typingr   paddler   r   baser   r	   base.dygraphr
   base.frameworkr   	optimizerr   collections.abcr   typing_extensionsr   r   paddle.nn.clipr   paddle.regularizerr   lrr   r   r   __all__r%   r   r"   r#   <module>r      sY    #     " " '  (-/9+-!1 - \Y \r"   