
    ёi6                        S SK Jr  S SKJr  S SKJrJr  S SKJr  SSK	J
r
Jr  SSKJr  SS	KJr  \(       a/  S S
KJr  S SKJr  S SKJr  S SKJr  S SKJr  SSKJr   " S S\5      r/ r " S S\5      rg)    )annotations)TYPE_CHECKING)_C_opspir)global_scope   )core	framework)Variable   )	Optimizer)Sequence)Callable)NotRequired)Tensor)GradientClipBase)_ParameterConfigc                  H    \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S	'   S
rg)_LambParameterConfig%   zNotRequired[float | Tensor]beta1beta2epsilonzNotRequired[float]lamb_weight_decayz,NotRequired[Callable[[Tensor], bool] | None]exclude_from_weight_decay_fn N)__name__
__module____qualname____firstlineno____annotations____static_attributes__r       U/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/optimizer/lamb.pyr   r   %   s$    ****,,--'
 	
r#   r   c                     ^  \ rS rSrSrSrSrSrSr           S                       SU 4S jjjr	SS jr
S	 rS
 rS rS rSrU =r$ )Lamb2   u  
LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.

LAMB Optimizer is designed to scale up the batch size of training without losing
accuracy, which supports adaptive element-wise updating and accurate layer-wise
correction. For more information, please refer to `Large Batch Optimization for
Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ .

The updating of parameters follows:

..  math::

    m_t &= \beta_1 m_{t - 1}+ (1 - \beta_1)g_t

    v_t &= \beta_2 v_{t - 1}  + (1 - \beta_2)g_t^2

    m_t &= \frac{m_t}{\beta_1^t}

    v_t &= \frac{v_t}{\beta_2^t}

    r_t &= \frac{m_t}{\sqrt{v_t}+\epsilon}

    w_t &= w_{t-1} -\eta_t \frac{\left \| w_{t-1}\right \|}{\left \| r_t + \lambda w_{t-1}\right \|} (r_t + \lambda w_{t-1})


where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the
learning rate, :math:`\\lambda` the LAMB weight decay rate.

Args:
    learning_rate (float|Tensor, optional): the learning rate used to update parameters. \
        Can be a float value or a Variable with data type float32. Default 0.001.
    lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01. Remind that weight_decay should be None.
    beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
        Default 0.9.
    beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
        Default 0.999.
    epsilon (float|Tensor, optional): A small float value for numerical stability. Default 1e-6.
    parameters (list|tuple|None, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
        This parameter is required in dygraph mode. And you can specify different options for \
        different parameter groups such as the learning rate, weight decay, etc, \
        then the parameters are list of dict. Note that the learning_rate in parameter groups \
        represents the scale of base learning_rate. \
        The default value is None in static graph mode, at this time all parameters will be updated.
    grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
        some derived class of ``GradientClipBase`` . There are three clipping strategies
        ( :ref:`api_paddle_base_clip_ClipGradByGlobalNorm` , :ref:`api_paddle_base_clip_ClipGradByNorm` ,
        :ref:`api_paddle_base_clip_ClipGradByValue` ). If you want better convergence, it is recommended
        to use :ref:`api_paddle_base_clip_ClipGradByGlobalNorm` . Default None, meaning there is no gradient clipping.
    exclude_from_weight_decay_fn (Callable|None, optional): whether to skip weight decay for a parameter when this function returns True while take the parameter as input.
    multi_precision (bool, optional) - Whether to use it during weight updates multi-precision, Default False。
    always_adapt (bool, optional): whether to use Layer-wise LR adaptation. By default, skip adaptation on parameters that are
        excluded from weight decay, unless always_adapt == True, then always enable LR adaptation.
    name(str|None, optional): For detailed information, please refer to
        :ref:`api_guide_Name` . Usually name is no need to set and None by default.
Examples:
    .. code-block:: python

        >>> import paddle

        >>> inp = paddle.uniform(shape=[10, 10], dtype='float32', min=-0.1, max=0.1)
        >>> linear = paddle.nn.Linear(10, 10)
        >>> out = linear(inp)
        >>> loss = paddle.mean(out)
        >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
        >>> beta2 = paddle.to_tensor([0.85], dtype="float32")
        >>> lamb = paddle.optimizer.Lamb(
        ...     learning_rate=0.002,
        ...     beta1=beta1,
        ...     beta2=beta2,
        ...     parameters=linear.parameters(),
        ...     lamb_weight_decay=0.01
        ... )
        >>> back = out.backward()
        >>> lamb.step()
        >>> lamb.clear_grad()

moment1moment2beta1_pow_accbeta2_pow_accc                   > Uc   eUc   eUc   eUc   e[         TU ]  UUS UUS9  SU l        X0l        X@l        XPl        X l        Xl        UUUUUS.U l        0 U l	        0 U l
        Xl        Xl        g )N)learning_rate
parametersweight_decay	grad_clipnamelamb)r   r   r   r   r   )super__init__type_beta1_beta2_epsilon_lamb_weight_decay_exclude_from_weight_decay_fn_default_dict_master_weights_used_master_weights_multi_precisionalways_adapt)selfr-   r   r   r   r   r.   r0   r   multi_precisionr?   r1   	__class__s               r$   r4   Lamb.__init__   s      (((      """'! 	 	
 	"3-I*!2,H
  "$&! /(r#   c                v   Uc
  [        5       nUR                  U5      R                  5       nU R                  R	                  U5      nUbk  UR                  U5      R                  5       nUR                  5       UR                  5       :w  d   eUR                  5       UR                  5       :X  d   e X54$ S nX54$ N)r   find_var
get_tensorr=   get_dtypeshape)r@   r1   scopep_tmaster_name
master_p_ts         r$   _get_parameterLamb._get_parameter   s    = NEnnT"--///33D9"4??AJ$$&#**,666##%444  Jr#   c                n   [        U[        R                  [        R                  45      (       d  [	        S5      e[        U[
        5      (       a  U R                  U5      nU H  nUR                  U R                  ;   a  M  U R                  (       ai  U R                  UR                  5      (       aI  U R                  U5      nU R                  U5        U R                  R                  UR                  5        M  U R                  U5        U R                  R                  UR                  5        M     g )Nblock is not instance of Block.)
isinstancer
   Blockr   	TypeErrordict_update_param_groupr1   _already_create_accumulatorr>   _is_dtype_fp16_or_bf16dtype_create_master_weight_add_moments_powsadd)r@   blockr.   pmaster_ps        r$   _create_accumulatorsLamb._create_accumulators   s    %)//399!=>>=>>j$''11*=J Avv999$$)D)DQWW)M)M55a8&&x00044QVV<&&q)0044QVV< r#   c           
        UR                   nU R                  U5      (       a$  [        R                  R                  R
                  nU R                  U R                  XS9  U R                  U R                  XS9  U R                  U R                  UU[        U R                  [        5      (       a  SOU R                  S/[        R                  R                  R                  SS9  U R                  U R                  UU[        U R                  [        5      (       a  SOU R                  S/[        R                  R                  R                  SS9  g )N)rZ   ?r   cpu)r1   paramrZ   
fill_valuerJ   r5   device+?)rZ   rY   r	   VarDescVarTypeFP32_add_accumulator_moment1_acc_str_moment2_acc_str_beta1_pow_acc_strrS   r6   r   DENSE_TENSOR_beta2_pow_acc_strr7   )r@   r_   	acc_dtypes      r$   r\   Lamb._add_moments_pows   s   GG	&&y11,,11Id33QHd33QH((!$++x88dkk#%%22 	 
	
 	((#DKK::#%%22 	 
	
r#   c                R   [        U[        R                  [        R                  45      (       d  [	        S5      e[        U[
        5      (       a  U R                  U5      nSUR                  l        U R                  U R                  US   5      nU R                  U R                  US   5      nU R                  U R                  US   5      nU R                  U R                  US   5      nU R                  b  U R                  US   5      (       a  SnOU R                  nU R!                  U5      nU R"                  =(       a    U R%                  US   R&                  5      n	US   R(                  n
U	(       a)  U R*                  U
   nUR(                  U R,                  U
'   OS n[        R.                  " 5       (       aS  [0        R2                  " US   US   UUUUUUS UU R4                  U R6                  U R8                  U R:                  U	5        g US   US   UUUUUS.nUS   UUUUS.nU R4                  U R6                  U R8                  UU R:                  U	S.nU	(       a  XS	'   XS
'   U R=                  S5      nU(       a  XS'   UR?                  U R@                  UUUSS9nU$ )NrR   Tr   g        r   )ParamGradLearningRateMoment1Moment2Beta1PowBeta2Pow)ParamOut
Moment1Out
Moment2OutBeta1PowOutBeta2PowOut)r   r   r   r/   r?   rA   MasterParamMasterParamOut	found_inf
SkipUpdate)r5   inputsoutputsattrsstop_gradient)!rS   r
   rT   r   rU   rV   rW   program	_use_lamb_get_accumulator_masterrn   ro   rp   rr   r:   r9   _create_param_lrr>   rY   rZ   r1   r<   r=   in_dynamic_or_pir_moder   lamb_r6   r7   r8   r?   _get_auxiliary_var	append_opr5   )r@   r^   param_and_gradr(   r)   r*   r+   r/   lrfind_masterp_namemaster_weightr   r   r   r   lamb_ops                    r$   _append_optimize_opLamb._append_optimize_op   s   %)//399!=>>=>>nd++!55nEN"&..!!>!#4
 ..!!>!#4
 44##^A%6
 44##^A%6

 ..:22>!3DEEL22L"">2++ 
0K0K1##1
  "'' 008M0=0B0BD%%f- M++--LLq!q!!!"  (*&q) """))F +1-%%,,G == , $ 1 1#.E (5}%,9()//<I'0|$ooYY" & G Nr#   c                   UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  S5      nU$ )Nr   r   r   r   r   params)rH   r;   r6   r7   r8   r9   r:   )r@   r.   s     r$   rW   Lamb._update_param_group[  s     nnWd.@.@.IJ nnWd.@.@.IJ"y$2D2DY2OP",..!3!34G!H#
 .8^^*=>.
*  ^^H-
r#   )r6   r7   r;   r8   r:   r9   r<   r>   r=   r?   r5   )gMbP?g{Gz?rd   ri   gư>NNNFFN)r-   float | Tensorr   floatr   r   r   r   r   r   r.   z8Sequence[Tensor] | Sequence[_LambParameterConfig] | Noner0   zGradientClipBase | Noner   zCallable[[Tensor], bool] | NonerA   boolr?   r   r1   z
str | NonereturnNonerE   )r   r   r   r    __doc__rn   ro   rp   rr   r4   rO   ra   r\   r   rW   r"   __classcell__)rB   s   @r$   r&   r&   2   s    L\ ! (( ).#' # %"& -1HL %",)%,) !,) 	,)
 ,)  ,) E,) +,) 'F,) ,) ,) ,) 
,) ,)\=$
<fP r#   r&   N)
__future__r   typingr   paddler   r   paddle.base.executorr   baser	   r
   base.frameworkr   	optimizerr   collections.abcr   r   typing_extensionsr   r   paddle.nn.clipr   r   r   __all__r&   r   r#   r$   <module>r      sS    #    - " %  (-/+
/ 
 u9 ur#   