
    ёip                    ,   S SK Jr  S SKrS SKJr  S SKJr  S SKJr  S SK	r	S SK	J
r
  S SKJr  S SKJrJrJr  S S	KJr  S
SKJr  S
SKJrJr  S
SKJr  S
SKJrJrJrJr  S
SKJ r   SSK!J"r"  SSK#J$r$  \(       a  S SKJ%r%  S SK	J&r&  SSK'J(r(  / r) " S S\$5      r*g)    )annotationsN)defaultdict)Callable)TYPE_CHECKING)pir)DataType)ShardedStateDictShardedWeight$create_sharded_weight_with_new_local)Value   )_C_ops)core	framework)base)	ParameterVariablein_dynamic_or_pir_modein_pir_mode)GradientClipBase   )LRScheduler)	Optimizer)Sequence)Tensor)_AdamParameterConfigc                  .   \ rS rSr% SrS\S'   S\S'   SrSrS	rS
r	Sr
              S                             SS jjrS rS rS rS rS rS rS r\R(                  \R,                  SS j5       5       rS r    SS jrSrg)AdamW6   a  
The AdamW optimizer is implemented based on the AdamW Optimization
in paper `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
it can resolves the problem of L2 regularization failure in the Adam optimizer.

.. math::

    \begin{aligned}
        &\hspace{5mm} t = t + 1 \\
        &\hspace{5mm} moment\_1\_out = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad \\
        &\hspace{5mm} moment\_2\_out = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad \\
        &\hspace{5mm} learning\_rate = learning\_rate * \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t} \\
        &\hspace{5mm}\textbf{if} \: \textit{amsgrad}: \\
        &\hspace{15mm} moment\_2\_max\_out = max(moment\_2\_out, moment\_2\_max) \\
        &\hspace{15mm} param\_out = param - learning\_rate * (\frac{moment\_1\_out}{\sqrt{moment\_2\_max\_out} + \epsilon} + \lambda * param) \\
        &\hspace{5mm}\textbf{else}: \: \\
        &\hspace{15mm} param\_out = param - learning\_rate * (\frac{moment\_1\_out}{\sqrt{moment\_2\_out} + \epsilon} + \lambda * param) \\
    \end{aligned}

Args:
    learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
        It can be a float value or a LRScheduler. The default value is 0.001.
    beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
        It should be a float number or a 0-D Tensor with shape [] and data type as float32.
        The default value is 0.9.
    beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
        It should be a float number or a 0-D Tensor with shape [] and data type as float32.
        The default value is 0.999.
    epsilon (float|Tensor, optional): A small float value for numerical stability.
        The default value is 1e-08.
    parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``.
        This parameter is required in dygraph mode. And you can specify different options for
        different parameter groups such as the learning rate, weight decay, etc,
        then the parameters are list of dict. Note that the learning_rate in parameter groups
        represents the scale of base learning_rate.
        The default value is None in static graph mode, at this time all parameters will be updated.
    weight_decay (int|float|Tensor, optional): The weight decay coefficient, it can be int, float or Tensor. The default value is 0.01.
    lr_ratio (Callable|None, optional): If it is not None,
        the learning rate will be updated with layer-wise learning rate ratio.
        Otherwise, the learning rate is the original.
        Default: None.
    apply_decay_param_fun (Callable|None, optional): If it is not None,
        only tensors that makes apply_decay_param_fun(Tensor.name)==True
        will be updated with weight decay. It only works when we want to specify tensors.
        Default: None.
    grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of
        some derived class of ``GradientClipBase`` . There are three clipping strategies
        ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
        :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
    lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
        The accumulators are updated at every step. Every element of the two moving-average
        is updated in both dense mode and sparse mode. If the size of parameter is very large,
        then the update may be very slow. The lazy mode only update the element that has
        gradient in current mini-batch, so it will be much more faster. But this mode has
        different semantics with the original Adam algorithm and may lead to different result.
        The default value is False.
    multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
    amsgrad (bool, optional): Whether to use the AMSGrad variant of this algorithm from the paper
        `On the Convergence of Adam and Beyond <https://openreview.net/forum?id=ryQu7f-RZ>`_. Default is false.
    name (str|None, optional): Normally there is no need for user to set this property.
        For more information, please refer to :ref:`api_guide_Name`.
        The default value is None.
Notes:
    **Currently, AdamW doesn't support sparse parameter optimization.**

Examples:
    .. code-block:: python

        >>> import paddle

        >>> linear = paddle.nn.Linear(10, 10)
        >>> inp = paddle.rand([10,10], dtype="float32")
        >>> out = linear(inp)
        >>> loss = paddle.mean(out)

        >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
        >>> beta2 = paddle.to_tensor([0.99], dtype="float32")

        >>> opt = paddle.optimizer.AdamW(
        ...     learning_rate=0.1,
        ...     parameters=linear.parameters(),
        ...     beta1=beta1,
        ...     beta2=beta2,
        ...     weight_decay=0.01
        ... )
        >>> loss.backward()
        >>> opt.step()
        >>> opt.clear_grad()


        >>> # Note that the learning_rate of linear_2 is 0.01.
        >>> linear_1 = paddle.nn.Linear(10, 10)
        >>> linear_2 = paddle.nn.Linear(10, 10)
        >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
        >>> out = linear_1(inp)
        >>> out = linear_2(out)
        >>> loss = paddle.mean(out)
        >>> opt = paddle.optimizer.AdamW(
        ...     learning_rate=0.1,
        ...     parameters=[{  # type: ignore
        ...         'params': linear_1.parameters()
        ...     }, {
        ...         'params': linear_2.parameters(),
        ...         'weight_decay': 0.001,
        ...         'learning_rate': 0.1,
        ...         'beta1': 0.8
        ...     }],
        ...     weight_decay=0.01,
        ...     beta1=0.9
        ... )
        >>> loss.backward()
        >>> opt.step()
        >>> opt.clear_grad()

Nonehelperstrtypemoment1moment2moment2_maxbeta1_pow_accbeta2_pow_accNc                	   Uc   eUc   eUc   eUc   e[        U[        5      (       d  SUs=::  a  S:  d  O  [        S5      e[        U[        5      (       d  SUs=::  a  S:  d  O  [        S5      e[        U[        5      (       d  SU::  d  [        S5      e[        U[        [        45      (       d0  [        U[
        R                  [        45      (       d  [        S5      eUb  [        U[        5      (       d   e[        R                  " 5       (       du  [        R                  " 5       (       d[  [        R                  R                  5       R                  S5      S   [        R                  R!                  5       ;  a  [#        S5      eUbh  [        U[        R$                  5      (       a  [        S	['        U5       S
35      e[        U[(        5      (       a  [        S5      e[+        U5      U l        OS U l        Xl        [
        R0                  " 5       (       a  U R,                  c  [3        S5      e[        U[        [4        45      (       d  [        S['        U5       S35      eU
b   [        U
[6        5      (       d  [        S5      eS U l        U R,                  (       a  [        U R,                  S   [(        5      (       aE  U R,                   H  nSU;   a  M   S5       e   U R,                  S   S   S   R:                  U l        OU R,                  S   R:                  U l        0 U l        [?        S 5      U l         S U l!        / U l"        0 U l#        0 U l$        U RJ                  U l&        SU l        Xl'        [Q        5       U l)        Xl*        [	        U5      U l+        Xpl,        Xl-        Xl.        X l/        X0l0        X@l1        Xl2        Xl3        0 U l4        Xl5        [	        U5      UUUUU
S.U l6        / U l7        U R,                  (       aU  [        U R,                  S   [(        5      (       a3  U R,                   H"  nU Rq                  URs                  5       5        M$     OU R,                  U l7        S U l:        S U l;        0 U l<        [Q        5       U l=        U R}                  5         SU l?        SU l@        S U lA        SU lB        S U lC        g )Nr   r   z.Invalid value of beta1, expect beta1 in [0,1).z.Invalid value of beta2, expect beta2 in [0,1).z.Invalid value of epsilon, expect epsilon >= 0.z,weight_decay should be int, float or Tensor.:z#'lr_ratio' is unimplemented in CPU.zp`parameters` argument given to the optimizer should be an iterable of paddle Tensors, but got argument type is `z`.zv`parameters` argument should not get dict type, if parameter groups is needed, please set `parameters` as list of dictzNparameters argument given to the Optimizer should not be None in dygraph mode.z2learning rate should be float or LRScheduler, got z herezE'grad_clip' should be an instance of GradientClipBase's derived classparamszYparams should be set in parameters if parameter groups are optimized in different optionsc                     0 $ N r.       V/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/optimizer/adamw.py<lambda> AdamW.__init__.<locals>.<lambda>  s    r/   adamw)weight_decaybeta1beta2epsilon	lazy_mode	grad_clipF)D
isinstancer   
ValueErrorintfloatr   r   	TypeErrorr   r   is_compiled_with_cudais_compiled_with_xpupaddledevice
get_devicesplitget_all_custom_device_typeNotImplementedErrorr   r#   dictlist_parameter_list_namein_dygraph_modeAttributeErrorr   r   _dtypedtype_learning_rate_mapr   _accumulatorsr!   _opti_name_list_accumulators_holder_param_device_map
clear_gradclear_gradients_learning_rateset_params_name_apply_decay_param_fun_weight_decay_use_lowprecision_moment
_grad_clip	_lr_ratio_beta1_beta2_epsilon
_lazy_mode_multi_precision_master_weights_amsgrad_default_dict_param_groups_add_param_groupcopy_use_multi_tensorregularization_auxiliary_vars_already_create_accumulator_create_master_grad_states_use_fusion_storage_need_refusefusion_storage_fuse_buffer_versionmerged_model_params)selflearning_rater5   r6   r7   
parametersr4   use_lowprecision_momentlr_ratioapply_decay_param_funr9   r8   multi_precisionamsgradnameparam_groups                   r0   __init__AdamW.__init__   s_   & (((      """%''UQMNN%''UQMNN'5))!w,MNN,e55j9--u5?
 ?
 JKKh1111..001133MM,,.44S9!<}}??AB **OPP! *fmm44PPTU_P`Oaace  *d++' 
 $(
#3D #'D 
$$&&##+$d  -%)=>>DT-EXDYY^_   i)9::[  $..q1488#'#7#7K#{2 s2 $8 #2215h?BHH"2215;; #%
 )4!$&!!##	+E&;#"<0(?%#!# /! ",/""
  Jt/C/CA/F$M$M#33%%k&6&6&89  4 "&!5!5D!%"!+.5('')#( !"$%!#' r/   c                     X R                   U'   g r-   rk   )rs   keyvals      r0   _set_auxiliary_varAdamW._set_auxiliary_varJ  s    $'S!r/   c                @    XR                   ;   a  U R                   U   $ g r-   r   )rs   r   s     r0   _get_auxiliary_varAdamW._get_auxiliary_varM  s"    &&&'',,r/   c                   US   n[        U[        [        R                  R                  45      (       a  U/US'   O.[        U[
        5      (       a  [        S5      e[        U5      US'   U R                  R                  5        H  u  p4UR                  X45        M     [        5       nU R                   H   nUR                  [        US   5      5        M"     UR                  [        US   5      5      (       d  [        S5      eUS    H"  nUR                  SS5      UR                   S'   M$     U R                  R#                  U5        g)z
Add a param group to parameter_list.

Args:
    param_group (dict): The group of Tensors to be optimized with
    different optimization options.
r+   z`optimizer parameters should be in ordered collections,but received set, please use list instead.z7some parameters appear in more than one parameter grouprt         ?N)r:   r   r   r   ParameterMetarW   r>   rH   re   items
setdefaultrf   update
isdisjointr;   getoptimize_attrappend)rs   r|   r+   kv	param_setgroupparams           r0   rg   AdamW._add_param_groupS  s2    X&fy#((*@*@ABB%+HK!$$= 
 %)LK! &&,,.DA""1( / E	''ESx12 ( ##CH(=$>??I  !*E3>??4E0 +
 	!!+.r/   c           
     6   UR                   nU R                  U5      (       aT  U R                  (       dC  [        5       (       a  [        R
                  O#[        R                  R                  R                  n[        R                  " 5       (       GaC  SS KnUR                  SSS9nUS:X  a  U R                  U R                  U[        R                  R                  R                  S9  U R                  U R                   U[        R                  R                  R                  S9  U R"                  (       a=  U R                  U R$                  U[        R                  R                  R                  S9  OU R                  U R                  XS9  U R                  U R                   XS9  U R"                  (       a  U R                  U R$                  XS9  O_U R                  U R                  XS9  U R                  U R                   XS9  U R"                  (       a  U R                  U R$                  XS9  U R                  U R&                  UU[)        U R*                  [,        [.        45      (       a  SOU R*                  S/[        R                  R                  R0                  S	S
9  U R                  U R2                  UU[)        U R4                  [,        [.        45      (       a  SOU R4                  S/[        R                  R                  R0                  S	S
9  g )Nr   xpu_adamw_moment_dtypefp32)defaultfp16)rN   ?r   cpu)r{   r   rN   
fill_valueshaper#   rB   +?)rN   _is_dtype_fp16_or_bf16r[   r   r   FLOAT32r   VarDescVarTypeFP32r@   osgetenv_add_accumulator_moment1_acc_strFP16_moment2_acc_strrd   _moment2_acc_max_str_beta1_pow_acc_strr:   r^   r   r   DENSE_TENSOR_beta2_pow_acc_strr_   )rs   p	acc_dtyper   r   s        r0   _add_moments_powsAdamW._add_moments_powsz  s   GG	''	2211 %0MM  t||7K7K7P7P  $$&&%'YY(& &/ &" &/%%))1DLL4H4H4M4M &  %%))1DLL4H4H4M4M &  ==))11"ll2277 *  %%d&;&;Q%P%%d&;&;Q%P==))111 *  !!$"7"7!L!!$"7"7!L}}%%--q &  	(( dkkHe+<== [[#%%22 	 	
 	(( dkkHe+<== [[#%%22 	 	
r/   c                   [        U[        R                  [        R                  45      (       d   e[        U[        5      (       a  U R                  U5      nU GH  nUR                  U R                  ;   a  M   U R                  (       ai  U R                  UR                  5      (       aI  U R                  U5      nU R                  U5        U R                  R                  UR                  5        M  U R                  UR                  5      (       a'  U R                  (       d  [        R                  " S5        U R                  U5        U R                  R                  UR                  5        GM     g )NzAccumulating with FP16 or BF16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Adam optimizer.)r:   r   Blockr   rG   _update_param_groupr{   rl   rb   r   rN   _create_master_weightr   addwarningswarn)rs   blockru   r   master_ps        r0   _create_accumulatorsAdamW._create_accumulators  s   %)//399!=>>>>j$''11*=J Avv999$$)D)DQWW)M)M55a8&&x00044QVV<++AGG44--X ""1%,,008# r/   c                   [        U[        R                  [        R                  45      (       d   e[        U[        5      (       a  U R                  U5      nUu  p4SnU R                  b"  U R                  UR                  5      (       d  SnU R                  U R                  US   5      nU R                  U R                  US   5      nU R                  (       a  U R                  U R                  US   5      OS nU R                  U R                  US   5      n	U R                  U R                  US   5      n
U R                  =(       a    U R!                  US   R"                  5      nU(       a  U R$                  US   R                     OS nU R'                  U5      n[)        5       (       Ga2  U R*                  c  SOU R+                  US   5      n[        U R,                  [.        5      (       d  U R,                  OU R,                  R1                  S5      n[        U R2                  [.        5      (       d  U R2                  OU R2                  R1                  S5      n[5        5       (       a  U R7                  S5      OS n[8        R:                  " US   US   UUUUU	U
UUUUU R<                  UU R>                  UU R@                  SUSU R                  5      u              ng US   /US   /U/U/U/U	/U
/S.nU R7                  S5      nU(       a  UUS	'   US   /U/U/U	/U
/S
.nU R@                  SUUU R>                  U R*                  c  SOU R+                  US   5      U R                  S.n[        U R,                  [.        5      (       a  U R,                  US'   OU R,                  US'   [        U R2                  [.        5      (       a  U R2                  US'   OU R2                  US'   [        U R<                  [.        5      (       a  U R<                  US'   OU R<                  US'   U R                  (       a  U/US'   U/US'   U(       a
  UUS'   UUS'   URC                  U RD                  UUUSS9nU$ )NTFr   r   	found_infr   i  )ParamGradLearningRateMoment1Moment2Beta1PowBeta2Pow
SkipUpdate)ParamOut
Moment1Out
Moment2OutBeta1PowOutBeta2PowOut)r8   min_row_size_to_use_multithreadry   
with_decaycoeffrw   rz   Beta1Tensorr5   Beta2Tensorr6   EpsilonTensorr7   
Moment2MaxMoment2MaxOutMasterParamMasterParamOut)r#   inputsoutputsattrsstop_gradient)#r:   r   r   r   rG   r   rY   r{   _get_accumulator_masterr   r   rd   r   r   r   rb   r   rN   rc   _create_param_lrr   r]   r^   r   itemr_   r   r   r   adamw_r`   rZ   ra   	append_opr#   )rs   r   param_and_gradr   gradr   r$   r%   r&   r'   r(   find_mastermaster_weightlr	lr_ratio_r^   r_   r   _r   r   r   adamw_ops                          r0   _append_optimize_opAdamW._append_optimize_op  s}   %)//399!=>>>>nd++!55nEN$ 
''3//

;;J..!!>!#4
 ..!!>!#4
 }} (())>!+<  	 44##^A%6
 44##^A%6
 ++ 
0K0K1##1

    !2!7!78 	
 "">2 "## >>) ^^N1$56  "$++x88 [[%%a(  "$++x88 [[%%a(  9D''44  #)--q!q!""+#Aq!Q1a.  )+,'*+!##9#9*O*OF //<I'0|$ ,A./&i&i - -G "__37#.(++ ~~- q(9:==E $++x00(,}%!%g$++x00(,}%!%g$--22*.--'#'==i }}(3}|$,7=((5}%,9()YY" ' H Or/   c                Z    SR                  SSR                  U R                  5      /5      $ )N zWeight Decay, params:,)joinrX   )rs   s    r0   __str__AdamW.__str__w  s&    xx0#((4;L;L2MNOOr/   c           	        [         R                  R                  R                  R                  5       (       a  U R	                  5         g[        U R                  S   [        5      (       Gd  / nU R                   H  nUR                  (       a  M  UR                  5       c  M)  UR                  5       n[        R                  " 5       (       a?  [        US5      (       a-  UR                  5       (       a  U R                  b  [        S5      eO>[        US5      (       a-  UR!                  5       (       a  U R                  b  [        S5      eUR#                  X#45        M     U R%                  SSUS9ngU R&                   GHK  n[)        S 5      nUS    H  nUR                  (       a  M  UR                  5       c  M)  UR                  5       n[        R                  " 5       (       a?  [        US5      (       a-  UR                  5       (       a  U R                  b  [        S5      eO>[        US5      (       a-  UR!                  5       (       a  U R                  b  [        S5      eUS   R#                  X#45        M     UR+                  UR-                  5        VVs0 s H  u  pgUS:w  d  M  Xg_M     snn5        U R%                  SSUS9  GMN     gs  snnf )	a  
Execute the optimizer and update parameters once.

Returns:
    None

Examples:
    .. code-block:: python

        >>> import paddle

        >>> a = paddle.rand([2,13], dtype="float32")
        >>> linear = paddle.nn.Linear(13, 5)
        >>> # This can be any optimizer supported by dygraph.
        >>> opt = paddle.optimizer.AdamW(learning_rate = 0.01,
        ...                             parameters = linear.parameters())
        >>> out = linear(a)
        >>> out.backward()
        >>> opt.step()
        >>> opt.clear_grad()
Nr   is_selected_rowszOAdamW don't support weight_decay with sparse parameters, please set it to None.
_is_sparse)lossstartup_programparams_gradsc                     / $ r-   r.   r.   r/   r0   r1   AdamW.step.<locals>.<lambda>  s    2r/   r+   )rA   r   dygraphin_to_static_mode_declarative_stepr:   rI   rG   r   
_grad_ivarr   rK   hasattrr   rj   RuntimeErrorr   r   _apply_optimizerf   r   r   r   )rs   r   r   grad_varoptimize_opsr|   r   r   s           r0   step
AdamW.stepz  s   0 ;;##5577""$$..q1488L--&&##%1$//1H 0022#H.@AA ( 9 9 ; ; $ 3 3 ?". q# 
 $Hl;; ( 3 3 5 5 $ 3 3 ?". q#  !''(9:/ .2  //4l 0 L
  $11*:6(2E** '')5#(#3#3#5$4466 '2D E E$,$=$=$?$?$($7$7$C&2$u'" !"
 !(, ? ?$,$7$7$9$9$($7$7$C&2$u'" !" %X.55u6GH/ 30 ##&1&7&7&9K&9daQ(]TQT&9K $$t, % ;  26 Ls   K*Kc                   UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  SU R                  S   5      U l        UR                  S5      nU$ )Nr5   r6   r7   r8   r4   r+   )r   re   r^   r_   r`   ra   rZ   )rs   ru   s     r0   r   AdamW._update_param_group  s     nnWd.@.@.IJ nnWd.@.@.IJ"y$2D2DY2OP$..++K8
 (^^D..~>
  ^^H-
r/   c           	       ^^^ SmSnSS/m/ SQmUUU4S jn0 nU R                  5       n0 n[        [        UR                  5       5      5      nUR                  5        H9  u  pxUR                  R
                  U;  d  M!  XvUR                  R
                  '   M;     UR                  SS5      n	UR                  S	S5        UR                  5        H  u  pU" U
5      u  pXl   nX   nU S
U 3nX-;   aV  UR                  5       (       a0  [        UUUR                  UR                  UR                  S9UU'   Mh  [        UX5      UU'   My  [        UUSSSS9UU'   M     U	bz  U	R                  5        Hf  u  pXj   nX   nU S3nUR                  5       (       a0  [        UUUR                  UR                  UR                  S9UU'   MW  [        UX5      UU'   Mh     U$ )a  
Convert optimizer state dict to a sharded state dict based on model sharding information.

Args:
    model_sharded_state_dict (dict): Sharded state dict of the model, containing tensor metadata.

Returns:
    dict: A new optimizer state dict where weights are wrapped as ShardedWeight.
fp32_master_0momentbeta1_pow_acc_0beta2_pow_acc_0)	moment1_0	moment2_0
velocity_0c                   > TU ;   a!  [        U R                  ST-   S-   S5      5      $ TT-    H/  nU R                  U5      (       d  M  U S [        U5      S-   *  U4s  $    [	        SU  S35      e)Nr   r   zCannot split variable name: .)tuplerD   endswithlenr;   )vnamer{   _FP32_MASTER_optimizer_non_scaler_name_optimizer_scalar_names     r0   _generate_base_static_name<AdamW.sharded_state_dict.<locals>._generate_base_static_name  s    u$U[[|);c)A1EFF.1KK>>$'' !3SY]#34d:: L ;E7!DEEr/   master_weightsNLR_Schedulerr  )r   local_tensorlocal_shapeglobal_shapeglobal_offset)r   )r   z.w_0)
state_dictrG   sortedr   r  r{   popis_distr
   r   r  r   )rs   model_sharded_state_dict_MOMENT_NAMEr  optimizer_sharded_state_dictoptimizer_state_dictstatic_to_struct_mappingr   r   r  r   tensorstatic_nameoptim_state_typestruct_namesharded_weightunified_namer  r  r  s                    @@@r0   sharded_state_dictAdamW.sharded_state_dict  s    '"
&
"	F (*$#0#% #'+1134$
  -224DA~~""*BB@A)<)<= 5
 .112BDI  6 0557KC,Fs,K)K2?K5BN)]!,<+=>L />>##AN(%+$*LL%+\\&4&B&BB0> =(& 1> >K$!' $!%"&>,\:1 8B %-3356;!9!F"-d3>>##AN(%+$*LL%+\\&4&B&BB0> =(& 1>  6& ,+r/   )$rP   rR   rl   rd   rY   rk   r^   r_   re   rM   r`   rq   r\   ra   rV   rO   r]   rc   rb   rJ   ro   rQ   rS   rf   rI   rX   rn   r[   ri   rZ   rU   rp   r!   rr   rj   r#   )gMbP?r   r   g:0yE>Ng{Gz?FNNNFFFN)rt   zfloat | LRSchedulerr5   float | Tensorr6   r1  r7   r1  ru   z8Sequence[Tensor] | Sequence[_AdamParameterConfig] | Noner4   r1  rv   boolrw   z Callable[[Tensor], float] | Nonerx   zCallable[[str], bool] | Noner9   zGradientClipBase | Noner8   r2  ry   r2  rz   r2  r{   z
str | Nonereturnr    )r3  r    )r$  r	   r3  r	   )__name__
__module____qualname____firstlineno____doc____annotations__r   r   r   r   r   r}   r   r   rg   r   r   r   r   imperative_baseno_gradr   non_static_onlyr  r   r/  __static_attributes__r.   r/   r0   r   r   6   s   rh L
I  ((( .3 # %"& '+(-59>B-1 %#U(*U( U( 	U(
  U( EU( %U( "&U( 3U(  <U( +U( U( U(  !U(" #U($ 
%U(n(%/NC
J92]~P Y  Yvg,"2g, 
g,r/   r   )+
__future__r   r   collectionsr   collections.abcr   typingr   rA   r   paddle.base.libpaddler   5paddle.distributed.flex_checkpoint.dcp.sharded_weightr	   r
   r   
paddle.pirr    r   r   r   r   base.dygraphr:  base.frameworkr   r   r   r   nn.clipr   r   r   	optimizerr   r   r   adamr   __all__r   r.   r/   r0   <module>rL     sk    #  # $     * 
   " 2  '   (*
V,I V,r/   