
    ёib                   :   S SK Jr  S SKrS SKrS SKJr  S SKJr  S SKr	S SK
r
S SKJr  S SK
Jr  S SKJrJr  S SKJr  S SKJr  S S	KJrJrJrJrJrJrJrJr  S S
KJ r J!r!  SSK"J#r#J$r$  SSK%J&r&J'r'J(r(  SSK)J*r*  SSK+J,r,J-r-  SSK.J/r/  SSK0J1r1J2r2  \(       a/  S SK3J4r4J5r5  S SK6J7r7J8r8  S SK
J9r9  S SK:J;r;  SSK)J<r<J=r=   " S S\85      r>\/" \?\R                  SS9rA/ rB\C" \R                  R                  SS 5      5      rF\#R                       S S j5       rH " S S5      rIg)!    )annotationsN)defaultdict)TYPE_CHECKING)_C_ops)	parameterset_parameter)	ValueDict)core)Variable_current_expected_placedefault_main_programdevice_guardin_dygraph_modein_dynamic_or_pir_modein_pir_mode
name_scope)L2DecayWeightDecayRegularizer   )	frameworkunique_name)_get_no_grad_set_name_get_no_grad_set_valueappend_backward)	Parameter)LayerHelperLayerHelperBase)
get_logger   )LambdaDecayLRScheduler)CallableSequence)NotRequired	TypedDict)Tensor)GradientClipBase)OperatorProgramc                  4    \ rS rSr% S\S'   S\S'   S\S'   Srg	)
_ParameterConfigC   zSequence[Tensor]paramsz2NotRequired[float | WeightDecayRegularizer | None]weight_decayz0NotRequired[float | Tensor | LRScheduler | None]learning_rate N)__name__
__module____qualname____firstlineno____annotations____static_attributes__r0       Z/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/optimizer/optimizer.pyr+   r+   C   s      HHGGr7   r+   z&%(asctime)s-%(levelname)s: %(message)s)fmt$FLAGS_shard_bypass_dygraph_optimizerc                   SSK JnJn  [        5       nUR                  S:X  d   S5       eUR                  5       n	U  H  n
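# ``parameters`` may either be a flat sequence of Tensors or a list of
# parameter-group dicts following the ``_ParameterConfig`` layout above.
# A minimal sketch (the two linear layers are illustrative placeholders,
# not part of this module):
#
#   linear_1 = paddle.nn.Linear(10, 10)
#   linear_2 = paddle.nn.Linear(10, 10)
#   param_groups = [
#       {'params': linear_1.parameters()},
#       {'params': linear_2.parameters(),
#        'weight_decay': 0.001,   # overrides the optimizer-level weight_decay
#        'learning_rate': 0.1},   # scale applied on top of the base learning rate
#   ]
#   sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=param_groups)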

@framework.static_only
def append_backward_new(
    loss_list,
    parameter_list=None,
    no_grad_set=None,
    callbacks=None,
    checkpoints=None,
    distop_context=None,
):
    from paddle.incubate.autograd.primx import Transform, orig2prim

    program = default_main_program()
    assert (
        program.num_blocks == 1
    ), "The append_backward_new interface is designed to process only one block."
    block = program.current_block()
    for el in loss_list:
        assert (
            el.block == block
        ), "variable in loss_list should be in current block of main program"

    orig2prim(block)
    ad = Transform(block)
    if parameter_list is None:
        parameter_list = program.global_block().all_parameters()

    param_dot, loss_dot = ad.linearize(parameter_list, loss_list)
    loss_bar, param_bar = ad.transpose(loss_dot, param_dot)

    # Remove the param_dot values and the ops that created them; they are
    # only intermediates of the transpose transform.
    op_indexes = []
    for var in param_dot:
        if var is not None:
            op_index = block.ops.index(var.op)
            assert op_index >= 0
            op_indexes.append(op_index)

    ad.erase_ops(sorted(op_indexes))
    ad.erase_dots(param_dot)

    if len(parameter_list) == 1:
        params_and_grads = [(parameter_list, param_bar)]
    else:
        params_and_grads = []
        for i, param in enumerate(parameter_list):
            params_and_grads.append((param, param_bar[i]))
    return params_and_grads
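
# ``append_backward_new`` plays the same role as
# ``paddle.base.backward.append_backward`` but builds the gradients through the
# prim (primitive-operator) transforms; ``Optimizer.backward`` below selects it
# when prim is enabled. A rough usage sketch under that assumption:
#
#   params_grads = append_backward_new([loss], parameter_list, no_grad_set)
#   # -> [(param_0, grad_0), (param_1, grad_1), ...]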
 jj5       rS r	S r
S rS rS r\R                  " 5       S 5       r\R                   SBS j5       r\R                   SCS j5       r\rSDS jrS r\R                   SES j5       r\R                   SFS j5       rSGS jrSHS jrS rS rS rS rS rS rS r      SIS  jr!S! r"S" r#S# r$S$ r% SJS% jr& SJS& jr'    S@           SKS' jjr(    SLS( jr) SJS) jr*SHS* jr+ SH     SMS+ jjr,SHS, jr-\R\                  SNSOS- jj5       r/\R\                  SNSPS. jj5       r0\R                  " 5          SQ         SRS/ jj5       r1S0 r2\R                  " 5       \R\                  SSS1 j5       5       r3S2 r4S3 r5\R                   S4 5       r6\R                   S5 5       r7S6 r8S7 r9S8 r:S9 r;\<S: 5       r=\<S; 5       r>\<S< 5       r?\<S= 5       r@\<S> 5       rAS?rBg	)T	Optimizer   a  Optimizer Base class.

Define the common interface of an optimizer.
    User should not use this class directly,
    but should use one of its implementations instead.

Args:
    learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
        It can be a float value or any subclass of ``LRScheduler`` .
    parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
        This parameter is required in dygraph mode. And you can specify different options for \
        different parameter groups such as the learning rate, weight decay, etc, \
        then the parameters are list of dict. Note that the learning_rate in parameter groups \
        represents the scale of base learning_rate. \
        The default value is None in static graph mode, at this time all parameters will be updated.
    weight_decay (int|float|WeightDecayRegularizer|None, optional): The strategy of regularization. \
        It can be an int or float value as the coefficient of L2 regularization, or \
        :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
        If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
        the regularization setting here in optimizer will be ignored for this parameter. \
        Otherwise, the regularization setting here in optimizer will take effect. \
        Default None, meaning there is no regularization.
    grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of \
        some derived class of ``GradientClipBase`` . There are three clipping strategies \
        ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
        :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
    name (str|None, optional): Normally there is no need for user to set this property.
        For more information, please refer to :ref:`api_guide_Name`.
        The default value is None.

Returns:
   Base class for optimizer.

Examples:
    .. code-block:: python

        >>> # Take the subclass adam as an example
        >>> import paddle
        >>> linear = paddle.nn.Linear(10, 10)
        >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
        >>> out = linear(inp)
        >>> loss = paddle.mean(out)
        >>> adam = paddle.optimizer.Adam(
        ...     learning_rate=0.1,
        ...     parameters=linear.parameters()
        ... )
        >>> loss.backward()
        >>> adam.step()
        >>> adam.clear_grad()

        >>> #Take the subclass sgd as an example
        >>> #optimize parameters in linear_1 and linear2 in different options.
        >>> #Note that the learning_rate of linear_2 is 0.01.
        >>> linear_1 = paddle.nn.Linear(10, 10)
        >>> linear_2 = paddle.nn.Linear(10, 10)
        >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
        >>> out = linear_1(inp)
        >>> out = linear_2(out)
        >>> loss = paddle.mean(out)
        >>> sgd = paddle.optimizer.SGD(
        ...     learning_rate=0.1,
        ...     parameters=[{
        ...         'params': linear_1.parameters()
        ...     }, {
        ...         'params': linear_2.parameters(),
        ...         'weight_decay': 0.001,
        ...         'learning_rate': 0.1
        ...     }],
        ...     weight_decay=0.01)
        >>> loss.backward()
        >>> sgd.step()
        >>> sgd.clear_grad()

    """

    regularization: WeightDecayRegularizer | None
    helper: LayerHelperBase | None
    clear_gradients: Callable[[bool], None]

    @imperative_base.no_grad()
    def __init__(
        self,
        learning_rate: float | LRScheduler,
        parameters: Sequence[Tensor] | Sequence[_ParameterConfig] | None = None,
        weight_decay: float | WeightDecayRegularizer | None = None,
        grad_clip: GradientClipBase | None = None,
        name: str | None = None,
    ) -> None:
        if parameters is not None:
            # paddle.Tensor is itself iterable, so a bare Tensor is rejected
            # explicitly instead of being treated as a parameter list.
            if isinstance(parameters, paddle.Tensor):
                raise TypeError(
                    "`parameters` argument given to the optimizer should be "
                    f"an iterable of paddle Tensors, but got argument type is `{type(parameters)}`."
                )
            if isinstance(parameters, dict):
                raise TypeError(
                    "`parameters` argument should not get dict type, "
                    "if parameter groups is needed, please set `parameters`"
                    " as list of dict"
                )
            self._parameter_list = list(parameters)
        else:
            self._parameter_list = None

        self._name = name
        if in_dygraph_mode():
            if self._parameter_list is None:
                raise AttributeError(
                    "parameters argument given to the Optimizer should not be None in dygraph mode."
                )
            if weight_decay is not None and not isinstance(
                self._parameter_list[0], dict
            ):
                for param in self._parameter_list:
                    if (
                        hasattr(param, 'regularizer')
                        and param.regularizer is not None
                    ):
                        logging.info(
                            "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. "
                            f"The weight_decay[{weight_decay}] in Optimizer will not take effect, and it will only be applied to other Parameters!"
                        )
                        break

        if not isinstance(learning_rate, (float, LRScheduler)):
            raise TypeError(
                f"learning rate should be float or LRScheduler, got {type(learning_rate)} here"
            )
        if grad_clip is not None and not isinstance(
            grad_clip, paddle.nn.clip.GradientClipBase
        ):
            raise TypeError(
                "'grad_clip' should be an instance of GradientClipBase's derived class"
            )

        if isinstance(weight_decay, float):
            self.regularization = L2Decay(weight_decay)
        elif isinstance(weight_decay, int):
            self.regularization = L2Decay(float(weight_decay))
        else:
            self.regularization = weight_decay
        self._grad_clip = grad_clip
        self._learning_rate = learning_rate

        # Infer the default dtype (and validate parameter groups) from the
        # first parameter passed in.
        self._dtype = None
        if self._parameter_list:
            if isinstance(self._parameter_list[0], dict):
                for param_group in self._parameter_list:
                    assert (
                        'params' in param_group
                    ), 'params should be set in parameters if parameter groups are optimized in different options'
                self._dtype = self._parameter_list[0]['params'][0].dtype
            else:
                self._dtype = self._parameter_list[0].dtype

        # each program should have an independent learning rate:
        # program -> tensor(learning_rate)
        self._learning_rate_map = {}
        # Dictionary of accumulators. Some optimizer subclasses need to
        # allocate and manage extra tensors associated with the parameters
        # to train. These tensors are called accumulators.
        # {accum_name : {parameter_name : accumulator_for_parameter, ...}, ...}
        self._accumulators = defaultdict(lambda: {})
        self.helper = None
        self._opti_name_list = []
        self._accumulators_holder = {}
        self._param_device_map = {}
        self.clear_gradients = self.clear_grad
        self._default_dict = {
            'weight_decay': self.regularization,
            'grad_clip': self._grad_clip,
        }

        self._param_groups = []
        if self._parameter_list and isinstance(self._parameter_list[0], dict):
            for param_group in self._parameter_list:
                self._add_param_group(param_group.copy())
        else:
            self._param_groups = self._parameter_list

        # Multi-tensor path: pass all parameters and gradients to the optimizer
        # kernel in one call in dygraph mode.
        self._use_multi_tensor = None
        self._param_dict = self._create_multi_tensor_dict()
        self._auxiliary_vars = {}
        self._already_create_accumulator = set()
        self._master_weights = {}
        # create master gradients' states
        self._create_master_grad_states()

        # Fused-buffer ("fusion storage") bookkeeping.
        self._use_fusion_storage = False
        self._need_refuse = False
        self.fusion_storage = None
        self._fuse_buffer_version = 0
        self.merged_model_params = None

    def _create_master_grad_states(self):
        # master gradients' states
        if in_pir_mode():
            self._master_grads = ValueDict()
        else:
            self._master_grads = {}
        self._master_grad = False
 	
 6 6s   AA#c                :    U R                   R                  US 5      $ rn   )r   get)r   r   s     r8   _get_auxiliary_varOptimizer._get_auxiliary_var`  s    ##''T22r7   c                0    Xl         U R                  5         g rn   )r   need_refuse)r   r   s     r8   set_merged_model_params!Optimizer.set_merged_model_paramsc  s    #6 r7   c                l   SSK Jn  [        R                  " 5       (       d  g U R                  R
                  S:w  a  g U R                  b  U R                  R                  5        HP  u  p#UR                  5        H7  u  p$UR                  U R                  5      (       a  M'  U R                  5         M9     MR     U R                  R                  5        H7  u  p#UR                  U R                  5      (       a  M'  U R                  5         M9     U R                  (       d  g [        R                  SU R                   35        U" U R                  U R                  U R                   5      U l        U =R                  S-  sl        U R%                  5         [        R                  SU R                   35        g )Nr   )FusionStorageAdamWz,refuse optimizer fuse buffer version start: z*refuse optimizer fuse buffer version end: )fusion_utilsr   r   r   	__class__r1   fused_states_bufferr   items_is_shared_buffer_withr   r   r   local_loggerwarningr   r   r   reset_need_refuse)r   r   r   vvvs        r8   _maybe_refuseOptimizer._maybe_refuseg  sc   / ((** >>""g- ##/**002WWYEA44T5M5MNN((* ' 3 ,,224//0H0HII$$& 5   :4;T;T:UV	
 ,  $$

 	!!Q&! 89R9R8ST	
r7   c                   0 n[        U R                  5      S:X  aA  [        U R                  5      S:  a(  U R                  R                  5        H	  u  p#X1U'   M     OU R                  R                  5        H  u  pEUR                  5        Hy  u  pgXqUR                  '   [
        R                  " 5       (       d  M/  [        R                  " SSS9nUS:X  d  ML  UR                  5       R                  5       XR                  S-   '   M{     M     [        U S5      (       a(  [        U R                  5      S:w  a  U R                  US'   [        U R                  [        5      (       a  U R                  R!                  5       US	'   U$ )
ac  
Get state dict information from the optimizer. It contains all the tensors used by the optimizer. For the Adam optimizer, this includes beta1, beta2, momentum, etc. If an LRScheduler has been used, global_step will be included in the state dict.
If the optimizer has never been called (e.g. through the minimize function), the state dict is empty.


Returns:
    dict[str,Tensor], dict contains all the Tensor used by optimizer

Examples:
    .. code-block:: python

        >>> import paddle
        >>> emb = paddle.nn.Embedding(10, 10)

        >>> adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
        >>> state_dict = adam.state_dict()

r   xpu_adamw_moment_dtypefp32defaultfp16.SCALE_VALUEr   master_weightsLR_Scheduler)rM   r   r   r   r   r
   is_compiled_with_xpuosgetenv
get_tensorget_xpu_scale_valuer{   r   rr   r   r!   
state_dict)	r   r   r   r]   kr   	para_namevar_tmpr   s	            r8   r   Optimizer.state_dict  sF   ( 
t!!"a'C0I0I,JQ,N!66<<>	#&4  ? **002*+'')&I/6w||,0022134f2. 2V; ' 2 2 4 H H J '||n'DE +4 3 4*++4''(A-/3/C/C
+,d));77)-)<)<)G)G)IJ~&r7   c                   [        U R                  [        5      (       a]  UR                  SS5      n[        U R                  [        5      (       d
  Uc   S5       eU(       a  U R                  R                  U5        UR                  5       nSU;   a  UR                  S5        SU;   a,  [        U S5      (       a
  US   U l	        UR                  S5        Xl
        U R                  R                  5        H  u  p4UR                  5        H  u  pVUR                  U;   d   SUR                   S35       eUR                  5       nUR                  5       n[         R"                  " 5       (       aI  [$        R&                  " SS	S
9n	U	S:X  a.  UR)                  UR                  UR                  S-   S5      5        UR+                  XR                     5        M     M     g)a  
Load the optimizer state dict. For the Adam optimizer, it contains beta1, beta2, momentum, etc. If an LRScheduler has been used, global_step will be changed accordingly.

Args:
    state_dict(dict): Dict contains all the Tensor needed by optimizer

Return:
    None

Examples:
    .. code-block:: python

        >>> import paddle

        >>> emb = paddle.nn.Embedding(10, 10)

        >>> layer_state_dict = emb.state_dict()
        >>> paddle.save(layer_state_dict, "emb.pdparams")

        >>> scheduler = paddle.optimizer.lr.NoamDecay(
        ...     d_model=100, warmup_steps=100, verbose=True)
        >>> adam = paddle.optimizer.Adam(
        ...     learning_rate=scheduler,
        ...     parameters=emb.parameters())
        >>> opt_state_dict = adam.state_dict()
        >>> paddle.save(opt_state_dict, "adam.pdopt")

        >>> opti_state_dict = paddle.load("adam.pdopt")
        >>> adam.set_state_dict(opti_state_dict)

r   NzHLR_Scheduler state must be included in the state dict except LambdaDecayr   r   zoptimizer Tensor z
 not foundr   r   r   r   r         )rr   r   r!   r   r    set_state_dictr   popr{   r   r   r   r   r   valuer   r
   r   r   r   set_xpu_scale_value	set_value)
r   r   lr_state_dictr   r   r   r   r]   tensorr   s
             r8   r   Optimizer.set_state_dict  s   B d));77&NN>4@Md11;??$0 ^0 ##22=A  __&
Z'NN>*z)t.//'12B'C$NN+,$.!&&,,.DA&'ggi"	||z1 '~Z@1 mmo),,..-/YY0&.* .722&NN7<<.+H$O j67! '0 /r7   c                    U R                   $ rn   )r   r   s    r8   get_opti_var_name_list Optimizer.get_opti_var_name_list  s    ###r7   c                   ^  U 4S jn[         R                  R                  R                  5          U" 5         S S S 5        g ! , (       d  f       g = f)Nc            
     `  > TR                   c  [        R                  " 5       OTR                   n [        R                  " 5       S:w  a  U [        R                  :X  d-  [        R                  " 5       S:w  a$  U [        R                  :X  a  [        R
                  OU n [        TR                  [        5      (       Ga  TR                  5       n[        5       (       Ga  [        R                  R                  5       n[        R                  R                  5       n[        R                  " S5      n[!        TR                  5       5      n[        R                  R#                  U5         [        R$                  R&                  R)                  US9n[        R*                  R,                  R/                  / U 5      nU" XrR1                  5       5      nSUl        [5        X5        S S S 5        UR7                  U5        [        U[        R*                  R8                  5      (       d  UTR                  l        [        R                  R#                  U5         [=        X@/ 5      n	S S S 5        SW	l        SU	l        TR                  Ul         Xl!        XCl"        U	TRF                  U'   g g [        U[H        RJ                  5      (       d  [        R                  " S5      nUTR                  l        TRL                  RO                  U/ SSU S9n[H        R                  " 5       n
TR                  U
l         Xl!        UTRF                  [H        R                  " 5       '   [!        TR                  5       5      nTRL                  RQ                  U[        R$                  R&                  R)                  US9S9  g [        TR                  [         5      (       Ga>  TR                  5       n[        5       (       Ga  [        U[        R*                  R8                  5      (       a  g [S        5       n[        U [        RT                  R,                  RV                  5      (       d  [        U [        RT                  RX                  RZ                  R\                  5      (       a(  [        R*                  R,                  R^                  U    n O)[        R*                  R,                  Ra                  U 5      n [        R*                  R,                  Rc                  U / [        R                  " S5      [        R$                  R&                  Re                  [!        TR                  5      S9S9TRF                  [        R                  R                  5       '   g [        U[H        RJ                  5      (       a  g [        R                  Rg                  [        R                  " S5      / [!        TR                  5      U SS	9TRF                  [H        R                  " 5       '   g g ! , (       d  f       GN3= f! , (       d  f       GN= f)
Nfloat16bfloat16r/   r   T)r   shapepersistablestop_gradientr   initializer)r   r   r   r   r   r   r   r   r   )4r   rs   get_default_dtyper   r   float32rr   r   r!   _global_learning_rater   staticdefault_startup_programr   r   generater~   program_guardr   r   Constantpirr
   ParameterMetarB   r   r   set_parameters_fromValue	_var_namer   r   lr_schedulerlr_varlr_namer   r   r   rh   create_global_variableset_variable_initializerr   baseDataType	libpaddleVarDescVarTypevartype_to_datatypeconvert_np_dtype_to_dtype_create_persistable_valueConstantInitializercreate_global_var)	_lr_dtyper  startup_programmain_programr  lr_valuer   parameter_metainit_resultra   	main_proglrplacer   s                r8   	do_create9Optimizer._create_global_learning_rate.<locals>.do_create	  s    ;;& ((*[[  002i?%7 002j@%8    $--{;;335==&,mm&K&K&MO#)==#E#E#GL)22?CG$T%8%8%:;H44_E&,ii&;&;&D&D"* 'E ' *0)F)F	* '2*,H,H,J' 37/%k; F !44_E%ffjj.>.>??8?++5#]]88F$-g"$EE G.2+,0)484G4G1.3+/6,@E//= @ &fi.@.@AA"-"6"6"G8?++5!%!C!C!("$(,*."+ "D " %.$B$B$D	151D1D	.+1( # //%::<  %T%8%8%:;HKK88$*II$9$9$B$B"* %C % 9  D//77//1==!"fjj&6&677 7 9))V[[5E5E5N5NOO) )6;;+@+@+H+H+P+P    -3JJOO,O,O$--"	
 %+JJOO$N$N(1%& !* #JJOODD"+"$!,!5!5o!F(.		(=(=(Q(Q&+D,?,?&@ )R )	 E  //"MM>>@ ""i&8&899 #MM;;!,!5!5o!F"$"'(;(;"<"+(, <  //%::<G 8g FE  GFs   &A:XX
X
X-)rs   r	  r   dygraph_guard_if_declarative)r   r  s   ` r8   _create_global_learning_rate&Optimizer._create_global_learning_rate  s4    }	~ [[""??AK BAAs   A
Ac           	     ~   [        U[        [        45      (       d  [        S[	        U5       S35      e[        U R
                  [        5      (       a  [        S5      e[        U5      U l        U R                  5       nUb  [        5       (       aK  [        5       n[        R                  " U[        UR                  5      [        U5      UR                  U5        g[         R"                  " 5       R%                  5       nUR'                  SSU/0UR                  [        UR                  5      [        U5      S.SS	9  gg)
aO  
:api_attr: imperative

Set the value of the learning rate manually in the optimizer. If the optimizer uses an LRScheduler,
this API cannot be invoked, because it would lead to a conflict.

Args:
    value (float): the value of learning rate.

Returns:
    None

Examples:
    .. code-block:: python

        >>> import paddle
        >>> linear = paddle.nn.Linear(10, 10)

        >>> adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())

        >>> # set learning rate manually by python float value
        >>> lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
        >>> for i in range(5):
        ...     adam.set_lr(lr_list[i])
        ...     lr = adam.get_lr()
        ...     print("current lr is {}".format(lr))
        current lr is 0.2
        current lr is 0.3
        current lr is 0.4
        current lr is 0.5
        current lr is 0.6

zDThe type of 'value' in optimizer.set_lr must be float, but received .zhoptimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict.Nfill_constantOut)r   r   r   T)ru   outputsattrsr   )rr   r   r~   rt   ru   r   r!   RuntimeErrorr   r   r   r   full_rw   r   r   r   r   rB   	append_op)r   r   
current_lrr  rB   s        r8   set_lrOptimizer.set_lr  s&   F %#u..VW[\aWbVccde  d));77z  $El//1
!  /1))*%L$$  )==?LLN&&("ZL1!+!1!1!%j&6&6!7!&u
 #' ' 	 "r7   c                l    SSK Jn  [        X5      (       d  [        S[	        U5       S35      eXl        g)aJ  
:api_attr: imperative

Set the LRScheduler of the learning rate manually in the optimizer. If the optimizer already uses an LRScheduler,
this API will replace it with the new one.

Args:
    scheduler (LRScheduler): the LRScheduler of learning rate

Returns:
    None

Examples:
    .. code-block:: python

        >>> import paddle
        >>> linear = paddle.nn.Linear(10, 10)

        >>> adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())

        >>> # set learning rate manually by class LRScheduler
        >>> scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2,4,6], gamma=0.8)
        >>> adam.set_lr_scheduler(scheduler)
        >>> lr = adam.get_lr()
        >>> print("current lr is {}".format(lr))
        current lr is 0.5

        >>> # set learning rate manually by another LRScheduler
        >>> scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.1, step_size=5, gamma=0.6)
        >>> adam.set_lr_scheduler(scheduler)
        >>> lr = adam.get_lr()
        >>> print("current lr is {}".format(lr))
        current lr is 0.1

r   )r!   zXThe type of 'scheduler' in optimizer.set_lr_scheduler must be LRScheduler, but received r"  N)paddle.optimizer.lrr!   rr   rt   ru   r   )r   	schedulerr!   s      r8   set_lr_schedulerOptimizer.set_lr_scheduler  s=    J 	4)11jkopykzj{{|}  (r7   c                x    [        U R                  [        5      (       a  U R                  $ U R                  5       $ )a  
Get current learning rate of optimizer.
If 'LRScheduler' is not used, the return value is all the same.
If 'LRScheduler' is used, the return value is the current scheduled learning rete.

Returns:
    float, The current learning rate of optimizer.

Examples:
    .. code-block:: python

        >>> # train on default dynamic graph mode
        >>> import paddle
        >>> import numpy as np
        >>> emb = paddle.nn.Embedding(10, 3)

        >>> ## example1: LRScheduler is not used, return the same value is all the same
        >>> adam = paddle.optimizer.Adam(0.01, parameters = emb.parameters())
        >>> for batch in range(10):
        ...     input = paddle.randint(low=0, high=5, shape=[5])
        ...     out = emb(input)
        ...     out.backward()
        ...     print("Learning rate of step{}: {}".format(batch, adam.get_lr())) # 0.01
        ...     adam.step()
        Learning rate of step0: 0.01
        Learning rate of step1: 0.01
        Learning rate of step2: 0.01
        Learning rate of step3: 0.01
        Learning rate of step4: 0.01
        Learning rate of step5: 0.01
        Learning rate of step6: 0.01
        Learning rate of step7: 0.01
        Learning rate of step8: 0.01
        Learning rate of step9: 0.01

        >>> ## example2: StepDecay is used, return the scheduled learning rate
        >>> scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=2, gamma=0.1)
        >>> adam = paddle.optimizer.Adam(scheduler, parameters = emb.parameters())
        >>> for batch in range(10):
        ...     input = paddle.randint(low=0, high=5, shape=[5])
        ...     out = emb(input)
        ...     out.backward()
        ...     print("Learning rate of step{}: {}".format(batch, adam.get_lr())) # 0.5->0.05...
        ...     adam.step()
        ...     scheduler.step()
        Learning rate of step0: 0.5
        Learning rate of step1: 0.5
        Learning rate of step2: 0.05
        Learning rate of step3: 0.05
        Learning rate of step4: 0.005000000000000001
        Learning rate of step5: 0.005000000000000001
        Learning rate of step6: 0.0005000000000000001
        Learning rate of step7: 0.0005000000000000001
        Learning rate of step8: 5.000000000000001e-05
        Learning rate of step9: 5.000000000000001e-05

        >>> # train on static graph mode
        >>> paddle.enable_static()
        >>> main_prog = paddle.static.Program()
        >>> start_prog = paddle.static.Program()
        >>> with paddle.static.program_guard(main_prog, start_prog):
        ...     x = paddle.static.data(name='x', shape=[None, 10])
        ...     z = paddle.static.nn.fc(x, 100)
        ...     loss = paddle.mean(z)
        ...     scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=2, gamma=0.1)
        ...     adam = paddle.optimizer.Adam(learning_rate=scheduler)
        ...     adam.minimize(loss)

        >>> exe = paddle.static.Executor()
        >>> exe.run(start_prog)
        >>> for batch in range(10):
        ...     print("Learning rate of step{}: {}".format(batch, adam.get_lr())) # 0.5->0.05->0.005...
        ...     out = exe.run(main_prog, feed={'x': np.random.randn(3, 10).astype('float32')})
        ...     scheduler.step()
        Learning rate of step0: 0.5
        Learning rate of step1: 0.5
        Learning rate of step2: 0.05
        Learning rate of step3: 0.05
        Learning rate of step4: 0.005000000000000001
        Learning rate of step5: 0.005000000000000001
        Learning rate of step6: 0.0005000000000000001
        Learning rate of step7: 0.0005000000000000001
        Learning rate of step8: 5.000000000000001e-05
        Learning rate of step9: 5.000000000000001e-05
)rr   r   r~   r   s    r8   get_lrOptimizer.get_lr  s3    l d))511&&&&&((r7   c                    UcC  [        5       (       a  [        R                  " 5       nO[        R                  R                  5       nU R
                  R                  US5      $ )z+
get global decayed learning rate
:return:
N)r   r   r   rs   r   r   r   )r   rU   s     r8   r   Optimizer._global_learning_rateW  sK    
 ?  #88: --<<>&&**7D99r7   c                    [        S5      e)zFappend optimize operator to block and return all the added optimize_opzcClass "Optimizer" cannot be used directly as an optimizer, please use its subclasses such as "Adam")NotImplementedError)r   rA   param_and_grads      r8   _append_optimize_opOptimizer._append_optimize_opc  s    !q
 	
r7   c                \   US   n[        US5      (       a  UR                  b  SUR                  ;   a  UR                  S   n[        U[        [        R
                  R                  45      (       a  U$ US:X  a  U R                  5       $ [        R                  R                  5       R                  SS9   [        R                  " S5         U R                  5       U-  sS S S 5        sS S S 5        $ U R                  5       $ ! , (       d  f       O= f S S S 5        g ! , (       d  f       g = f)Nr   optimize_attrr/         ?T)is_with_optscale_with_param_lr)r{   r=  rr   r   rs   r   r  r   r   r   _lr_schedule_guardr   r   )r   r9  ra   param_lrs       r8   _create_param_lrOptimizer._create_param_lri  s   q!E?++##/5#6#66**?;H(Xvzz/?/?$@AAs?5577 ::<OO(, P  ",,-BC#99;hF DC  --//	 DCC  s$   5DD	D
D	D
D+c                   UR                   U R                  ;   a  U R                  UR                      nU$ U R                  U5      n[        5       (       Ga  [        R
                  R                  5       n[        R
                  R                  5       n[        R
                  R                  U5         S nU" XAR                   5      n[        R                  " US5      nSUl
        [        R                  R                  X5        S S S 5        [        R
                  R                  U5         [        R                  R                  5         [        R
                  R                  UWR                   UR"                  [$        R&                  " 5       5      nUR)                  5       (       a  UR+                  UR-                  5       5        [        R.                  R0                  R                  R3                  UR5                  5       R6                  / UR5                  5       /5      n	XR9                  5       l        SUl
        S S S 5        O[:        R<                  " 5       (       a  [        R                  " US5      nX2l         O[?        U R@                  [B        5      (       d   e[        R
                  RE                  UUR                   SSSS9nU R@                  RF                  RI                  5       n
U
RK                  SSU/0SU/0UR"                  [$        RL                  RN                  RP                  S	.S
9  WU R                  UR                   '   U$ ! , (       d  f       GNT= f! , (       d  f       N;= f)Nc                    U R                  5       R                   HP  nUR                  5       S:X  d  M  XR                  5       S   :X  d  M1  UR	                  S5      R                  5       s  $    g )Nzbuiltin.set_parameterparameter_namer   )rB   rF   r   r&  operandsource)startupr   rH   s      r8   get_param_from_startup?Optimizer._create_master_weight.<locals>.get_param_from_startup  sX    ")"6"6"8"<"<B "	-D D$(HHJ7G,H$H')zz!}';';'= = #=  $r7   r   Tr   r   castXr$  )in_dtype	out_dtype)ru   inputsr%  r&  ))r   r   _gen_master_weight_var_namer   rs   r   r   r   r   rM  r   _pir_opsset_persistable_valuer   reset_insertion_point_to_startdatar   r   r
   Placeis_distset_typeru   r	  r  create_op_dist_attribute	dist_attrprocess_meshget_defining_opr   r   rr   rh   r   r  r  rB   r)  r  r  FP32)r   ra   r]   var_namer  r  rK  startup_paramstartup_varop_dist_attrrA   s              r8   _create_master_weightOptimizer._create_master_weight  s   ::---&&uzz2CB 
 77>H}}"(--"G"G"I%}}AAC]]00A$ %;'%M #)++mY"GK.2K+OO99+P! B" ]]00>JJ==? --,, #))#))

	C #**,,[%5%5%78"KK1155NN + 5 5 7 D D "!,!6!6!8 9 % ;G++-7&*CO% ?>& **,,kk%3#!$++{;;;;mm55!++# $ 6  33@@B%>"SEN$)KK%)\\%9%9%>%>	    03D  ,
w BA" ?>s   2AM,DM$
M!$
M2c                L    UR                   S-   n[        R                  " U5      $ )N_fp32_master)r   r   r   )r   ra   r_  s      r8   rR  %Optimizer._gen_master_weight_var_name  s!    ::.##H--r7   c           
        U R                  UR                  5      (       d   e[        5       (       ah  XR                  ;   a  U R                  U   nU$ [        R
                  " US5      nUR                  5       R                  SS5        X R                  U'    U$ UR                  U R                  ;   a  U R                  UR                     nU$ UR                  S-   n[        R                  " U5      nUR                  R                  UUR                  SSUR                  UR                  UR                   S9nX R                  UR                  '   U$ )Nr   master_grad_castTrf  r   )r   r   r   r   	lod_levelr   is_data)_is_dtype_fp16_or_bf16r   r   r   rs   rM  r]  set_bool_attrr   r   r   rA   
create_varr   rj  r   rk  )r   gradr]   r_  s       r8   _create_master_gradOptimizer._create_master_grad  s0   **4::6666==)))((.* 
' kk$	2##%334FM+.""4(" 
 yyD...((3 
  99~5&//9jj++!**#"nn $ 0 0 LL ,  14""499-
r7   c                    g)zCreate all accumulators needed by the parameters

Args:
    block: the block in which the loss tensor is present
    parameters: list of parameter tensors for the optimizer
Nr0   )r   rA   r   s      r8   _create_accumulatorsOptimizer._create_accumulators       	r7   c                    g)zFinish any custom updates needed
   before completing an optimization step

Args:
    block: the block in which the loss tensor is present
    parameters: list of parameter tensors for the optimizer

Returns:
    None
Nr0   )r   rA   parameters_and_gradss      r8   _finish_updateOptimizer._finish_update  s     	r7   c           
     v	   U R                   b  U R                   S-   U-   nXR                  ;   an  UR                  U R                  U   ;   aQ  [        R                  " 5       (       a  U R                  U   UR                     $ [        SU SUR                   35      eU R                  5         Uc  UR                  nUR                  S-   U-   n[        R                  " U5      nU R                  R                  U5        Uc  U R                  UR                  5      n[        5       (       a  SU;  a|  [        R                  R                   R#                  U=(       d    UR$                  UU[        R&                  R(                  R+                  [-        U5      S9UR/                  5       S9n	GO[        R                  R                   R#                  U=(       d    UR$                  UU[        R&                  R(                  R+                  [-        U5      S9S9n	GOqU R0                  c$  [3        U R4                  R6                  5      U l        [9        U R0                  [2        5      (       d   eU R0                  R;                  US	U=(       d    UR$                  [         R<                  R>                  R@                  US	S
9n	[	        5       (       ay  US:X  d  [9        U[         RB                  5      (       aT  [D        RF                  " U	U	R                  [I        [-        U5      5      U	R$                  [         RB                  " 5       5        O\[K        U5         U R0                  RM                  U	[        R&                  R(                  R+                  [-        U5      S9S9  SSS5        [        R                  " 5       (       a  [O        U RP                  5      S:  a  XRP                  ;   d   SU S35       eU	RS                  U RP                  RU                  U5      5        [         RV                  " 5       (       aW  [X        RZ                  " SSS9n
U
S:X  a<  U	R]                  5       R_                  U RP                  Ra                  US-   S5      5        XR                  U   UR                  '   U	$ ! , (       d  f       GN= f)aD  Utility function to add an accumulator for a parameter

Args:
    block: the block in which the loss tensor is present
    name: name of the accumulator
    param: parameter tensor for which accumulator is to be added
    dtype: data type of the accumulator tensor
    fill_value: value to initialize the accumulator tensor
Nr   Accumulator z already exists for parameter betar   )r   r[  r   T)r   r   r   ru   r   belong_to_optimizercpur   zOptimizer set error, z should in state dictr   r   r   r   r   r   )1ry   r   r   r   r   	Exceptionr   r   r   r   r   rI   _get_device_for_paramr   rs   r   r
   r  r   r   r   r   r~   r[  rh   r   r   r1   rr   r  r  r  DENSE_TENSORCPUPlacer   r(  strr   r  rM   r   r   r   r   r   r   r   r   r   )r   r   ra   r   
fill_valuer   ru   devicer_  r]   r   s              r8   _add_accumulatorOptimizer._add_accumulator  s   & ::!::#d*D&&&

d0066((**))$/

;;tf$B5::,O 
 =KKE::#d*''1##H->//

;F==X%jjoo>>(U[[ &		 5 5 > >#J/ !? ! $oo/ ?  jjoo>>(U[[ &		 5 5 > >#J/ !? !	 ?  {{")$..*A*ABdkk;7777++44 *u{{\\))66$( 5 C   %:fdmm#D#DIIj)*IIMMO "&)KK88$*II$9$9$B$B"'
"3 %C % 9  * ((**t001A5#'@'@@ /z9NO@ MM$";";"?"?"IJ 0022134f2. 2V;NN,@@ $ 9 9 = =$,~$=t!" 034 ,
9 *)s   A	R))
R8c                   U R                   b  U R                   S-   U-   nXR                  ;  d  UR                  U R                  U   ;  a  [        SU SUR                   35      eU R                  U   UR                     $ )zUtility function to fetch an accumulator for a parameter

Args:
    name: name of the accumulator
    param: parameter tensor for which accumulator is to be fetched

Returns:
    accumulator tensor for the parameter
r   r{   does not exist for parameter )ry   r   r   r  )r   r   ra   s      r8   _get_accumulatorOptimizer._get_accumulatorq  s     ::!::#d*D***zz!3!3D!99tf$B5::,O  !!$'

33r7   c                   U R                   b  U R                   S-   U-   nU R                  =(       a    U R                  UR                  5      nU(       a  U R                  UR
                     OUnUR
                  nXR                  ;  d  XPR                  U   ;  a  [        SU SU 35      eU R                  U   U   $ )zUtility function to fetch an accumulator for a parameter
Args:
    name: name of the accumulator
    param: parameter variable for which accumulator is to be fetched
Returns:
    accumulator variable for the parameter
r   r{  r  )ry   _multi_precisionrl  r   r   r   r   r  )r   r   ra   find_mastertarget_paramtarget_names         r8   _get_accumulator_master!Optimizer._get_accumulator_master  s     ::!::#d*D++ 
0K0KKK1
 1<D  , 	 #''***"4"4T"::tf$B;-P  !!$'44r7   c                &   U H  nUS   R                   SL d  M  US   R                  nUR                  n[        R                  R                  5       nU H5  nUR                  nXH;   d  M  UR                  U5      U R                  U'     M     M     g )Nr   F)	r   r   rF   r
   op_proto_and_checker_makerkOpDeviceAttrNameinput_arg_namesattrr   )	r   rw  target_blockr9  
param_namerF   device_attr_namerH   r  s	            r8   _update_param_device_map"Optimizer._update_param_device_map  s    2Na ..%7+A.33
"&&33EEG ! B&(&8&8O!4=?WW,>..z:   3r7   c                F    S nXR                   ;   a  U R                   U   nU$ rn   )r   )r   r  r  s      r8   r  Optimizer._get_device_for_param  s(    ///++J7Fr7   c           	        [         R                  " 5       R                  5       nUn[         R                  " 5       R                  5       nUR                  UR                  :w  aC  UR
                  S:w  d   S5       e[         R                  " 5       R                  UR
                     n[        UR                  5      n[        U R                  R                  5      U l        U R                  5         U R                  (       Ga  U R                  R                  S;   Ga  [        U R                  S   U   5      S:X  a  [        U R                  S   U   5      S:X  a  [!        U["        5      (       aF  US:X  d   eU R%                  UU Vs/ s H  nUS   R&                  (       a  M  US   PM      snU5        OQU R)                  U5        U R%                  UUS    Vs/ s H  nUS   R&                  (       a  M  US   PM      snU5        [         R*                  " 5       (       a  U R-                  UUUS9  GOEU R/                  X5        / nU HI  n	U	S   R&                  (       a  M  U	S	   c  M!  UR1                  U	S   5        UR1                  U	S	   5        MK     US   R2                  R4                  R7                  U5         [9        S5         U R;                  US   R<                  5      n
[?        U
5         U R-                  UUUS9  S
S
S
5        S
S
S
5        S
S
S
5        GOY[         R*                  " 5       (       d-  [!        U[@        5      (       a  US   OUnU R/                  X5        [!        U["        5      (       a  [B        RD                  R                   RG                  5          SnU H  u  p[I        US5      (       d  M  Sn  O   U(       a5  [B        RJ                  RL                  RN                  RQ                  XU5        O<U RS                  UU Vs/ s H  nUS   R&                  (       a  M  US   PM      sn5        S
S
S
5        OURU                  5       nUS    Vs/ s H  nUS   R&                  (       a  M  US   PM      snUS'   [B        RD                  R                   RG                  5          U RS                  XO5        S
S
S
5        [         R*                  " 5       (       Ga  U RW                  S5      nS[B        RX                  R[                  5       ;   a(  Ub%  UR]                  5       (       a  UR_                  5       nU(       a=  [!        U[`        Rb                  Rd                  5      (       a  U Rg                  SS5        GO[!        U[`        Rb                  Rd                  5      (       a  U Rg                  SS5        [!        U["        5      (       a`  U Ri                  5         U HH  n	U	S	   b  U	S   Rk                  5       (       d  M#  U	S   R&                  SL d  M7  U Rm                  XI5        MJ     GOCUS    H  n	U	S	   b  U	S   Rk                  5       (       d  M#  U	S   R&                  SL d  M7  0 nU	US'   URo                  URq                  5        VVs0 s H  u  nnUS:w  d  M  UU_M     snn5        U Rm                  UU5        M     OU H  n	U	S	   c  M  U	S   R2                  R4                  R7                  U	5         [9        S5         U	S   R&                  SL aC  U R;                  U	S   R<                  5      n
[?        U
5         U Rm                  XI5      nS
S
S
5        S
S
S
5        S
S
S
5        M     U Rs                  XA5        [B        RD                  R`                  Ru                  S5        [        UR                  5      nURw                  UU5      $ s  snf s  snf ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       N= fs  snf ! , (       d  f       GNa= fs  snf ! , (       d  f       GNx= fs  snnf ! , (       d  f       N= f! , (       d  f       GN = f! , (       d  f       GM  = f)  Add optimization operators to update gradients to tensors.

Args:
  parameters_and_grads(list(tuple(Tensor, Tensor))):
    a list of (tensor, gradient) pair to update.

Returns:
  return_op_list: a list of operators that will complete one step of
    optimization. This will include parameter update ops, global step
    update ops and any other custom ops required by subclasses to manage
    their internal state.
zFcurrent block is not global_block, but it doesn't have backward block.)MomentumAdamr   r   r   r-   param_group_idxr   N	optimizerF_need_shard_autoT	found_infxpu)<r   r   rB   r@   idxbackward_block_idxblocksrM   rF   r   r   r1   rh   r  r   r   rr   rw   _multi_tensor_initr   _update_param_groupr    _append_optimize_multi_tensor_opr  rI   rA   rU   _optimized_guardr   r  r   r   rv   rs   r	  r  r{   distributedauto_parallelfully_shardshard_accumulatorsrs  r   r   r  
get_devicerX  _local_valuer
   eagerr&   r   r   _is_initializedr:  updater   rx  _set_warmup
_slice_ops)r   rw  r  rB   r  r@   startpparam_grad_listr9  r  params_grads_device_map_need_shardra   r   params_acc_dictr  param_grad_dictr   r   optimize_opends                         r8   _create_optimization_pass#Optimizer._create_optimization_pass  s   4 !557DDF#!668FFH 0 00 33r9 X9 %99;BB00L L$$%!$.."9"9:))+ !!!dnn&=&= B
 '

 D$$%78IJaO(();<_MN 2D99*a///++$ &:%9#$Q4#5#5 !AaD%9
 ( ,,-AB++$ &:(%C%C#$Q4#5#5 !AaD%C
 ( ((**55 ($3 6  --(
 #%&:N*1-;;;*1-9'..~a/@A'..~a/@A '; $A&,,44EE' {+!778J8O8OPF%f-==(0,; >  . ,  ,,.. ""6== )2- (
 --+ .55[[**GGI"'K$8"5*<==*.K! %9 #**88DDWW0 11( *>)=A'(t'9'9 !%!)= JI( #7";";"= -X6-6Q4-- AaD6-)
 [[**GGI--lL J ((** 33K@	V]]5577!-!))++ ) 6 6 8I!)TZZ->->??//TB!)TZZ->->??//UC!"6==**,.BN
 !/q 1 9'5a'8'H'H'J'J (-a0>>%G $ 8 8$0!" /C /C8.LN .q 1 9'5a'8'H'H'J'J (-a0>>%G24<J 9 / 6 6 5I4N4N4P%&4PDAq+,= )-14P%&!" !% 8 8$0/!"! /M( ';N%a(0 &q)//77HH* #;/)!,::eC%)%?%? .q 1 6 6&F ".f!5.2.F.F$0/" "6 0 	 ';( 	L?$$U+,""#&&uc22MD .- ,+ J JI*-
 JIV%&. "6!5 0/ s   _
#	_
_
5	_
=`	*_53_#_5`#` A``.	`7``/:	`/1`4a'aa/<a
a	a$a/#
_2-_55
`	?`
``
`,4
a
aa
a,'a//
a?	c           	     `   [         R                  R                  5       R                  5       nUnUR                  S   nU R                  5         [        U[        5      (       a<  U R                  UU Vs/ s H  ofS   R                  (       a  M  US   PM     sn5        OSUR                  5       nUS    Vs/ s H  nUS   R                  (       a  M  US   PM      snUS'   U R                  XG5        [        U[        5      (       a7  U H0  nUS   c  M  US   R                  SL d  M  U R                  XH5        M2     O{US    Hr  nUS   c  M  US   R                  SL d  M  0 n	XS'   U	R                  UR                  5        V
Vs0 s H  u  pU
S:w  d  M  X_M     snn
5        U R                  XI5        Mt     U R                  XA5        [         R                  R                   R#                  S5        UR                  R%                  U5      S-   nUR                  US $ s  snf s  snf s  snn
f )r  r  r   r-   r   NF)rs   r   r   rB   rF   r  rr   rw   rs  r   r   r:  r  r   rx  r	  r
   r  rG   )r   rw  r  rB   r  last_opr  r  r9  r  r   r   start_indexs                r8   _pir_create_optimization_pass'Optimizer._pir_create_optimization_pass  s      }}99;HHJ#""2&))+ *D11%%3N3!Q4;M;M13N
 3779O )2)2At)) !2)OH%
 %%lD*D11"6!!$,!!$22e;,,\J	 #7 #7x"@!!$,!!$22e;&(O0>H-#** )=(B(B(D(D H} !AD(D ,,\K #A" 	L?$$U+"&&,,W59--O O)*s$   3H 
	H 
6H%	H%H*H*c                   Sn[         R                  " 5       (       a  OU R                  X5      nU R                  c  UR                  U l        [         R                  " 5       (       af  U(       a  UOU R
                  n/ n[        R                  R                  U5      n	[        U	5       H  u  pUc  M
  UR                  Xz   U45        M!     U$ Uc&  [        R                  R                  R                  /nO[        U[         5      (       d   eUR"                  R$                  n[&        R(                  " UR*                  5      S:X  d   SUR*                   S35       eU(       a  UOU R
                  n[        R,                  R/                  X5         [1        5       (       a  UcA  UR3                  5       R5                  5       nU Vs/ s H  nUR6                  SL d  M  UPM     nn/ n[        R8                  R:                  R=                  XUS9n	[        U	5       H  u  pUc  M
  UR                  Xz   U45        M!     O-SSKJ n  U" 5       (       a  [C        U/XvU5      nO[E        XXe5      nSSS5        U$ s  snf ! , (       d  f       W$ = f)	a;  
The first part of ``minimize``, do auto-diff to append backward operations for
the current program.

Args:
    loss (Tensor): ``loss`` tensor to run optimizations.
    startup_program (Program|None, optional): :ref:`api_paddle_static_Program` for
        initializing parameters in ``parameters``. The default value
        is None, at this time :ref:`api_paddle_static_default_startup_program` will be used.
    parameters (list[Tensor]|list[str]|None, optional): List of ``Tensor`` or ``Tensor.name`` to update
        to minimize ``loss``. The default value is None, at this time all parameters
        will be updated.
    no_grad_set (set[Tensor]|set[str]|None, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
        to be updated. The default value is None.
    callbacks (list|None, optional): list of callable objects to run when appending backward
        operator for one parameter. The default value is None.

Returns:
    list[tuple[Tensor, Tensor]], list of (param, grad) tensor pairs, param is ``Parameter``,
        grad is the gradient value corresponding to the parameter.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> x = paddle.arange(26, dtype="float32").reshape([2, 13])

        >>> linear = paddle.nn.Linear(13, 5)
        >>> # This can be any optimizer supported by dygraph.
        >>> adam = paddle.optimizer.Adam(learning_rate = 0.01,
        ...                             parameters = linear.parameters())
        >>> out = linear(x)
        >>> out.backward()
        >>> adam.step()
        >>> adam.clear_grad()
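        >>> # Illustrative, hedged usage: in dynamic graph mode ``backward`` simply
        >>> # collects and returns the (param, grad) pairs that ``apply_gradients``
        >>> # can consume afterwards.
        >>> loss = paddle.mean(linear(x))
        >>> loss.backward()
        >>> params_grads = adam.backward(loss)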
Nr   zJThe number of elements of loss should be 1, but the current loss.shape is zh, whose number of elements is not 1. Maybe that you should call paddle.mean to process the current loss.F)no_grad_varsr   )prim_enabled)#r   r   _get_no_grad_setr   r   rx   r
   r  get_all_gradsrN   rI   rs   r   r   error_clip_callbackrr   rw   rA   rU   npprodr   r   r   r   rB   rC   r   autogradir_backwardro  paddle.incubate.autograd.utilsr  rb   r   )r   lossr  r   rQ   rR   act_no_grad_setrP   params_gradsgradsrG   ro  rU   program_all_paramsra   r  s                   r8   backwardOptimizer.backward  s]   X $$&&"33DFO ;;**DK$$&&+5Z4;O;ON LJJ,,^<E(/# '')>(EF  0X Q  #YY^^??@	!)T2222jj((G774::&!+ \]a]g]g\h iV V+ ,6Z4;O;ON,,WF==%- $002AAC +
 *<*);$22e; "); ' *
 $&L"OO77<<? = E (1'7+(//1F0MN (8 L#~~':!FNY( (7 /(7 G< /* GF< s+   5I9I4I4"=I9#AI94I99
Jc                j   [        U S5      (       d  [        US S9nU R                  b  U R                  U5      nO)[        R                  R
                  R                  U5      nU R                  XR                  5      n[        5       (       a  U R                  U5      nU$ U R                  U5      nU$ )a  
Second part of `minimize`, appending optimization operators for
given `params_grads` pairs.

Args:
    params_grads (list[tuple[Tensor, Tensor]]): list of (param, grad) pairs to do optimization.

Returns:
    list: A list of operators appended to the current program.

Examples:
    .. code-block:: python

        >>> import paddle

        >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
        >>> linear = paddle.nn.Linear(10, 10)
        >>> out = linear(inp)
        >>> loss = paddle.mean(out)
        >>> optimizer = paddle.optimizer.Adam(learning_rate=0.1,
        ...         parameters=linear.parameters())
        >>> params_grads = optimizer.backward(loss)
        >>> optimizer.apply_gradients(params_grads)

_sortedc                     U S   R                   $ Nr   )r   )xs    r8   ro   +Optimizer.apply_gradients.<locals>.<lambda>j  s    adiir7   )r   )r{   rK   r   rs   r   r   append_gradient_clip_opsappend_regularization_opsrg   r   r  r  )r   r  optimize_opss      r8   apply_gradientsOptimizer.apply_gradientsK  s    < tY''!,4GHL ??&??<8L!99>>BB<PL 55--
 ====lKL   99,GLr7   c                p   [         R                  " 5       (       a  [        (       a  g[        5       (       Ga  [        R
                  R                  [        R
                  R                  5       [        R
                  R                  5       5         [        R                  R                  R                  R                  5       nU(       a3  [        R                  R                  R                  R                  U5        [        U[        5      (       a:  U R                   b  U R!                  U5      nU R#                  X0R$                  5      nO8US   nUb  U" US   5      US'   U R#                  US   U R$                  5      US'   ['        5       (       a  U R)                  X4S9nOU R+                  X4S9nSSS5        U$ US:X  d   eUR,                  R.                  n[        R
                  R                  X5         U R1                  U5      nSSS5        U$ ! , (       d  f       W$ = f! , (       d  f       W$ = f)a  
Second part of `minimize`, appending optimization operators for
given `params_grads` pairs.
Args:
    loss (Tensor): loss tensor to run optimizations.
    startup_program (Program): startup_program for initializing parameters
        in `parameters`.
    params_grads (list): list of (param, grad) pairs to do optimization.
Returns:
    list: A list of operators appended to the current program.
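A minimal sketch, assuming the usual dynamic-graph flow; ``_apply_optimize`` is an
internal helper that ``step()`` and ``minimize()`` reach on the user's behalf.
Examples:
    .. code-block:: python

        >>> import paddle
        >>> linear = paddle.nn.Linear(10, 10)
        >>> loss = paddle.mean(linear(paddle.uniform([10, 10])))
        >>> sgd = paddle.optimizer.SGD(learning_rate=0.1,
        ...                            parameters=linear.parameters())
        >>> loss.backward()
        >>> sgd.step()        # step()/minimize() invoke _apply_optimize internally
        >>> sgd.clear_grad()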
Nrq   r-   r  r   )r   r    g_shard_bypass_dygraph_optimizerr   rs   r   r   r   r   r  r  auto_dp_utilsin_auto_dp_mode'_convert_fake_replicate_grad_to_partialrr   rw   r   r  rg   r   r  r  rA   rU   r  )	r   r  r  r  r  auto_dprq   r  rU   s	            r8   _apply_optimizeOptimizer._apply_optimize}  s    $$&&+K+K!##,,224557 !,,::HHXXZ&&44BBjj$ lD112'+|'D#'#A#A$&9&9$L !-[ 9I ,1:(22X. .2-K-K$X.0C0C.L* ==#'#E#E$ $F $L $(#A#A$ $B $L?N 	 #a'''jj((G,,WF#33LA GO N  GFs   D"H8H&
H#&
H5c                8  ^  Ub2  [        US5      (       a  [        US5      (       a  UR                  c  Uc  U$ SnU 4S jnU" X5      n[        US5      (       a*  UR                  b  UR                  XUR                  5      nOUb  U" XUR                  5      nUc   e[        5       (       a  [        R
                  " X$/5      $ UnUR                  [        R                  R                  R                  :X  a}  UR                  R                  UR                  [        R                  " 5       -   UR                  UR                  UR                   [        R                  R                  R"                  S9nSX$/0nSU/0nUR                  R%                  SXxS9  U$ )	z`Create and add backward regularization Operators

Function helper of append_regularization_ops.
Nrk   c                H  > U nU R                   UR                   :w  a  TR                  =(       a    TR                  U R                   5      nU(       a4  [        TR                  5      S:w  a  TR                  U R
                     nU$ U R                  UR                   5      nU$ r  )r   r  rl  rM   r   r   astype)ra   ro  r  r  r   s       r8   get_target_paramBOptimizer._create_regularization_of_grad.<locals>.get_target_param  s     L{{djj()) A33EKK@  3t';';#<#A#'#7#7

#CL   $)<<

#;Lr7   )r   r   r   rj  ru   rN  r$  sum)ru   rQ  r%  )r{   rk   rA   r   r   add_nru   r
   r  r  SELECTED_ROWSrn  r   kNewGradSuffixr   r   rj  r  r)  )	r   ra   ro  rg   regularization_termr  new_gradrQ  r%  s	   `        r8   _create_regularization_of_grad(Optimizer._create_regularization_of_grad  sn    <E=11E=11e6G6G6O&K"	  !-5-((U->->-J"'"3"3E"L'"0djj"I"...!##<< ;<<HyyDLL00>>>
  ::00T%8%8%::++++#oo--:: 1  D67Fxj)GJJ  eF LOr7   c                j   / n[         R                  " 5       (       d  [        5       (       a1  U H)  u  pEU R                  XEU5      nUR	                  XF45        M+     U$ Sn[         R
                  " S5         U H  u  pEU(       d,  UR                  b  Ub  Sn[        R                  " SU S35        UR                  R                  R                  XE/5         U R                  XEU5      nUR	                  XF45        SSS5        M     SSS5        U$ ! , (       d  f       M  = f! , (       d  f       U$ = f)a  Create and add backward regularization Operators

Creates and adds backward regularization operators in the BlockDesc.
This will add gradients of the regularizer function to the gradients
of the parameters and return these modified gradients. This is the
same as implementing weight decay in optimizers for regularization.

Args:
    parameters_and_grads (list[tuple[Tensor,Tensor]]): A list of (parameters, gradients) pairs
        that need to be regularized.
    regularization (WeightDecayRegularizer|None, optional): A global regularizer. If a parameter has already
        set its own regularizer, that one takes precedence; otherwise this global regularizer is applied to it.

Returns:
    list[tuple[Tensor,Tensor]]: list of (parameters, gradients) \
        pairs with the regularized gradients

Raises:
    Exception: Unknown regularization type
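
A minimal sketch, assuming ``weight_decay`` is passed to the optimizer constructor;
that argument becomes the global regularizer used here.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> linear = paddle.nn.Linear(4, 4)
        >>> # L2Decay(0.01) is the global regularizer that gets combined with each
        >>> # gradient when the regularization ops are appended.
        >>> sgd = paddle.optimizer.SGD(
        ...     learning_rate=0.1,
        ...     parameters=linear.parameters(),
        ...     weight_decay=paddle.regularizer.L2Decay(0.01))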
Frg   NTzyIf regularizer of a Parameter has been set by 'base.ParamAttr' or 'base.WeightNormParamAttr' already. The Regularization[rl   )r   r   r   r  rI   r   rk   r|   r}   rA   rU   r  )r   rw  rg   r_   ra   ro  r  repeat_regularizers           r8   r  #Optimizer.append_regularization_ops  s2   2 $$&&+--3>> !''(9:	  40  % "'%%&67#7KE.!--9*6-1*22@1A  BWX ,,==umL#'#F#F!$ )//0AB	 ML $8 8"   ML 87"  s%   5A!D#%D;D#
D D##
D2c                   [        5       (       ar  [        U5      nUR                  R                  R	                  5       R                  5       nU Vs/ s H  oDR                  SL d  M  UPM     nnUR                  U5        U$ [        U5      nUR                  R                  R	                  5       R                  5       nU Vs1 s H   nUR                  SL d  M  UR                  iM"     nnUR                  U5        U$ s  snf s  snf NT)
r   r   rA   rU   rB   rC   r   r  r   r   )r   r  rQ   r   ra   param_no_trainables         r8   r  Optimizer._get_no_grad_set-  s    ==0=K++88:IIKJ#-"#-%1D1D1L:  " 12/<K++88:IIKJ ("'E&&$. 

'  " 12!""s   C?'C?DDc                   / nU R                   b"  [        U R                   S   [        5      (       d8  U R                    H'  nUR                  (       a  M  UR	                  U5        M)     OCU R
                   H3  nUS    H'  nUR                  (       a  M  UR	                  U5        M)     M5     U H  nUR                  U5        M     g)a
  
Clear the gradients of all optimized parameters of the model.

If the gradients are not cleared, new gradients will accumulate on top of the previous ones.

There are two methods to clear gradients: setting them to zero (``set_to_zero=True``) or deleting them.

Args:
    set_to_zero (bool, optional): Whether to set the gradients to zero instead of deleting them. Default is True.

Returns:
    None

Examples:
    .. code-block:: python

        >>> import paddle

        >>> a = paddle.arange(26, dtype="float32").reshape([2, 13])
        >>> linear = paddle.nn.Linear(13, 5)
        >>> # This can be any optimizer supported by dygraph.
        >>> adam = paddle.optimizer.Adam(learning_rate = 0.01,
        ...                             parameters = linear.parameters())
        >>> out = linear(a)
        >>> out.backward()
        >>> adam.step()
        >>> adam.clear_grad()
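        >>> # Illustrative, hedged variant: set_to_zero=False deletes the gradient
        >>> # storage instead of filling it with zeros.
        >>> adam.clear_grad(set_to_zero=False)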

Nr   r-   )rx   rr   rv   r   rI   r   clear_gradient)r   set_to_zero
param_listr  r   s        r8   r   Optimizer.clear_gradC  s    > 
'z  #T0
 0
 ))%%a( *  $11$X.A???"))!, /  2
 A[) r7   c                ,    U R                  U(       + S9  g )N)r  )r   )r   set_to_nones     r8   	zero_gradOptimizer.zero_gradr  s    O4r7   c                    [        U[        [        R                  R                  45      (       d   S5       eU(       a  UOU R
                  nU R                  UUUUS9nU R                  XUS9nXv4$ )aQ  
Add operations to minimize ``loss`` by updating ``parameters``.

Args:
    loss (Tensor): A ``Tensor`` containing the value to minimize.
    startup_program (Program|None, optional): :ref:`api_paddle_static_Program` for
        initializing parameters in ``parameters``. The default value
        is None, at this time :ref:`api_paddle_static_default_startup_program` will be used.
    parameters (list[Tensor]|list[str]|None, optional): List of ``Tensor`` or ``Tensor.name`` to update
        to minimize ``loss``. The default value is None, at this time all parameters
        will be updated.
    no_grad_set (set[Tensor]|set[str]|None, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
        to be updated. The default value is None.

Returns:
    tuple[list[Operator],list[tuple[Tensor, Tensor]]], A list of operators appended
        by minimize and a list of (param, grad) tensor pairs, param is
        ``Parameter``, grad is the gradient value corresponding to the parameter.
        In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
        indicate program pruning. If so, the program will be pruned by ``feed`` and
        ``fetch_list`` before run, see details in ``Executor``.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> linear = paddle.nn.Linear(10, 10)
        >>> input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
        >>> out = linear(input)
        >>> loss = paddle.mean(out)

        >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
        >>> beta2 = paddle.to_tensor([0.99], dtype="float32")

        >>> adam = paddle.optimizer.Adam(learning_rate=0.1,
        ...         parameters=linear.parameters(),
        ...         weight_decay=0.01)
        >>> loss.backward()
        >>> adam.minimize(loss)
        >>> adam.clear_grad()
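        >>> # Illustrative, hedged continuation: minimize() returns the
        >>> # (optimize_ops, params_grads) tuple described above.
        >>> loss = paddle.mean(linear(input))
        >>> loss.backward()
        >>> optimize_ops, params_grads = adam.minimize(loss)
        >>> adam.clear_grad()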

zThe loss should be an Tensor.)r  r   rQ   )r  r  )rr   r   rs   r   r  rx   r  r  )r   r  r  r   rQ   rP   r  r  s           r8   minimizeOptimizer.minimizev  s    d $6::+;+; <== 	
+	
= (2t7K7K}}+%#	 % 
 ++ , 
 ))r7   c                  ^ [         R                  R                  5       R                  5       R	                  5       n[        U R                  S   [        5      (       a   S5       eU R                   Vs1 s H  o"R                  iM     snmU Vs/ s H  o"R                  (       d  M  UPM     nn[        [        U4S jU5      5      nU Vs/ s H  o"UR                  4PM     nnU R                  U5      ngs  snf s  snf s  snf )zG
In declarative (static graph) mode, ``step`` is forwarded to ``apply_gradients``.
r   zQOnly list of parameters is supported while using optimizer in @paddle.jit.static.c                H   > U R                   T;   =(       a    [        U S5      $ )Nro  )r   r{   )r  selected_paramss    r8   ro   -Optimizer._declarative_step.<locals>.<lambda>  s    !&&O3J68JJr7   N)rs   r   r   rB   rC   rr   rx   rv   r   	trainablerw   filterro  r  )r   r-   ra   r   r  r  r  s         @r8   _declarative_stepOptimizer._declarative_step  s    
 MM..0==?NNP 	 d2215t<< 	
_	
< 483G3GH3G%::3GH)/C??e
CJ

 :DD

+D++L9 IC Es   3C7C<'C<Dc           	     ~   [         R                  R                  R                  R                  5       (       a  U R	                  5         g[        U R                  S   [        5      (       d  / nU R                   H  nUR                  (       a  M  [        U SS5      (       a@  [        US5      (       a-  UR                  b  UR                  X"R                  45        Md  Mf  Mh  [        US5      (       a+  UR                  b  UR                  X"R                  45        M  UR                  5       c  M  UR                  5       nUR                  X#45        M     U R                  SSUSS9  g[        U R                  5       H  u  pE[!        S 5      nUS    HN  nUR                  (       a  M  UR                  5       c  M)  UR                  5       nUS   R                  X#45        MP     UR#                  UR%                  5        VVs0 s H  u  pgUS:w  d  M  Xg_M     snn5        U R                  SSUUS9  M     gs  snnf )	a)  
Execute the optimizer and update parameters once.

Returns:
    None

Examples:
    .. code-block:: python

        >>> import paddle

        >>> a = paddle.arange(26, dtype="float32").reshape([2, 13])
        >>> linear = paddle.nn.Linear(13, 5)
        >>> # This can be any optimizer supported by dygraph.
        >>> adam = paddle.optimizer.Adam(learning_rate = 0.01,
        ...                         parameters = linear.parameters())
        >>> out = linear(a)
        >>> out.backward()
        >>> adam.step()
        >>> adam.clear_grad()
Nr   enable_tensor_fusionF	main_grad)r  r  r  r  c                     / $ rn   r0   r0   r7   r8   ro    Optimizer.step.<locals>.<lambda>
  s    2r7   r-   )rs   r	  dygraphin_to_static_moder  rr   r   rv   r   getattrr{   r  rI   
_grad_ivarr  rN   r   r  r   )r   r  ra   grad_varr  r   r   r   s           r8   stepOptimizer.step  s    0 ;;##5577""$$,,Q/66L++&&4!7??{33!OO7$++UOO,DE 8 4
 E;//EOO4O ''(@A'')5#(#3#3#5$++U,=>! ,$    $) !	 !  %.d.@.@$A *:6(2E** '')5#(#3#3#5$X.55u6GH 3 ##&1&7&7&9K&9daQ(]TQT&9K $$$(!-$'	 %  %B Ls   H9H9c                   US   n[        U[        5      (       a  U/US'   O.[        U[        5      (       a  [        S5      e[	        U5      US'   U R
                  R                  5        H  u  p4UR                  X45        M     [        5       nU R                   H   nUR                  [        US   5      5        M"     UR                  [        US   5      5      (       d  [        S5      eUS    HP  nUS   n[        U[        5      (       a  [        U5      n	OUn	Xl        UR                  SS5      UR                   S'   MR     U R                  R#                  U5        g)z
Add a param group to parameter_list.

Args:
    param_group (dict): The group of Tensors to be optimized with
        different optimization options.
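
A minimal sketch, assuming groups are supplied through the optimizer constructor,
which passes each extra group dict through this helper.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> linear = paddle.nn.Linear(10, 10)
        >>> # The second group overrides learning_rate and weight_decay for the bias.
        >>> sgd = paddle.optimizer.SGD(
        ...     learning_rate=0.1,
        ...     parameters=[
        ...         {'params': [linear.weight]},
        ...         {'params': [linear.bias],
        ...          'learning_rate': 0.01,
        ...          'weight_decay': 0.001},
        ...     ])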
r-   z`optimizer parameters should be in ordered collections,but received set, please use list instead.z7some parameters appear in more than one parameter groupr.   r/   r>  N)rr   r   r   rt   rw   r   r   
setdefaultr   r  
isdisjoint
ValueErrorr~   r   rk   r   r=  rI   )
r   r   r-   r   r   	param_setgroupra   r.   rg   s
             r8   r   Optimizer._add_param_group  sO    X&fi((%+HK!$$= 
 %)LK! &&,,.DA""1( / E	''ESx12 ( ##CH(=$>??I  !*E&~6L,..!(!6!- .3>??4E0 + 	!!+.r7   c                    g)z
Update the param group with a new entry.
Args:
    parameters (dict): The extra group of Tensors to be optimized with
    different optimization options. Only used in child class.
Nr0   )r   r   s     r8   r  Optimizer._update_param_groupH  ru  r7   c                    g)ay  
All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32).
This function will be overridden in the corresponding optimizer file.

Args:
    target_block: the block in which the loss tensor is present
    parameters: list of parameter tensors for the optimizer
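
A minimal sketch of what an override might do, assuming dtype bucketing; the bucket
key names below are illustrative assumptions, not the base implementation.

    .. code-block:: python

        >>> import paddle
        >>> linear = paddle.nn.Linear(4, 4)
        >>> groups = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
        >>> for p in linear.parameters():
        ...     key = ('FP16_LODTensor'
        ...            if p.dtype in (paddle.float16, paddle.bfloat16)
        ...            else 'FP32_LODTensor')
        ...     groups[key].append(p)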
Nr0   )r   r  r   r  s       r8   r  Optimizer._multi_tensor_initQ  s     	r7   c                    g)z=
For Multi Tensor, append optimize merged_operator to block.
Nr0   )r   r  rw  r  s       r8   r  *Optimizer._append_optimize_multi_tensor_op]  ru  r7   c                    [        U[        R                  R                  [        R                  45      (       d   S5       e[        U[        R                  R                  5      (       aU  U[        R                  R                  R
                  :H  =(       d'    U[        R                  R                  R                  :H  $ U[        R                  R                  :H  =(       d    U[        R                  R                  :H  $ )z
Check whether the dtype is fp16 or bf16.
:param dtype: instance of core.VarDesc.VarType
:return: True if dtype is one of fp16 or bf16, False otherwise
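
A minimal sketch, assuming a dygraph-constructed optimizer instance:

    .. code-block:: python

        >>> import paddle
        >>> linear = paddle.nn.Linear(2, 2)
        >>> opt = paddle.optimizer.SGD(learning_rate=0.1,
        ...                            parameters=linear.parameters())
        >>> opt._is_dtype_fp16_or_bf16(paddle.float16)
        True
        >>> opt._is_dtype_fp16_or_bf16(paddle.float32)
        False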
zIThe dtype should be an instance of core.VarDesc.VarType or core.DataType.)	rr   r
   r  r  r
  FP16BF16FLOAT16BFLOAT16)r   r   s     r8   rl   Optimizer._is_dtype_fp16_or_bf16f  s     %$,,"6"6!FGG 	
W	
G eT\\1122--222 6DLL00555 ... 3DMM222r7   c                2    SU l         U R                  5         g r  )r   r   r   s    r8   use_fusion_storageOptimizer.use_fusion_storagez  s    #' r7   c                &    U R                   U l        g rn   )r   r   r   s    r8   r   Optimizer.need_refuse~  s     44r7   c                    SU l         g r   )r   r   s    r8   r   Optimizer.reset_need_refuse  s
    !r7   c                    U R                   $ rn   )r   r   s    r8   fused_buffer_versionOptimizer.fused_buffer_version  s    (((r7   c                J    U R                   c  g U R                   R                  $ rn   )r   bufferr   s    r8   r   Optimizer.fused_states_buffer  s#    &"")))r7   c                J    U R                   c  g U R                   R                  $ rn   )r   buffer_ipc_metar   s    r8   fused_states_buffer_ipc_meta&Optimizer.fused_states_buffer_ipc_meta  s#    &""222r7   c                J    U R                   c  g U R                   R                  $ rn   )r   accumulators_metar   s    r8   fused_states_accumulators_meta(Optimizer.fused_states_accumulators_meta  s#    &""444r7   c                J    U R                   c  g U R                   R                  $ rn   )r   master_weights_metar   s    r8    fused_states_master_weights_meta*Optimizer.fused_states_master_weights_meta  s#    &""666r7   )r   r   r   r   r   r   r   r   r   r   r   r   r   ry   r   r   r   r   r   rx   r   r   ri   r   rh   r   rg   )NNNN)r/   zfloat | LRSchedulerr   z4Sequence[Tensor] | Sequence[_ParameterConfig] | Noner.   z%float | WeightDecayRegularizer | Nonerq   zGradientClipBase | Noner   z
str | NonereturnNone)rQ  dict[str, Tensor])r   rS  rQ  rR  )rQ  z	list[str])r   r~   rQ  rR  )r/  r!   rQ  rR  )rQ  r~   rn   )Ng        NNN)r   )r  r&   r  Program | Noner   list[Tensor] | list[str] | NonerQ   set[Tensor] | set[str] | NonerR   z list[Callable[..., None]] | NonerQ  list[tuple[Tensor, Tensor]])r  rW  rQ  zlist[Operator])rw  rW  rg   rf   rQ  rW  )T)r  boolrQ  rR  )r  rX  rQ  rR  )NNN)
r  r&   r  rT  r   rU  rQ   rV  rQ  z2tuple[list[Operator], list[tuple[Tensor, Tensor]]])rQ  rR  )Cr1   r2   r3   r4   __doc__r5   imperative_baseno_gradr   r   r   r   r   r   r   r   dygraph_onlyr   r   load_state_dictr   r  r+  r0  r3  r   r:  rC  rc  rR  rp  rs  rx  r  r  r  r  r  r  r  r  r  r  r  r  r  non_static_onlyr   r  r  r  r#  r   r  r  r  rl  r9  r   r   propertyr@  r   rG  rK  rO  r6   r0   r7   r8   rd   rd      s   IV 21""++ LP>B-1v(*v( Iv( <	v(
 +v( v( 
v( v(p"(
3 $
 $
L + +Z D8 D8L %O$AF A AF *( *(XY)v
:
02CJ.6" sj4*54  56^3B 56H.Z +/6:596:ii (i 4	i
 3i 4i 
%iV070	0f DE:x=D 9=3 93  63  
%	3 j, ,* ,*\ 5 5  +/6:59B*B* (B* 4	B*
 3B* 
<B* B*H:* G  GR+/Z 	 	  (5" ) ) * *
 3 3
 5 5
 7 7r7   rd   )NNNNN)J