
    ёi'y                    0   S SK Jr  S SKrS SKrS SKJr  S SKJr  S SKJ	r	J
r
Jr  S SKJr  S SKrSSKJr  S	S
KJr  \	(       a  S SKJr  S SKJr  S SKJr  S SKJr  S	SKJr  / r " S S\5      r " S S\5      rS rS rSS jr     SS jr! " S S\5      r"g)    )annotationsN)defaultdict)reduce)TYPE_CHECKINGNoReturn	TypedDict)NotRequired   )	framework   )	Optimizer)Sequence)Tensor)GradientClipBase)WeightDecayRegularizer)_ParameterConfigc                      \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   S\S'   S\S'   S\S'   S\S'   Srg)_LbfgsState*   int
func_evalsn_iterr   dalphazlist[Tensor]old_ykold_skroH_diagprev_flat_gradfloat	prev_losszNotRequired[list[Tensor]]al N__name__
__module____qualname____firstlineno____annotations____static_attributes__r#       V/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/optimizer/lbfgs.pyr   r   *   s?    OKIMN!!r+   r   c                       \ rS rSr% S\S'   Srg)_LbfgsStateDict8   r   stater#   Nr$   r#   r+   r,   r.   r.   8   s    r+   r.   c                     [         R                  R                  5       (       a2  [        R                  " S5      S:w  a  [
        R                  " S5        ggg)z-Check and warn about TF32 acceleration statusNVIDIA_TF32_OVERRIDE0zWarning! TF32 Tensor Cores are enabled by default on some NVIDIA GPUs for faster computation, but may compromise numerical precision in specific cases, particularly with the L-BFGS optimizer.To disable it, set: NVIDIA_TF32_OVERRIDE=0N)paddledeviceis_compiled_with_cudaosgetenvwarningswarnr#   r+   r,   check_tf32_overrider;   <   sD     	++--II,-49	
 5 	.r+   c                $    X-  R                  SS9$ )z
NOTE: This is a temporary workaround for unstable result computed by `paddle.dot`,
which will be reverted when the problem is fixed."
axis)sumxys     r,   dotrD   I   s    
 E;;B;r+   c                :   Ub  Uu  pxOX::  a  X4OX04u  pxX%-   SX-
  -  X-
  -  -
  n	U	S-  X%-  -
  n
U
S:  a_  U
R                  5       nX::  a  X3U -
  X[-   U	-
  XR-
  SU-  -   -  -  -
  nOX U-
  X+-   U	-
  X%-
  SU-  -   -  -  -
  n[        [        X5      U5      $ Xx-   S-  $ )a-  Cubic interpolation between (x1, f1, g1) and (x2, f2, g2).
    Use two points and their gradient to determine a cubic function and get the minimum point
    between them in the cubic curve.

Reference:
    Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
    pp59: formula 3.59

Args:
    x1, f1, g1: point1's position, value and gradient.
    x2, f2, g2: point2's position, value and gradient.
    bounds: bounds of interpolation area

Returns:
    min_pos: the minimum point between the specified points in the cubic curve.
   r
   r   g       @)sqrtminmax)x1f1g1x2f2g2bounds
xmin_bound
xmax_boundd1	d2_squared2min_poss                r,   _cubic_interpolaterW   Q   s    $ !'
J-/X"B8
	1=BG,	,BAIA~^^8G2"'AF:J(KLLGG2"'AF:J(KLLG3w+Z88'3..r+   c           
        UR                  5       R                  5       nUR                  5       nU " XU5      u  pSn[        X5      nSXEU4u  nnnnSnSnUU
:  a  XXr-  U-  -   :  d  US:  a%  UU:  a  UU/nUU/nUUR                  5       /nUU/nO[        U5      U* U-  ::  a  U/nU/nU/nSnOUS:  a  UU/nUU/nUUR                  5       /nUU/nOaUSUU-
  -  -   nUS-  nUn[	        UUUUUUUU4S9nUnUnUR                  5       nUnU " XU5      u  pUS-  n[        X5      nUS-  nUU
:  a  M  UU
:X  a
  SU/nXL/nX]/nSnWS   US   ::  a  S	OS
u  nnU(       Gd  UU
:  Ga  [        WS   US   -
  5      U-  U	:  a  GO[	        US   US   WS   US   US   US   5      nS[        U5      [        U5      -
  -  n [        [        U5      U-
  U[        U5      -
  5      U :  ax  U(       d  U[        U5      :  d  U[        U5      ::  aP  [        U[        U5      -
  5      [        U[        U5      -
  5      :  a  [        U5      U -
  nO[        U5      U -   nSnOSnOSnU " XU5      u  pUS-  n[        X5      nUS-  nXXr-  U-  -   :  d	  UUU   :  a6  UUU'   UUU'   UR                  5       WU'   UUU'   US   US   ::  a  S	OS
u  nnOj[        U5      U* U-  ::  a  SnO2UUU   UU   -
  -  S:  a   UU   UU'   UU   UU'   WU   UU'   UU   UU'   UUU'   UUU'   UR                  5       WU'   UUU'   U(       d	  UU
:  a  GM  WU   nUU   nWU   nXX.4$ )a{  Implements of line search algorithm that satisfies the strong Wolfe conditions using double zoom.

Reference:
    Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
    pp60: Algorithm 3.5 (Line Search Algorithm).

Args:
    obj_func: the objective function to minimize. ```` accepts a multivariate input and returns a scalar.
    xk (Tensor): the starting point of the iterates.
    alpha (Scalar): the initial step size.
    d (Tensor): search direction.
    loss (scalar): the initial loss
    grad (Tensor): the initial grad
    c1 (Scalar): parameter for sufficient decrease condition.
    c2 (Scalar): parameter for curvature condition.
    tolerance_change (Scalar): terminates if the change of function value/position/parameter between
        two iterations is smaller than this value.
    max_ls(int): max iteration of line search.
    alpha_max (float): max step length.

Returns:
    loss_new (Scaler): loss of obj_func at final alpha.
    grad_new, (Tensor): derivative of obj_func at final alpha.
    alpha(Tensor): optimal step length, or 0. if the line search algorithm did not converge.
    ls_func_evals (Scaler): number of objective function called in line search process.

Following summarizes the essentials of the strong Wolfe line search algorithm.
Some notations used in the description:

    - `func` denotes the objective function.
    - `obi_func` is a function of step size alpha, restricting `obj_func` on a line.

        obi_func = func(xk + alpha * d),
        where xk is the position of k'th iterate, d is the line search direction(decent direction),
        and a is the step size.
    - alpha : substitute of alpha
    - a1 is alpha of last iteration, which is alpha_(i-1).
    - a2 is alpha of current iteration, which is alpha_i.
    - a_lo is alpha in left position when calls zoom, which is alpha_low.
    - a_hi is alpha in right position when calls zoom, which is alpha_high.

Line Search Algorithm:
    repeat
        Compute obi_func(a2) and derphi(a2).
        1. If obi_func(a2) > obi_func(0) + c_1 * a2 * obi_func'(0) or [obi_func(a2) >= obi_func(a1) and i > 1],
            alpha= zoom(a1, a2) and stop;

        2. If |obi_func'(a2)| <= -c_2 * obi_func'(0),
            alpha= a2 and stop;

        3. If obi_func'(a2) >= 0,
            alpha= zoom(a2, a1) and stop;

        a1 = a2
        a2 = min(2 * a2, a2)
        i = i + 1
    end(repeat)

zoom(a_lo, a_hi) Algorithm:
    repeat
        aj = cubic_interpolation(a_lo, a_hi)
        Compute obi_func(aj) and derphi(aj).
        1. If obi_func(aj) > obi_func(0) + c_1 * aj * obi_func'(0) or obi_func(aj) >= obi_func(a_lo),
            then a_hi <- aj;
        2.
            2.1. If |obi_func'(aj)| <= -c_2 * obi_func'(0), then alpha= a2 and stop;

            2.2. If obi_func'(aj) * (a2 - a1) >= 0, then a_hi = a_lo

            a_lo = aj;
    end(repeat)

reference: https://github.com/pytorch/pytorch
r   r   FTg{Gz?
   )rP   r=   )r   r   )r   r   g?)absrI   clonerD   rW   rH   )!obj_funcxkr   r   lossgradgtdc1c2tolerance_changemax_lsd_normloss_newgrad_newls_func_evalsgtd_newt_prevf_prevg_prevgtd_prevdonels_iterbracket	bracket_f	bracket_gbracket_gtdmin_stepmax_steptmpinsuf_progresslow_poshigh_posepss!                                    r,   _strong_wolfer{   u   s|   p UUW[[]F::<D!"Q/HM(G )*4s';$FFFHDG
F
bj3../aKH.uoG*I!12I#W-Kw<B39$gG!
I!
IDa<uoG*I!12I#W-K 456>222:"h'
 !%b3h"1a F
f &e*$	$	
 N"+A,)B-"?VGXw'wqzGAJ&'&03CC #AJaLNAJaLN
" S\CL01s7|e#US\%9:S@#g,!6%3w<:Ous7|+,s53w<3G/HHL3.EL3.E!&!%"N%b3h"1 rzC//09W-- !&GH"*Ih"*.."2Ih$+K!#A,)A,6F GX 7|sSy(GH-0@@AQF$+G$4!&/&8	(#&/&8	(#(3G(<H%  %GG!)Ig!)!1Ig#*K M w'R GE!H!Hu33r+   c                     ^  \ rS rSrSr           S                       SU 4S jjjrSS jrSS jrS rS r	S r
S	 rS
 r\R                  SS j5       r S SS jjrSrU =r$ )LBFGSie  a  
The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function.
Closely related is the Newton method for minimization. Consider the iterate update formula:

.. math::
    x_{k+1} = x_{k} + H_k \nabla{f_k}

If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method.
If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then
it's a quasi-Newton. In practice, the approximated Hessians are obtained
by only using the gradients, over either whole or part of the search
history, the former is BFGS, the latter is L-BFGS.

Reference:
    Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS).

Args:
    learning_rate (float, optional): learning rate .The default value is 1.
    max_iter (int, optional): maximal number of iterations per optimization step.
        The default value is 20.
    max_eval (int|None, optional): maximal number of function evaluations per optimization
        step. The default value is max_iter * 1.25.
    tolerance_grad (float, optional): termination tolerance on first order optimality
        The default value is 1e-5.
    tolerance_change (float, optional): termination tolerance on function
        value/parameter changes. The default value is 1e-9.
    history_size (int, optional): update history size. The default value is 100.
    line_search_fn (string|None, optional): either 'strong_wolfe' or None. The default value is strong_wolfe.
    parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
        This parameter is required in dygraph mode. The default value is None.
    weight_decay (int|float|WeightDecayRegularizer|None, optional): The strategy of regularization. \
        It can be a int or float value as coeff of L2 regularization or \
        :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
        If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
        the regularization setting here in optimizer will be ignored for this parameter. \
        Otherwise, the regularization setting here in optimizer will take effect. \
        Default None, meaning there is no regularization.
    grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of \
        some derived class of ``GradientClipBase`` . There are three clipping strategies \
        ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
        :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
    name (str|None, optional): Normally there is no need for user to set this property.
        For more information, please refer to :ref:`api_guide_Name`.
        The default value is None.

Return:
    loss (Tensor): the final loss of closure.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import numpy as np

        >>> paddle.disable_static()
        >>> np.random.seed(0)
        >>> np_w = np.random.rand(1).astype(np.float32)
        >>> np_x = np.random.rand(1).astype(np.float32)

        >>> inputs = [np.random.rand(1).astype(np.float32) for i in range(10)]
        >>> # y = 2x
        >>> targets = [2 * x for x in inputs]

        >>> class Net(paddle.nn.Layer):
        ...     def __init__(self):
        ...         super().__init__()
        ...         w = paddle.to_tensor(np_w)
        ...         self.w = paddle.create_parameter(shape=w.shape, dtype=w.dtype, default_initializer=paddle.nn.initializer.Assign(w))
        ...
        ...     def forward(self, x):
        ...         return self.w * x
        ...
        >>> net = Net()
        >>> opt = paddle.optimizer.LBFGS(learning_rate=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters())
        >>> def train_step(inputs, targets):
        ...     def closure():
        ...         outputs = net(inputs)
        ...         loss = paddle.nn.functional.mse_loss(outputs, targets)
        ...         print('loss: ', loss.item())
        ...         opt.clear_grad()
        ...         loss.backward()
        ...         return loss
        ...     opt.step(closure)
        ...
        >>> for input_np, target_np in zip(inputs, targets):
        ...     input = paddle.to_tensor(input_np)
        ...     target = paddle.to_tensor(target_np)
        ...     train_step(input, target)
c                   > [        5         Uc  US-  S-  nXl        X l        X0l        X@l        XPl        X`l        Xpl        [        U[        R                  5      (       a  [        S[        U5      -   5      e[        [        5      U l        [         TU ]E  SUU	U
US9  [        U R$                  S   [        5      (       d  U R$                  U l        O([)        U R*                  5       H  u  pUS   U l        M     S U l        g )N      z^parameters argument given to the optimizer should be an iterable of Tensors or dicts, but got       ?)learning_rate
parametersweight_decay	grad_clipnamer   params)r;   r   max_itermax_evaltolerance_gradrc   history_sizeline_search_fn
isinstancer4   r   	TypeErrortyper   dictr0   super__init___parameter_list_params	enumerate_param_groups_numel_cache)selfr   r   r   r   rc   r   r   r   r   r   r   idxparam_group	__class__s                 r,   r   LBFGS.__init__  s     	!|q(H*  , 0(,j&--00<>B:>NO 
 !&
!% 	 	
 $..q1488//DL$-d.@.@$A *84 %B !r+   c                x    0 nU R                   R                  5        H  u  p#UR                  X#05        M     SU0$ )aP  Returns the state of the optimizer as a :class:`dict`.

Return:
    state, a dict holding current optimization state. Its content
    differs between optimizer classes.

Examples:
    .. code-block:: python

        >>> import paddle

        >>> paddle.disable_static()

        >>> net = paddle.nn.Linear(10, 10)
        >>> opt = paddle.optimizer.LBFGS(
        ...     learning_rate=1,
        ...     max_iter=1,
        ...     max_eval=None,
        ...     tolerance_grad=1e-07,
        ...     tolerance_change=1e-09,
        ...     history_size=100,
        ...     line_search_fn='strong_wolfe',
        ...     parameters=net.parameters(),
        >>> )

        >>> def train_step(inputs, targets):
        ...     def closure():
        ...         outputs = net(inputs)
        ...         loss = paddle.nn.functional.mse_loss(outputs, targets)
        ...         opt.clear_grad()
        ...         loss.backward()
        ...         return loss
        ...
        ...     opt.step(closure)
        ...
        >>> inputs = paddle.rand([10, 10], dtype="float32")
        >>> targets = paddle.to_tensor([2 * x for x in inputs])

        >>> n_iter = 0
        >>> while n_iter < 20:
        ...     loss = train_step(inputs, targets)
        ...     n_iter = opt.state_dict()["state"]["func_evals"]
        ...     print("n_iter:", n_iter)
r0   )r0   itemsupdate)r   packed_statekvs       r,   
state_dictLBFGS.state_dict  s@    \ JJ$$&DA' ' &&r+   c                n    U R                   c  [        S U R                  S5      U l         U R                   $ )Nc                &    XR                  5       -   $ N)numel)totalps     r,   <lambda>LBFGS._numel.<locals>.<lambda>+  s    !2r+   r   )r   r   r   )r   s    r,   _numelLBFGS._numel'  s4    $ &2DLL!!D    r+   c                   / nU R                    Hd  nUR                  c'  [        R                  " U5      R	                  S/5      nOUR                  R	                  S/5      nUR                  U5        Mf     [        R                  " USS9$ )Nr=   r   r>   )r   r_   r4   
zeros_likereshapeappendconcat)r   viewsr   views       r,   _gather_flat_gradLBFGS._gather_flat_grad0  sn    Avv~((+33RD9vv~~rd+LL  }}U++r+   c           	     >   SnU R                    Hv  nUR                  / :w  a  [        S UR                  5      OSn[        R                  " UR                  X#X5-    R                  UR                  5      U-  5      U5      nX5-  nMx     X0R                  5       :X  d   eg )Nr   c                
    X-  $ r   r#   rA   s     r,   r   !LBFGS._add_grad.<locals>.<lambda>>  s    r+   r   )r   shaper   r4   assignaddr   r   )r   r   	directionoffsetr   r   s         r,   	_add_gradLBFGS._add_grad;  s    A;<77b=F-qww7aEv~6>>qwwG%O 	A OF  &&&r+   c                `    U R                    Vs/ s H  oR                  5       PM     sn$ s  snf r   )r   r[   )r   r   s     r,   _clone_paramLBFGS._clone_paramH  s"    #'<<0<a	<000s   +c                n    [        U R                  U5       H  u  p#[        R                  " X25        M     g r   )zipr   r4   r   )r   params_datar   pdatas       r,   
_set_paramLBFGS._set_paramK  s%    DLL+6HAMM%# 7r+   c                    U R                  X45        [        U" 5       5      nU R                  5       nU R                  U5        XV4$ r   )r   r    r   r   )r   closurerB   r   r   r^   	flat_grads          r,   _directional_evaluateLBFGS._directional_evaluateO  s<    u WY**,	r+   c           
       ^ ^ [         R                  " 5          [         R                  " 5       " T5      mT R                  nT R                  nT R
                  nT R                  nT R                  nT R                  nT R                  nT R                  n	U	R                  SS5        U	R                  SS5        T" 5       n
[        U
5      nSnU	S==   S-  ss'   T R                  5       nUR                  5       R                  5       U:*  nU(       a  U
sSSS5        $ U	R!                  S5      nU	R!                  S5      nU	R!                  S5      nU	R!                  S	5      nU	R!                  S
5      nU	R!                  S5      nU	R!                  S5      nU	R!                  S5      nSnUU:  Ga  US-  nU	S==   S-  ss'   U	S   S:X  a7  UR#                  5       n/ n/ n/ n[         R$                  " SU
R&                  S9nGOUR)                  U5      nUR+                  [         R$                  " UUR&                  S95      n[-        UU5      nUS:  a  [/        U5      U:X  a3  UR1                  S5        UR1                  S5        UR1                  S5        UR3                  U5        UR3                  U5        UR3                  SU-  5        U[-        UU5      -  n[/        U5      nSU	;  a	  S/U-  U	S'   U	S   nUR#                  5       n[5        US-
  SS5       HK  n[-        UU   U5      UU   -  UU'   [         R6                  " UR9                  UU   UU   * -  5      U5        MM     [         R*                  " UU5      =nn[5        U5       HJ  n[-        UU   U5      UU   -  n [         R6                  " UR9                  UU   UU   U -
  -  5      U5        ML     Uc  UR;                  5       nO[         R6                  " UU5        UnU	S   S:X  a/  [=        SSUR                  5       R?                  5       -  5      U-  nOUn[-        X5      n!U!U* :  a  GOWSn"Ubp  US:w  a  [A        S5      eT RC                  5       n#UU 4S jn$[E        U$U#UXUU!5      u  pnn"T RG                  UU5        UR                  5       R                  5       U:*  nOyT RG                  UU5        UU:w  aa  [         R                  " 5          [        T" 5       5      nSSS5        T R                  5       nUR                  5       R                  5       U:*  nSn"UU"-  nU	S==   U"-  ss'   U(       a  OOUU-  R                  5       R                  5       U::  a  O)[        UU-
  5      U:  a  OX:  a  OUU:X  a  O	UU:  a  GM  XS'   UU	S'   UU	S'   UU	S	'   UU	S
'   UU	S'   UU	S'   UU	S'   SSS5        U
$ ! , (       d  f       N= f! , (       d  f       W
$ = f)a.  Performs a single optimization step.

Args:
    closure (callable): A closure that reevaluates the model
    and returns the loss.

Examples:
    .. code-block:: python

        >>> import paddle

        >>> paddle.disable_static()

        >>> inputs = paddle.rand([10, 10], dtype="float32")
        >>> targets = paddle.to_tensor([2 * x for x in inputs])

        >>> net = paddle.nn.Linear(10, 10)
        >>> opt = paddle.optimizer.LBFGS(
        ...     learning_rate=1,
        ...     max_iter=1,
        ...     max_eval=None,
        ...     tolerance_grad=1e-07,
        ...     tolerance_change=1e-09,
        ...     history_size=100,
        ...     line_search_fn='strong_wolfe',
        ...     parameters=net.parameters(),
        >>> )

        >>> def closure():
        ...     outputs = net(inputs)
        ...     loss = paddle.nn.functional.mse_loss(outputs, targets)
        ...     print("loss:", loss.item())
        ...     opt.clear_grad()
        ...     loss.backward()
        ...     return loss
        ...
        >>> opt.step(closure)
r   r   r   r   Nr   r   r   r   r   r   r   r!   r   )dtypeg|=r"   r=   strong_wolfez only 'strong_wolfe' is supportedc                *   > TR                  TXU5      $ r   )r   )rB   r   r   r   r   s      r,   r\   LBFGS.step.<locals>.obj_func  s    #'#=#= '1$ r+   )$r4   no_gradenable_gradr   r   r   r   rc   r   r   r0   
setdefaultr    r   rZ   rI   getneg	to_tensorr   subtractmultiplyrD   lenpopr   ranger   r   r[   rH   r@   RuntimeErrorr   r{   r   )%r   r   r   r   r   r   rc   r   r   r0   	orig_lossr^   current_evalsr   opt_condr   r   r   r   r   r   r   r!   r   rC   sysnum_oldr"   qirbe_ir`   rh   x_initr\   s%   ``                                   r,   step
LBFGS.stepV  s   R ^^((*73G ..M}}H}}H!00N#44!00N,,LJJE\1-Xq)  	I#DM,1$..0I }}**,>H  7 < 		#AIIg&EYYx(FYYx(F4BYYx(F"YY'78N		+.IF8#!h1$
 ?a'!AFFB#--cIF "**>:A

6#3#3E#IJAQBEzv;,6"JJqM"JJqMFF1I a(a(		#(+ "$c!Qi "&kG5('+f|&;dtB "A"7Q;B7 #F1Iq 1BqE 91aeeF1I"Q%,@&A1E 8 #OOAv66A"7^"6!9a02a58aeeF1IA,F&GK , ")%.__%6NMM)^< 	 ?a'Cy}}':':'<!<=M  *E )' *** !"!-%7*+MNN!%!2!2!4
 AN$feQiA= NN5!,(}}224FH NN5!,)#//1#(#3D 2$($:$:$<	#,==?#6#6#8N#J() .l#}4#  I??$((*.>>ti'(+;; !,X%C 8#F #J"E'N$E(O$E(OE$K$E(O&4E"#!*E+g j K 21a j s2   C:W+OW+4WB!W+)'W+
W(	$W++
W:c                    [        S5      e)z}Empty method. LBFGS optimizer does not use this way to minimize ``loss``. Please refer 'Examples' of LBFGS() above for usage.zeLBFGS optimizer does not use this way to minimize loss. Please refer 'Examples' of LBFGS() for usage.)NotImplementedError)r   r^   startup_programr   no_grad_sets        r,   minimizeLBFGS.minimize6  s     "s
 	
r+   )
r   r   r   r   r   r   r   r0   rc   r   )r      NgHz>&.>d   NNNNN)r   r    r   r   r   z
int | Noner   r    rc   r    r   r   r   
str | Noner   z4Sequence[Tensor] | Sequence[_ParameterConfig] | Noner   z%float | WeightDecayRegularizer | Noner   zGradientClipBase | Noner   r   returnNone)r   r.   )r   r   )r   r   )NNN)r   r   )r%   r&   r'   r(   __doc__r   r   r   r   r   r   r   r   r   non_static_onlyr   r   r*   __classcell__)r   s   @r,   r}   r}   e  s	   Xx  ## $"&%)KO>B-11!1! 1! 	1!
 1!  1! 1! #1! I1! <1! +1! 1! 
1! 1!f2'h!,'1$ ] ]@ HL
	
 
r+   r}   r   )g-C6?g?r      )#
__future__r   r7   r9   collectionsr   	functoolsr   typingr   r   r   typing_extensionsr	   r4   baser   	optimizerr   collections.abcr   r   paddle.nn.clipr   paddle.regularizerr   r   __all__r   r.   r;   rD   rW   r{   r}   r#   r+   r,   <module>r	     s    # 	  #  5 5 )    (/9+
") "i 

 !/X 
m4`W
I W
r+   