
    ϑi)                        S SK Jr  S SKJrJr  S SKrS SKrSSKJ	r	  SSK
JrJrJr  \(       a  S SKJr  S SKJr           S
                       SS	 jjrg)    )annotations)TYPE_CHECKINGLiteralN   )strong_wolfe)_value_and_gradient&check_initial_inverse_hessian_estimatecheck_input_type)Callable)Tensorc                  ^ ^^^^^^^	^ T	S;  a  [        ST	 S35      eSn[        USU5        [        R                  " UR                  S   T	S9mUc  TnO[        USU5        [        U5        [        R                  " U5      n[        R                  " UR                  5       5      n[        T U5      u  p[        R                  " S	/S	S
S9n[        R                  " S	/SS
S9n[        R                  " S	/SSS9n[        R                  " S	/SSS9nU4S jnUU	UUUU UU4S jn[        R                  R                  R                  UUUUUUXX/S9  UUXX4$ )ao  
Minimizes a differentiable function `func` using the BFGS method.
The BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function.
Closely related is the Newton method for minimization. Consider the iterate update formula:

.. math::
    x_{k+1} = x_{k} + H_k \nabla{f_k}

If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method.
If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then
it's a quasi-Newton. In practice, the approximated Hessians are obtained
by only using the gradients, over either whole or part of the search
history, the former is BFGS, the latter is L-BFGS.

Reference:
    Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp140: Algorithm 6.1 (BFGS Method).

Args:
    objective_func: the objective function to minimize. ``objective_func`` accepts a 1D Tensor and returns a scalar.
    initial_position (Tensor): the starting point of the iterates, has the same shape with the input of ``objective_func`` .
    max_iters (int, optional): the maximum number of minimization iterations. Default value: 50.
    tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. Default value: 1e-7.
    tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9.
    initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. If not given, will use an identity matrix of order N, which is size of ``initial_position`` . Default value: None.
    line_search_fn (str, optional): indicate which line search method to use, only support 'strong wolfe' right now. May support 'Hager Zhang' in the future. Default value: 'strong wolfe'.
    max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50.
    initial_step_length (float, optional): step length used in first iteration of line search. different initial_step_length may cause different optimal result. For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0.
    dtype ('float32' | 'float64', optional): data type used in the algorithm, the data type of the input parameter must be consistent with the dtype. Default value: 'float32'.
    name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None.

Returns:
    output(tuple):

        - is_converge (bool): Indicates whether found the minimum within tolerance.
        - num_func_calls (int): number of objective function called.
        - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of the objective function regarding to the initial position.
        - objective_value (Tensor): objective function value at the `position`.
        - objective_gradient (Tensor): objective function gradient at the `position`.
        - inverse_hessian_estimate (Tensor): the estimate of inverse hessian at the `position`.

Examples:
    .. code-block:: python
        :name: code-example1

        >>> # Example1: 1D Grid Parameters
        >>> import paddle
        >>> # Randomly simulate a batch of input data
        >>> inputs = paddle. normal(shape=(100, 1))
        >>> labels = inputs * 2.0
        >>> # define the loss function
        >>> def loss(w):
        ...     y = w * inputs
        ...     return paddle.nn.functional.square_error_cost(y, labels).mean()
        >>> # Initialize weight parameters
        >>> w = paddle.normal(shape=(1,))
        >>> # Call the bfgs method to solve the weight that makes the loss the smallest, and update the parameters
        >>> for epoch in range(0, 10):
        ...     # Call the bfgs method to optimize the loss, note that the third parameter returned represents the weight
        ...     w_update = paddle.incubate.optimizer.functional.minimize_bfgs(loss, w)[2]
        ...     # Use paddle.assign to update parameters in place
        ...     paddle. assign(w_update, w)

    .. code-block:: python
        :name: code-example2

        >>> # Example2: Multidimensional Grid Parameters
        >>> import paddle
        >>> def flatten(x):
        ...     return x. flatten()
        >>> def unflatten(x):
        ...     return x.reshape((2,2))
        >>> # Assume the network parameters are more than one dimension
        >>> def net(x):
        ...     assert len(x.shape) > 1
        ...     return x.square().mean()
        >>> # function to be optimized
        >>> def bfgs_f(flatten_x):
        ...     return net(unflatten(flatten_x))
        >>> x = paddle.rand([2,2])
        >>> for i in range(0, 10):
        ...     # Flatten x before using minimize_bfgs
        ...     x_update = paddle.incubate.optimizer.functional.minimize_bfgs(bfgs_f, flatten(x))[2]
        ...     # unflatten x_update, then update parameters
        ...     paddle.assign(unflatten(x_update), x)
)float32float64z?The dtype must be 'float32' or 'float64', but the specified is .minimize_bfgsinitial_positionr   dtype initial_inverse_hessian_estimater   int64shape
fill_valuer   Fboolc                   > U T:  U) -  $ )N )	kdoneis_convergenum_func_callsxkvalueg1Hk	max_iterss	           i/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/incubate/optimizer/functional/bfgs.pycondminimize_bfgs.<locals>.cond   s    I$&&    c           	       >^ [         R                  " Xv5      * nTS:X  a  [        TUUTTTS9u  ppO[        ST S35      eX;-  nX-  nX-
  nXL-   nU
n[         R                  " US5      n[         R                  " US5      n[         R
                  " X5      m[         R                  R                  R                  TS:H  U4S jU4S j5      nTX-  UR                  5       -  -
  nTX-  UR                  5       -  -
  n[         R                  " [         R                  " X5      U5      X-  UR                  5       -  -   nU S	-  n [         R                  R                  U[        R                  S
9n[         R                  R                  U[        R                  S
9n[         R                  " UUT:  -  UT:  -  U5        [         R                  " X5        [         R                  " XS:H  -  U5        XX#XEXg/$ )Nr   )fr!   pkr%   initial_step_lengthr   zNCurrently only support line_search_fn = 'strong_wolfe', but the specified is ''r   g        c                 2   > [         R                  " S/ST S9$ )Nr   g     @@r   )paddlefullr   s   r&   <lambda>-minimize_bfgs.<locals>.body.<locals>.<lambda>   s    FKKqcfEJr)   c                    > ST -  $ )N      ?r   )rhok_invs   r&   r2   r3      s	    C(Nr)   r   )p)r0   matmulr   NotImplementedError	unsqueezedotstaticnnr'   tlinalgnormnpinfassign)r   r   r   r    r!   r"   r#   r$   r,   alphag2ls_func_callsskykrhokVk_transposeVkgnormpk_normr6   Ir   r-   line_search_fnmax_line_search_itersobjective_functolerance_changetolerance_grads                      @r&   bodyminimize_bfgs.<locals>.body   s   mmB## ^+.: /$7/+E"m &`ao`ppqr  	' ZWWb!$b!$::b%}}$$OJ"
 49rttv--RTTV##MM&--92>i"$$& ! 	
 	
Q ""2"0--$$R266$2EN*+w9I/IJD	
 	d(dsl+T2bHHr)   )r'   rT   	loop_vars)
ValueErrorr
   r0   eyer   r	   rC   detachr   r1   r<   r=   
while_loop)rQ   r   r%   rS   rR   r   rO   rP   r-   r   nameop_namer$   r!   r"   r#   r    r   r   r   r'   rT   rN   s   ` ``` ````            @r&   r   r   $   sb   F **MeWTUV
 	
 G%'97C

#))!,E:A'/+,(,.	

 	//OP	7	8B	'..0	1B#NB7IE[[sqHN 	1#!7;A;;aSU&AD++QCEHK'7I 7Ir MMdKBK   
 299r)   )	2   gHz>g&.>Nr   r]   r5   r   N)rQ   zCallable[[Tensor], Tensor]r   r   r%   intrS   floatrR   r_   r   zTensor | NonerO   zLiteral['strong_wolfe']rP   r^   r-   r_   r   zLiteral['float32', 'float64']r[   z
str | Nonereturnz0tuple[bool, int, Tensor, Tensor, Tensor, Tensor])
__future__r   typingr   r   numpyrA   r0   line_searchr   utilsr   r	   r
   collections.abcr   r   r   r   r)   r&   <module>rg      s    # )   %  (  "6:.<!#!$+4C:.C:C: C: 	C:
 C: '4C: ,C: C: C: )C: C: 6C:r)   