
    x-j<4                        d dl mZ d dlmZ d dlZd dlmZmZ d dlm	Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ erd dlmZ d dlmZ d dlmZ g Z G d de          ZdS )    )annotations)TYPE_CHECKINGN)	frameworkunique_name)base)Variable)LayerHelper)in_pir_mode)	Optimizer)create_parameter)Tensor)Operator)Programc                       e Zd ZU dZded<   ded<   ded<   ded	<   d
ed<   dZ	 	 	 d$d% fdZ fdZej	        e
j        d&d                        Zd Zd Zd Ze
j        	 	 	 d'd(d#            Z xZS ))	LookAheada  
    This implements the Lookahead optimizer of the
    paper : https://arxiv.org/abs/1907.08610.

    Lookahead keeps two sets of params: the fast_params and
    the slow_params. inner_optimizer update fast_params every
    training step. Lookahead updates the slow_params and fast_params
    every k training steps as follows:

    .. math::

        slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1})

        fast\_param_t &=  slow\_param_t

    Args:
        inner_optimizer (Optimizer): The optimizer that update fast params step by step.
        alpha (float, optional): The learning rate of Lookahead. The default value is 0.5.
        k (int, optional): The slow params is updated every k steps. The default value is 5.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> import paddle
            >>> import paddle.nn as nn

            >>> BATCH_SIZE = 16
            >>> BATCH_NUM = 4
            >>> EPOCH_NUM = 4

            >>> IMAGE_SIZE = 784
            >>> CLASS_NUM = 10
            >>> # define a random dataset
            >>> class RandomDataset(paddle.io.Dataset): # type: ignore[type-arg]
            ...     def __init__(self, num_samples):
            ...         self.num_samples = num_samples
            ...     def __getitem__(self, idx):
            ...         image = np.random.random([IMAGE_SIZE]).astype('float32')
            ...         label = np.random.randint(0, CLASS_NUM - 1,
            ...                                 (1, )).astype('int64')
            ...         return image, label
            ...     def __len__(self):
            ...         return self.num_samples

            >>> class LinearNet(nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...         self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
            ...         self.bias = self._linear.bias
            ...     @paddle.jit.to_static
            ...     def forward(self, x):
            ...         return self._linear(x)

            >>> def train(layer, loader, loss_fn, opt):
            ...     for epoch_id in range(EPOCH_NUM):
            ...         for batch_id, (image, label) in enumerate(loader()):
            ...             out = layer(image)
            ...             loss = loss_fn(out, label)
            ...             loss.backward()
            ...             opt.step()
            ...             opt.clear_grad()
            ...             print("Train Epoch {} batch {}: loss = {}".format(
            ...                 epoch_id, batch_id, np.mean(loss.numpy())))
            >>> layer = LinearNet()
            >>> loss_fn = nn.CrossEntropyLoss()
            >>> optimizer = paddle.optimizer.SGD(learning_rate=0.1, parameters=layer.parameters())
            >>> lookahead = paddle.incubate.LookAhead(optimizer, alpha=0.2, k=5)

            >>> # create data loader
            >>> dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
            >>> loader = paddle.io.DataLoader(
            ...     dataset,
            ...     batch_size=BATCH_SIZE,
            ...     shuffle=True,
            ...     drop_last=True,
            ...     num_workers=2)

            >>> # doctest: +SKIP('The run time is too long to pass the CI check.')
            >>> train(layer, loader, loss_fn, lookahead)

    r   inner_optimizerfloatalphaintkstrtyper	   helperslow      ?   Nname
str | NonereturnNonec                .   |
J d            d|cxk    rdk    sn J d            t          |t                    r|dk    s
J d            || _        | j        j        Ct          j                                                                                                        }n| j        j        }t                      
                    ||d d |           || _        || _        d| _        t          | j        j                  | _        d | _        d | _        d S )	Nzinner optimizer can not be None              ?zBalpha should be larger or equal to 0.0, and less or equal than 1.0r   zk should be a positive integer)learning_rate
parametersweight_decay	grad_clipr   	lookahead)
isinstancer   r   _parameter_listpaddlestaticdefault_main_programglobal_blockall_parameterssuper__init__r   r   r   r	   	__class____name__r   _global_step_var_k_var)selfr   r   r   r   r%   r2   s         c/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/incubate/optimizer/lookahead.pyr1   zLookAhead.__init__   s+    **,M***e""""s"""""P #"" !S!!Ma!eee-Mee+./72244!! J -=J! 	 	
 	
 	
 
	!$."9:: $    c                    t                                          ||           | j                            ||           d S N)r0   _set_auxiliary_varr   )r6   keyvalr2   s      r7   r;   zLookAhead._set_auxiliary_var   s<    ""3,,,//S99999r8   c                6   | j                                          |                                  g }| j        D ]I}|j        s
|                                +|                                }|                    ||f           J|                     dd|           dS )a  
        Execute the optimizer and update parameters once.

        Returns:
            None

        Examples:

            .. code-block:: python

                >>> import paddle
                >>> inp = paddle.rand([1,10], dtype="float32")
                >>> linear = paddle.nn.Linear(10, 1)
                >>> out = linear(inp)
                >>> loss = paddle.mean(out)
                >>> sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters())
                >>> lookahead = paddle.incubate.LookAhead(sgd, alpha=0.2, k=5)
                >>> loss.backward()
                >>> lookahead.step()
                >>> lookahead.clear_grad()

        N)lossstartup_programparams_grads)r   step_increment_global_varr*   	trainable
_grad_ivarappend_apply_optimize)r6   rA   paramgrad_vars       r7   rB   zLookAhead.step   s    2 	!!###""$$$) 	7 	7E? !!- ++--##UH$5666t, 	 	
 	
 	
 	
 	
r8   c                    t          |t          j        t          j        j        f          sJ |D ]}|                     | j        |           d S r:   )r)   r   Blockr+   pir_add_accumulator	_slow_str)r6   blockr%   ps       r7   _create_accumulatorszLookAhead._create_accumulators   sX    %)/6:3C!DEEEEE 	5 	5A!!$.!4444	5 	5r8   c           
        t                      rx| j        Pt          ddgt          j        d          dt
          j        j                            dd                    | _        t          j	        | j        d          | _        d S | j        <t
          j
                            t          j        d          dgd	dd
          | _        | j                            dd| j        gid| j        giddi           d S )Nint32   lookahead_stepFr"   value	force_cpudtypeshaper   rD   initializerr#   r   Tr   r[   rW   rZ   persistable	incrementXOutrB   )r   inputsoutputsattrs)r
   r4   r   r   generater+   nnr\   ConstantInitializerr_   r,   create_global_varr   	append_op)r6   s    r7   rC   zLookAhead._increment_global_var   s"   == 	$,(8!#$-.>??# &	 5 I I!U !J ! !) ) )% %+$4T5JC$P$PD!!!$,(.(G(G$-.>??#! $ )H ) )% K!! d345!6 78sm	 "     r8   c                   t          j        dgdd          }t          j        dgdd          }t                      r^t	          ddgt          j        d          dt           j        j        	                    t          | j                  d          	          }n<t           j                            t          j        d          dg| j        dd
          }t          j        | j        |          }t          j        | j        |          }t          j        |d          }t          j        ||          }t          j        |d          }|                     | j        |d                   }	||d         z  d|z
  |	z  z   }
t          j        |
|	           | j        |d         z  d| j        z
  |	z  z   }
||
z  d|z
  |d         z  z   }t          j        ||d                    ||
z  d|z
  |	z  z   }t          j        ||	           d S )NrT   rS   lookahead_ones)r[   rZ   r   lookahead_zeroslookahead_kFrV   rY   Tr]   float32)rZ   r   r#   )r+   oneszerosr
   r   r   re   rf   r\   rg   r   r   r,   rh   	remainderr4   equalcast_get_accumulatorrN   assignr   )r6   rO   param_and_gradone_varzero_vark_varmodcond_1cond_2slow_vartmp_var	tmp_var_1s               r7   _append_optimize_opzLookAhead._append_optimize_op   s   +QCw=MNNN<#W+<
 
 
 == 	$c )-88"I1EE--5 F    EE M33 )-88cf  4  E t4e<<d3W==V9555c8,,V9555((9JKK>!,,F
h/FFgx(((*~a00C$*4D3PPW$F
nQ6G'GG	i!2333W$F
h'>>	i*****r8   r?   r   r@   Program | Noner%   list[Tensor] | list[str] | Noneno_grad_setset[Tensor] | set[str] | None2tuple[list[Operator], list[tuple[Tensor, Tensor]]]c                   t          |t          t          j        j        f          s
J d            | j                            ||||          \  }}|                                  |                     |||          }||fS )a  
        Add operations to minimize ``loss`` by updating ``parameters``.

        Args:
            loss (Tensor): A ``Tensor`` containing the value to minimize.
            startup_program (Program, optional): :ref:`api_paddle_static_Program` for
                initializing parameters in ``parameters``. The default value
                is None, at this time :ref:`api_paddle_static_default_startup_program` will be used.
            parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
                to minimize ``loss``. The default value is None, at this time all parameters
                will be updated.
            no_grad_set (set, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
                to be updated. The default value is None.

        Returns:
            tuple: tuple (optimize_ops, params_grads), A list of operators appended
            by minimize and a list of (param, grad) tensor pairs, param is
            ``Parameter``, grad is the gradient value corresponding to the parameter.
            In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
            indicate program pruning. If so, the program will be pruned by ``feed`` and
            ``fetch_list`` before run, see details in ``Executor``.

        Examples:

            .. code-block:: python

                >>> import paddle

                >>> inp = paddle.rand([1, 10], dtype="float32")
                >>> linear = paddle.nn.Linear(10, 1)
                >>> out = linear(inp)
                >>> loss = paddle.mean(out)
                >>> sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters())
                >>> lookahead = paddle.incubate.LookAhead(sgd, alpha=0.2, k=5)
                >>> loss.backward()
                >>> lookahead.minimize(loss)
                >>> lookahead.clear_grad()

        zThe loss should be an Tensor.)r@   r%   r   )r@   rA   )	r)   r   r+   rL   Valuer   minimizerC   rG   )r6   r?   r@   r%   r   optimize_opsrA   _s           r7   r   zLookAhead.minimize&  s    ^ $6:+; <== 	
 	
+	
 	
=
 &*%9%B%B+!#	 &C &
 &
"l 	""$$$  / ! 
 
 \))r8   )r   r   N)
r   r   r   r   r   r   r   r   r   r    )r   r    )NNN)
r?   r   r@   r   r%   r   r   r   r   r   )r3   
__module____qualname____doc____annotations__rN   r1   r;   r   dygraph_onlyimperative_baseno_gradrB   rQ   rC   r   r   __classcell__)r2   s   @r7   r   r   $   sN        U Un LLL
FFFIIII
 $ $ $ $ $ $ $L: : : : : $
 $
 $
  $
L5 5 5  <*+ *+ *+X  +/6:59@* @* @* @* @* @* @* @* @*r8   r   )
__future__r   typingr   r+   paddle.baser   r   paddle.base.dygraphr   r   paddle.base.frameworkr   paddle.base.layer_helperr	   paddle.frameworkr
   paddle.optimizerr   paddle.pir.corer   r   r   paddle.staticr   __all__r    r8   r7   <module>r      s;   # " " " " "              . . . . . . . . 7 7 7 7 7 7 * * * * * * 0 0 0 0 0 0 ( ( ( ( ( ( & & & & & & , , , , , , &......%%%%%% C* C* C* C* C*	 C* C* C* C* C*r8   