import paddle
from paddle.base import core
from paddle.base.framework import (
    Variable,
    default_main_program,
    default_startup_program,
    device_guard,
    in_dygraph_mode,
    program_guard,
)

__all__ = []


class GradientMergeOptimizer:
    """
Gradient Merge, also known as Gradient Accumulation, is a training
strategy for larger effective batch sizes. With this strategy, the
parameters are not updated until gradients have been accumulated
for a specified number of steps.

On every step, the forward and backward networks run to compute
the gradients of the parameters.

Every k steps, the optimization network runs, applying a specific
optimization method (such as SGD or Adam) to update the parameters
with the accumulated gradients.
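
With ``avg=True``, one update is conceptually equivalent to the following
sketch (the names are illustrative, not part of the API):

.. code-block:: text

    grad_merge = (grad_1 + grad_2 + ... + grad_k) / k
    param      = inner_optimizer_update(param, grad_merge)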

Args:
    inner_optimizer (Optimizer): The inner optimizer (such as SGD or Adam)
        that actually updates the parameters.
    k_steps (int): The update period of the parameters, i.e. the number of
        mini-batches whose gradients are accumulated before each update.
        Default is 1.
    avg (bool): Whether to average the accumulated gradients over
        ``k_steps`` before applying the update. Default is `True`.

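Note:
    ``minimize`` should be called once per mini-batch. Most calls only
    accumulate gradients; the inner optimizer is applied, and the merged
    gradient cleared, on every ``k_steps``-th call.
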
Examples:
    .. code-block:: pycon

        >>> import paddle
        >>> import numpy as np
        >>> paddle.enable_static()

        >>> def gen_data(batch_size):
        ...     return {
        ...         "x": np.random.random(size=(batch_size, 32)).astype('float32'),
        ...         "y": np.random.random(size=(batch_size, 1)).astype('int64'),
        ...     }

        >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2):
        ...     fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
        ...     prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
        ...     cost = paddle.nn.functional.cross_entropy(
        ...         input=prediction,
        ...         label=input_y,
        ...         reduction='none',
        ...         use_softmax=False,
        ...     )
        ...     sum_cost = paddle.mean(cost)
        ...     return sum_cost, fc_1, prediction

        >>> input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
        >>> input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
        >>> cost, fc_1, pred = mlp(input_x, input_y)
        >>> optimizer = paddle.optimizer.Adam(learning_rate=0.01)
        >>> optimizer = paddle.incubate.optimizer.GradientMergeOptimizer(optimizer, k_steps=4, avg=True)
        >>> optimizer.minimize(cost)

        >>> place = paddle.CPUPlace()
        >>> exe = paddle.static.Executor(place)
        >>> exe.run(paddle.static.default_startup_program())

        >>> for i in range(10):
        ...     cost_val = exe.run(
        ...         feed=gen_data(32),
        ...         program=paddle.static.default_main_program(),
        ...         fetch_list=[cost.name],
        ...     )
        ...     print("step=%d, cost=%f" % (i, cost_val[0]))
    """

    GRAD_MERGE_COND_NAME = "grad_merge_cond_name"

    def __init__(self, inner_optimizer, k_steps=1, avg=True):
        if in_dygraph_mode():
            raise Exception(
                "In dygraph, we don't support GradientMergeOptimizer. "
                "You can do Gradient merge by yourself with k-times forward "
                "+ backward, and one-time optimizer.minimize()"
            )

        assert inner_optimizer is not None, "inner optimizer cannot be None"
        assert (
            isinstance(k_steps, int) and k_steps > 0
        ), "k_steps should be a positive integer"

        self.inner_optimizer = inner_optimizer
        self.k_steps = k_steps
        self.type = "gradient_merge"
        self.avg = avg
        self._optimize_ops = None

    def _set_k_steps(self, k_steps):
        self.k_steps = k_steps

    def _set_avg(self, avg):
        self.avg = avg

    def backward(
        self,
        loss,
        startup_program=None,
        parameter_list=None,
        no_grad_set=None,
        callbacks=None,
    ):
        assert isinstance(loss, Variable), "The loss should be a Variable."
        assert (
            parameter_list is None
        ), "The parameter_list should be None when using GradientMergeOptimizer"
        assert (
            no_grad_set is None
        ), "The no_grad_set should be None when using GradientMergeOptimizer"

        params_grads = self.inner_optimizer.backward(
            loss, startup_program=startup_program
        )
        return params_grads

    def apply_optimize(self, loss, startup_program, params_grads):
        program = loss.block.program
        with program_guard(program, startup_program):
            optimize_ops = self.apply_gradients(params_grads)
        return optimize_ops

    def _is_the_backward_op(self, op):
        op_maker = core.op_proto_and_checker_maker
        backward = core.op_proto_and_checker_maker.OpRole.Backward
        if op_maker.kOpRoleVarAttrName() in op.attr_names and int(
            op.all_attrs()[op_maker.kOpRoleAttrName()]
        ) == int(backward):
            return True
        return False

    def _remove_op_role_var(self, param, grad):
        op_maker = core.op_proto_and_checker_maker
        op = grad.op
        assert self._is_the_backward_op(
            op
        ), f'grad.op={op} is not the backward op which produces the grad={grad.name}'

        var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()]
        assert (
            param.name in var_attr
        ), f'when using GradientMergeOptimizer, param={param.name} must be in var_attr={var_attr}'
        assert (
            grad.name in var_attr
        ), f'when using GradientMergeOptimizer, grad={grad.name} must be in var_attr={var_attr}'

        # remove (param, grad) from op_role_var
        var_attr.remove(param.name)
        var_attr.remove(grad.name)
        if len(var_attr) > 1:
            op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr)
        else:
            op._remove_attr(op_maker.kOpRoleVarAttrName())

    def _add_gm_op_role_var(self, op, param, grad, cond):
        grad.op = op
        op_maker = core.op_proto_and_checker_maker
        backward = op_maker.OpRole.Backward

        op._set_attr(self.GRAD_MERGE_COND_NAME, cond.name)
        op._set_attr(op_maker.kOpRoleAttrName(), backward)
        op._set_attr(op_maker.kOpRoleVarAttrName(), [param.name, grad.name])

    def _get_gm_cond_var(self, main_block):
        # const vars: the accumulation period k and the constant zero
        k_step_var = paddle.static.create_global_var(
            name="gradient_merge_k",
            shape=[1],
            value=int(self.k_steps),
            dtype='int32',
            persistable=True,
            force_cpu=True,
        )

        zero_var = paddle.static.create_global_var(
            name="gradient_merge_zero",
            shape=[1],
            value=0,
            dtype='int32',
            persistable=True,
            force_cpu=True,
        )

        # step var & cond var
        step_var = paddle.static.create_global_var(
            name="gradient_merge_step",
            shape=[1],
            value=0,
            dtype='int32',
            persistable=True,
            force_cpu=True,
        )

        cond_var = main_block.create_var(
            name="gradient_merge_cond", shape=[1], dtype='bool'
        )

        with device_guard("cpu"):
            # step_var = (step_var + 1) % k_step
            paddle.increment(x=step_var, value=1.0)
            main_block.append_op(
                type='elementwise_mod',
                inputs={'X': step_var, 'Y': k_step_var},
                outputs={'Out': step_var},
                attrs={'axis': -1},
            )

            # cond_var = (step_var == 0)
            main_block.append_op(
                type='equal',
                inputs={'X': step_var, 'Y': zero_var},
                outputs={'Out': cond_var},
            )

        return cond_var

    def apply_gradients(self, params_grads):
        main_program = default_main_program()
        startup_program = default_startup_program()
        main_block = main_program.global_block()
        startup_block = startup_program.global_block()

        cond = self._get_gm_cond_var(main_block)

        # TODO(mapingshuo) support sparse embedding

        # step1: remove grad.op's op_role_var
        for param, grad in params_grads:
            assert (
                param.type != core.VarDesc.VarType.SELECTED_ROWS
            ), "SELECTED_ROWS is not supported in GradientMergeOptimizer for now"
            self._remove_op_role_var(param, grad)

        param_to_gradient_merge = {}
        new_params_grads = []
        # step2: create a gradient_merge var for each parameter,
        # initialize it to 0, and accumulate the gradient into it
        for param, grad in params_grads:
            param_name = param.name
            param_var = main_block.var(param_name)
            assert param_var is not None
            gradient_merge_var = main_block.create_var(
                name=param_name + "@GRAD@GradientMerge",
                shape=param_var.shape,
                dtype=param_var.dtype,
                persistable=True,
            )
            param_to_gradient_merge[param_name] = gradient_merge_var

            startup_gradient_merge_var = startup_block.create_var(
                name=param_name + "@GRAD@GradientMerge",
                shape=param_var.shape,
                dtype=param_var.dtype,
                persistable=True,
            )
            startup_block.append_op(
                type="fill_constant",
                outputs={"Out": startup_gradient_merge_var},
                attrs={
                    "shape": param_var.shape,
                    "dtype": param_var.dtype,
                    "value": 0.0,
                },
            )

            # grad_merge += grad
            new_grad_op = main_block.append_op(
                type="elementwise_add",
                inputs={'X': grad, 'Y': gradient_merge_var},
                outputs={'Out': gradient_merge_var},
                attrs={'axis': -1},
            )
            self._add_gm_op_role_var(
                new_grad_op, param, gradient_merge_var, cond
            )
            new_params_grads.append([param, gradient_merge_var])

        def true_apply_gradient():
            cur_block_idx = main_program.current_block_idx
            cur_block = main_program.current_block()

            # cur_block's forward_block & backward_block is itself
            cur_block._set_forward_block_idx(cur_block_idx)
            op_maker = core.op_proto_and_checker_maker

            if self.avg:
                for param, new_grad in new_params_grads:
                    # grad /= k_steps
                    cur_block.append_op(
                        type='scale',
                        inputs={'X': new_grad},
                        outputs={'Out': new_grad},
                        attrs={
                            'scale': 1.0 / self.k_steps,
                            'bias': 0.0,
                            'bias_after_scale': False,
                        },
                    )
                    new_grad.op._set_attr(
                        op_maker.kOpRoleAttrName(), op_maker.OpRole.Backward
                    )

            for param, new_grad in new_params_grads:
                # NOTE. regularization will append ops to grad.block,
                # while new_grad's real block is global_block,
                # but we want to append the regularization ops in cur_block
                new_grad.block = cur_block

            self._optimize_ops = self.inner_optimizer.apply_gradients(
                new_params_grads
            )

            # clear gradient_merge_vars
            for param, new_grad in new_params_grads:
                paddle.tensor.fill_constant(
                    shape=new_grad.shape,
                    dtype=new_grad.dtype,
                    value=0.0,
                    out=new_grad,
                )
                new_grad.op._set_attr(
                    op_maker.kOpRoleAttrName(), op_maker.OpRole.Optimize
                )

        # step3: apply the merged gradient every k_steps
        paddle.static.nn.cond(cond, true_fn=true_apply_gradient, false_fn=None)

        return self._optimize_ops

    def minimize(
        self, loss, startup_program=None, parameter_list=None, no_grad_set=None
    ):
        assert isinstance(loss, Variable), "The loss should be a Variable."

        params_grads = self.backward(
            loss,
            startup_program=startup_program,
            parameter_list=parameter_list,
            no_grad_set=no_grad_set,
        )

        optimize_ops = self.apply_optimize(
            loss, startup_program=startup_program, params_grads=params_grads
        )

        return optimize_ops, params_grads