
    ϑiH                         S SK r S SKrS SKJrJr  S SKJr  S SKJrJ	r	  S SK
Jr  S SKJr  S SKJr  S rS	 r " S
 S\5      rg)    N)coreunique_name)global_scope)Variable
name_scope)LayerHelper)ClipGradByGlobalNorm)	Optimizerc                    [         R                  S   nUR                  S5       Vs/ s H)  oUR                  5       (       d  M  UR                  5       PM+     nnXA   nU Vs/ s H  owU:w  d  M
  XG   PM     nnUR	                  U5      n	[
        R                  " S5      n
U R                  U
S[        R                  R                  R                  S9n[        R                  " 5       (       a  U R                  S0 SU0U	UUUS.S	9  O[        R                  " 5       (       a  U R                  S
0 SU0U	UUUS.S	9  Oa[        R                   R#                  5       R$                  [        R&                  R)                  5       ;   a  U R                  S0 SU0U	UUUS.S	9  U R                  SSU00 [+        U5      U	USR-                  U5      S.S	9  U R                  [
        R                  " S5      S9nU R                  SSU0SS0S9  U R                  SSU0SU0U[        R                   R.                  R0                  S.S	9  U R                  SSU0SU0S9  U$ s  snf s  snf )NPADDLE_TRAINER_ENDPOINTS,comm_idT)namepersistabletypec_gen_nccl_idOut)rankendpointother_endpointsring_idr   inputsoutputsattrsc_gen_bkcl_idc_gen_xccl_idc_comm_initX)nranksr   r   	endpointstmp)r   fill_constantvalue   )r   r   r   
all_reducexout)r   reduce_typec_sync_calc_stream)r   r   r   )osenvironsplitstripindexr   generate
create_varr   VarDescVarTypeRAWis_compiled_with_cuda	append_opis_compiled_with_xpupaddledistributedParallelEnvdevice_typedeviceget_all_custom_device_typelenjoinReduceOpSUM)blockr   ranksr   epsepcur_epr	other_eps
local_rankcomm_var_namecomm_id_vartmp_vars                p/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/incubate/optimizer/distributed_fused_lamb.pyinit_communicatorrN      s   
**/
0C #		#
="((*:288:C
=YF!&4At)I4T"J((3M""4<<3G3G3K3K # K !!## K(""#,"		 	 
	
 
	"	"	$	$ K(""#,"		 	 
	
 	&&(44==335	6 	 K(""#,"		 	 
	
 
OO[!%j#	
	  
 K$8$8$?@G	OOug&6wl   
OOW~ !--66::
	   
OO!W~   
 N[ >4s   II!	I.Ic                 F    U H  nU R                  SSU0SU0SU0S9  M     g )N	broadcastr'   r(   r   r   )r6   )rB   
parametersr   ps       rM   broadcast_parametersrS   l   s8    8AJ7	 	 	
     c                      ^  \ rS rSr                 SU 4S jjrS rS rS rS rS r	S	 r
SS
/S4S jrSS jrS rS rS rSrU =r$ )DistributedFusedLambx   Nc                   > [         R                  " 5       (       a   S5       e[        TU ]  US US9  X0l        X@l        XPl        Ub  UOSU l        Ub)  [        U[        5      (       d   S5       eUR                  nOSnUU l        Ub  UOSU l        Xl        Xl        Xl        S U l        Xl        Xl        Xl        Xl        UU l        U R$                  S:  d   e[-        S5      U l        S	U l        U R.                  R2                  R5                  5       nUR7                  [8        R:                  " S
5      S/[<        R>                  R@                  RB                  S9U l"        S U l#        U R$                  S:  aN  UR7                  [8        R:                  " S5      S/[<        R>                  R@                  RB                  S9U l$        OS U l$        0 U l%        g )Nz2DistributedFusedLamb does not support dygraph mode)learning_rate	grad_clipr           z>Only ClipGradByGlobalNorm is supported in DistributedFusedLambg      r%   distributed_fused_lambT	found_inf)r   shapedtypestop_update)&r8   in_dynamic_modesuper__init___beta1_beta2_epsilon_weight_decay
isinstancer	   	clip_norm_max_global_grad_norm
_alignment_clip_after_allreduce_is_grad_scaled_by_nranks_exclude_from_weight_decay_fn_scale_use_master_param_norm_gradient_accumulation_steps_use_master_acc_grad_nproc_per_node_use_hierarchical_allreducer   helper_supports_check_nan_infmain_programglobal_blockr1   r   r0   r   r2   r3   BOOL
_found_inf_step_stop_update_param_to_master_param)selfrY   lamb_weight_decaybeta1beta2epsilonrQ   rZ   exclude_from_weight_decay_fnclip_after_allreduceis_grad_scaled_by_nranks	alignmentuse_master_param_normgradient_accumulation_stepsuse_master_acc_gradnproc_per_nodeuse_hierarchical_allreducer   max_global_grad_norm
main_block	__class__s                       rM   rd   DistributedFusedLamb.__init__y   s   ( ))++ 	
@	
+ 	}4P!2!>C 	  i)=>> P> $-#6#6 #' %9"'0'<)"%9")A&-I*&;#,G)$7!-+E(00A555!":;'+$[[--::<
$//%%k2#,,&&++ 0 

 
,,q0 * 5 5 ))-8cll**// !6 !D !%D&(#rT   c                 8    U R                   b  U R                   $ S$ )NF)r}   r   s    rM   _get_stop_update_var)DistributedFusedLamb._get_stop_update_var   s    $($5$5$At  LuLrT   c                     Xl         g N)r|   )r   steps     rM   	_set_stepDistributedFusedLamb._set_step   s    
rT   c                 ^    U R                   c  U R                  SSS9U l         U R                   $ )Nr   int64r`   )r|   _create_persistable_varr   s    rM   _get_or_create_step(DistributedFusedLamb._get_or_create_step   s-    ::55fG5LDJzzrT   c                 f    Uc   e[        U[        5      (       d  U R                  U5      nXl        g r   )ri   r   _create_scale_from_constantrp   )r   scales     rM   
_set_scaleDistributedFusedLamb._set_scale   s0       %**44U;ErT   c                     [         R                  " S5      n[        R                  R	                  US/S[        U5      SS9$ )Nglobal_scaler%   float32T)r   r_   r`   r$   r   )r   r0   r8   staticcreate_global_varfloat)r   r$   r   s      rM   r   0DistributedFusedLamb._create_scale_from_constant   sC    ##N3}}..#, / 
 	
rT   c                 `    U R                   c  U R                  S5      U l         U R                   $ )Ng      ?)rp   r   r   s    rM   _get_or_create_scale)DistributedFusedLamb._get_or_create_scale   s(    ;;::3?DK{{rT   r\   r   c                 P   U R                   R                  R                  5       nUb  [        R                  " U5      nUR                  UUUSSS9nU R                   R                  R                  5       nUR                  UR                  UR                  UR                  SSS9nU$ )NT)r   r_   r`   r   stop_gradient)
rv   startup_programry   r   r0   r1   rx   r   r_   r`   )r   r   r_   r`   startup_blockstartup_varr   main_vars           rM   r   ,DistributedFusedLamb._create_persistable_var   s    33@@B''-D#.. / 
 [[--::<
((!!#### ) 
 rT   c                 J   Uc
  [        5       nU R                  R                  U5      nUc   eUR                  U5      R	                  5       nUR                  5       [        R                  :X  d   eUR                  U5      R	                  5       nUR                  5       [        R                  :X  a(  UR                  5       UR                  5       :X  d   eUS 4$ UR                  5       [        R                  :X  d   eUR                  5       UR                  5       :X  d   eXT4$ r   )r   r~   getfind_var
get_tensor_dtyper8   r   _ptrfloat16r_   )r   r   scopemaster_parammaster_param_tparam_ts         rM   _get_parameter#DistributedFusedLamb._get_parameter   s    = NE2266t<'''5@@B$$&&..888..&113>>v~~-<<>^%8%8%::::D= >>#v~~555==?n&:&:&<<<<**rT   c                 &    U R                  U5        g r   )apply_gradients)r   params_gradss     rM   apply_optimize#DistributedFusedLamb.apply_optimize
  s    \*rT   c                 2   / nU H  u  p4UR                  X4/5        M     US   R                  R                  R                  U5         [	        S5         U R                  U5        S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f)Nr   	optimizer)extendrB   program_optimized_guardr   _apply_gradients_impl)r   r   	flattenedrR   gs        rM   r   $DistributedFusedLamb.apply_gradients  sx    	 DAaV$ ! aL&&77	B{#&&|4 $ CB## CBs$   BA7&B7
B	B
Bc                 f   U HE  u  p#UR                   [        R                  R                  R                  :X  d   S5       eSUl        MG     U R                  S5      nU R                  S5      nU R                  SSS9nU R                  SSS9n/ nU HJ  u  p#U R                  S	5      n	U	R                  U R                  UR                  '   UR                  U	5        ML     U R                  S
5      n
SU
l
        U R                  S5      nSUl
        U R                  S5      nU R                  S5      nU R                  SSS9nSUl
        U R                  SSS9nU R                  SSS9nSUl
        U R                  SSS9nSUl
        U R                  SSS9nSUl
        U R                  S:  a5  U R                  S5      /nU R                  SSS9/nU R                  SSS9/nO/ n/ n/ nU R                  5       n[        R                  R                  5       n[        R                  R!                  5       nU R"                  c  UnOU R"                  nUU-  S:X  d   S5       eUU:  nUU-  n[%        UU-  5      n[%        UU-  5      n/ nU R&                  R(                  R+                  5       nUS:  a1  [-        UU[/        [1        U5      5      S5      n UR                  U 5        Sn!US:  a  [3        U5      S::  a  U(       a  [/        [1        UU-  US-   U-  5      5      n"[-        UUU"S5      n UR                  U 5        U R4                  (       aF  UU:  a@  Sn![/        [1        UU-  UU5      5      n#[-        UUU#US   S-   5      n UR                  U 5        U R7                  5       n$U VV%s/ s H  u  nn%UPM
     n&nn%U V%Vs/ s H  u  n%o3PM	     n'n%nS/[3        U&5      -  n(U R8                  b2  [;        U&5       H#  u  n)nU R9                  U5      (       d  M  SU(U)'   M%     U' HH  nUR=                  UR                  UR                   UR>                  UR
                  UR@                  S9  MJ     US:  a  [C        UU&US   5        URE                  SU&U'S.0 S U/_S!U/_S"U/_S#U/_S$U
/_S%U/_S&U/_S'U/_S(U$/_S)U/_S*U&_S+U_S,U'_S-U/_S.U/_S/U/_S0U/_S1U/0EU RF                  U(       a  UOUU(       a  UOUU(S2S2U RH                  U RJ                  S3.S49  U R&                  RL                  R+                  5       n*U RO                  5         S n+U HI  n,U+c  U RQ                  U,5      n+M  U RQ                  U,5      n-[S        U+5      [S        U-5      :X  a  MD   S55       e   U+c   eU*RE                  S60 S U/_S!U/_S"U/_S#U/_S7U+/_S$U
/_S%U/_S&U/_S'U/_S(U$/_S)U/_S8U&_S9U'_S/U/_S-U/_S.U/_S0U/_U/U/U
/U/U/U/U&U'U RT                  /UUUU RV                  b  U RV                  O/ U/S:.U RX                  U RH                  U RJ                  U RZ                  U R\                  U R^                  UUUU R`                  U Rb                  U R                  U Rd                  U!S;.S49n.U./$ s  sn%nf s  snn%f )<NzOnly support dense gradientTfp32_fused_paramfp32_fused_gradfp16_fused_paramr   r   fp16_fused_gradmaster_weightmoment1moment2beta1powbeta2pow
param_infoint32fused_offsetsfp32_partial_fused_offsetsfp16_partial_fused_offsetsparam_orderr%   fp32_acc_fused_gradfp16_acc_fused_gradacc_stepr   r   z2nranks should be exactly divided by nproc_per_nodeFr\   )r   r   r`   r   r_   distributed_fused_lamb_init)ParamGradFP32FusedParamFP32FusedGradFP16FusedParamFP16FusedGradMoment1Moment2Beta1PowBeta2PowGlobalScale	ParamInfoParamOutMasterParamOutGradOutFP32ShardFusedParamOffsetsFP16ShardFusedParamOffsetsFusedParamOffsets
ParamOrderStepr[   )r   r   r    apply_weight_decayr   r   r   r   r   z7The learning rate for each parameter should be the samer]   LearningRater   r   )FP32FusedParamOutFP16FusedParamOut
Moment1Out
Moment2OutBeta1PowOutBeta2PowOutr   r   FoundInfFP32AccFusedGradFP16AccFusedGradAccStep
StopUpdater   )weight_decayr   r   r   r   r   r   r    ring_idsr   r   	acc_stepsr   r   )3r   r   r2   r3   DENSE_TENSORr   r   r   r~   appendis_distributedrr   r   r8   r9   get_rankget_world_sizert   intrv   r   ry   rN   listranger>   ru   r   ro   	enumerater1   r`   r_   rS   r6   rl   re   rf   rx   _create_global_learning_rate_create_param_lridr{   r}   rh   rg   rk   rm   rq   rn   rs   )/r   r   rR   r   r   r   r   r   master_paramsmaster_pr   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r   shard_inside_noderI   node_idnode_numr  r   r   r   local_group_ranksouter_group_ranksr   _paramsgradsr   ir   lrp_gnew_lrlamb_ops/                                                  rM   r   *DistributedFusedLamb._apply_gradients_impl  s    DA66T\\11>>> -> !AM	 !  778JK667HI77i 8 
 66Y 7 
  DA33ODH2:--D''/  * !
 ..y9!%..y9!%//
;//
;11,g1N
$(
!447 5 
 &*%A%A( &B &
" 59"1%)%A%A( &B &
" 59"122=2P%)",,q0,,-BC# ,,) - #
 44Zw4OPH"$"$H'')!!**,##224'#N!11N&!+ 	
@	
+ #^3N*
d^+,v./33@@BA:'tT%-%8!G OOG$%*"a<CMQ.3D $g.10NO! (t%6G OOG$//F^4K-1*$($/H%! ,!4):HRL1<L ())+ ,-1!-+,|tq!|,S3v;.--9!&)155a88,-&q) * A$$VVVVggMMgg %   A: D. #3"4/!2 !#3"4  /!2	
 G9 G9 XJ XJ w j\ F !- 5 -/I.J -/I.J  $m_!" {m#$ %* "__&7
T,=.6&8	5 	  $	
L [[--::<
))+Cz**3/..s3"vF+ M+   ~~&&) #3"4/!2 !#3"4  /!2	
  G9 G9 XJ XJ w j\   $m_ -/I.J  -/I.J!" {m#( '7%7&6%7&i&i (z (z" !__-$7$7#)-):):)FD%%B!& !% 2 2==(,(B(B(,(B(B $)-)D)D,0,J,J!>>'+'@'@.HO ' 7
p y .,s   +Z'Z-)rl   re   rf   rm   rg   ro   r{   rr   rn   rk   rt   r~   rp   r|   r}   rw   ru   rs   rq   rh   rv   )gMbP?g{Gz?g?g+?gư>NNNTT   Tr%   TNFNr   )__name__
__module____qualname____firstlineno__rd   r   r   r   r   r   r   r   r   r   r   r   __static_attributes____classcell__)r   s   @rM   rV   rV   x   s     %)!!%"$% #(%G)RM


 ,0t9 *+&+5m mrT   rV   )r+   r8   paddle.baser   r   paddle.base.executorr   paddle.base.frameworkr   r   paddle.base.layer_helperr   	paddle.nnr	   paddle.optimizerr
   rN   rS   rV    rT   rM   <module>r3     s9    
  ) - 6 0 * &Od	
L9 LrT   