
    x-jH                         d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d Zd	 Z G d
 de          ZdS )    N)coreunique_name)global_scope)Variable
name_scope)LayerHelper)ClipGradByGlobalNorm)	Optimizerc                 ~  
 t           j        d         
d 
                    d          D             

         }
fd|D             }|                              }t	          j        d          }|                     |dt          j        j	        j
                  }t          j                    r!|                     di d	|i||||d
           nt          j                    r!|                     di d	|i||||d
           nat          j                                        j        t          j                                        v r |                     di d	|i||||d
           |                     dd|ii t)          |          ||d                    
          d           |                     t	          j        d                    }	|                     dd	|	iddi           |                     dd|	id|	i|t          j        j        j        d           |                     dd|	id	|	i           |S )NPADDLE_TRAINER_ENDPOINTSc                 ^    g | ]*}|                                 |                                 +S  )strip).0eps     p/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/incubate/optimizer/distributed_fused_lamb.py
<listcomp>z%init_communicator.<locals>.<listcomp>   s-    
=
=
=""((**
=288::
=
=
=    ,c                 ,    g | ]}|k    |         S r   r   )r   repsranks     r   r   z%init_communicator.<locals>.<listcomp>   s"    444A!t))Q)))r   comm_idT)namepersistabletypec_gen_nccl_idOut)r   endpointother_endpointsring_idr   inputsoutputsattrsc_gen_bkcl_idc_gen_xccl_idc_comm_initX)nranksr   r"   	endpointstmp)r   fill_constantvalue   )r   r%   r&   
all_reducexout)r"   reduce_typec_sync_calc_stream)r   r$   r%   )osenvironsplitindexr   generate
create_varr   VarDescVarTypeRAWis_compiled_with_cuda	append_opis_compiled_with_xpupaddledistributedParallelEnvdevice_typedeviceget_all_custom_device_typelenjoinReduceOpSUM)blockr   ranksr"   cur_ep	other_eps
local_rankcomm_var_namecomm_id_vartmp_varr   s    `        @r   init_communicatorrT      s   
*/
0C
=
=		#
=
=
=CYF44444444IT""J(33M""4<3G3K #  K !## &
 K(""#,"	 	 	 
	
 
	
 
	
 
	
 
	"	$	$ 
 K(""#,"	 	 	 
	
 
	
 
	
 
	
 	&&((4=3355	6 	6 	 K(""#,"	 	 	 
	
 
	
 
	
 
OO[!%jj#	
 
	  
 
 
 K$8$?$?@@G	OOug&6wl     
OOW~ !-6:
 
	     
OO!W~     
 Nr   c                 N    |D ]!}|                      dd|id|id|i           "d S )N	broadcastr2   r3   r"   r#   )r@   )rL   
parametersr"   ps       r   broadcast_parametersrY   l   sV     
 
8AJ7	 	 	
 	
 	
 	

 
r   c                        e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Zd Zd Zd Zd Zd Zd Z	ddgdfdZ
ddZd Zd Zd Z xZS )DistributedFusedLambMbP?{Gz??+?ư>NT   r0   Fc                    t          j                    r
J d            t                                          |d |           || _        || _        || _        ||nd| _        |'t          |t                    s
J d            |j
        }nd}|| _        ||nd| _        |	| _        |
| _        || _        d | _        || _        || _        || _        || _        || _        | j        dk    sJ t-          d          | _        d	| _        | j        j                                        }|                    t9          j        d
          dgt<          j        j         j!                  | _"        d | _#        | j        dk    rE|                    t9          j        d          dgt<          j        j         j!                  | _$        nd | _$        i | _%        d S )Nz2DistributedFusedLamb does not support dygraph mode)learning_rate	grad_clipr           z>Only ClipGradByGlobalNorm is supported in DistributedFusedLambg      r0   distributed_fused_lambT	found_inf)r   shapedtypestop_update)&rB   in_dynamic_modesuper__init___beta1_beta2_epsilon_weight_decay
isinstancer	   	clip_norm_max_global_grad_norm
_alignment_clip_after_allreduce_is_grad_scaled_by_nranks_exclude_from_weight_decay_fn_scale_use_master_param_norm_gradient_accumulation_steps_use_master_acc_grad_nproc_per_node_use_hierarchical_allreducer   helper_supports_check_nan_infmain_programglobal_blockr;   r   r:   r   r<   r=   BOOL
_found_inf_step_stop_update_param_to_master_param)selfrc   lamb_weight_decaybeta1beta2epsilonrW   rd   exclude_from_weight_decay_fnclip_after_allreduceis_grad_scaled_by_nranks	alignmentuse_master_param_normgradient_accumulation_stepsuse_master_acc_gradnproc_per_nodeuse_hierarchical_allreducer   max_global_grad_norm
main_block	__class__s                       r   rn   zDistributedFusedLamb.__init__y   s   ( )++ 	
 	
@	
 	
+ 	}4PPP!2!>C 	  i)=>>  P > $-#6  #' %9"'0'<))"%9")A&-I*&;#,G)$7!-+E(0A5555!":;;'+$[-::<<
$//%k22#,&+ 0 
 

 
,q00 * 5 5 )-88cl*/ !6 ! !D !%D&(###r   c                 "    | j         | j         ndS )NF)r   r   s    r   _get_stop_update_varz)DistributedFusedLamb._get_stop_update_var   s    $($5$At  uLr   c                     || _         d S N)r   )r   steps     r   	_set_stepzDistributedFusedLamb._set_step   s    


r   c                 V    | j         |                     dd          | _         | j         S )Nr   int64rj   )r   _create_persistable_varr   s    r   _get_or_create_stepz(DistributedFusedLamb._get_or_create_step   s,    :55fG5LLDJzr   c                 p    |J t          |t                    s|                     |          }|| _        d S r   )rs   r   _create_scale_from_constantrz   )r   scales     r   
_set_scalezDistributedFusedLamb._set_scale   s>       %** 	<44U;;Er   c                     t          j        d          }t          j                            |dgdt          |          d          S )Nglobal_scaler0   float32T)r   ri   rj   r/   r   )r   r:   rB   staticcreate_global_varfloat)r   r/   r   s      r   r   z0DistributedFusedLamb._create_scale_from_constant   sH    #N33}..#,, / 
 
 	
r   c                 R    | j         |                     d          | _         | j         S )Ng      ?)rz   r   r   s    r   _get_or_create_scalez)DistributedFusedLamb._get_or_create_scale   s'    ;::3??DK{r   rf   r   c                 0   | j         j                                        }|t          j        |          }|                    |||dd          }| j         j                                        }|                    |j        |j        |j	        dd          }|S )NT)r   ri   rj   r   stop_gradient)
r   startup_programr   r   r:   r;   r   r   ri   rj   )r   r   ri   rj   startup_blockstartup_varr   main_vars           r   r   z,DistributedFusedLamb._create_persistable_var   s    3@@BB'--D#.. / 
 
 [-::<<
((!## ) 
 
 r   c                    |t                      }| j                            |          }|J |                    |                                          }|                                t          j        k    sJ |                    |                                          }|                                t          j        k    r0|                                |                                k    sJ |d fS |                                t          j	        k    sJ |
                                |
                                k    sJ ||fS r   )r   r   getfind_var
get_tensor_dtyperB   r   _ptrfloat16ri   )r   r   scopemaster_parammaster_param_tparam_ts         r   _get_parameterz#DistributedFusedLamb._get_parameter   s   = NNE266t<<'''55@@BB$$&&&.8888..&&1133>>v~--<<>>^%8%8%:%:::::D= >>##v~5555==??n&:&:&<&<<<<<N**r   c                 0    |                      |           d S r   )apply_gradients)r   params_gradss     r   apply_optimizez#DistributedFusedLamb.apply_optimize
  s    \*****r   c                 <   g }|D ]\  }}|                     ||g           |d         j        j                            |          5  t	          d          5  |                     |           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )Nr   	optimizer)extendrL   program_optimized_guardr   _apply_gradients_impl)r   r   	flattenedrX   gs        r   r   z$DistributedFusedLamb.apply_gradients  s0   	  	% 	%DAqaV$$$$aL&77	BB	5 	5{##	5 	5 &&|444		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5s6   BA9-B9A=	=B A=	BBBc                 >   |D ]5\  }}|j         t          j        j        j        k    s
J d            d|_        6|                     d          }|                     d          }|                     dd          }|                     dd          }g }|D ]C\  }}|                     d	          }	|	j        | j        |j        <   |	                    |	           D|                     d
          }
d|
_
        |                     d          }d|_
        |                     d          }|                     d          }|                     dd          }d|_
        |                     dd          }|                     dd          }d|_
        |                     dd          }d|_
        |                     dd          }d|_
        | j        dk    rG|                     d          g}|                     dd          g}|                     dd          g}ng }g }g }|                                 }t          j                                        }t          j                                        }| j        |}n| j        }||z  dk    s
J d            ||k    }||z  }t%          ||z            }t%          ||z            }g }| j        j                                        }|dk    rAt-          ||t/          t1          |                    d          } |	                    |            d}!|dk    rt3          |          dk    r|rt/          t1          ||z  |dz   |z                      }"t-          |||"d          } |	                    |            | j        rY||k    rSd}!t/          t1          ||z  ||                    }#t-          |||#|d         dz             } |	                    |            |                                 }$d |D             }%d |D             }&dgt3          |%          z  }'| j        /t;          |%          D ]\  }(}|                     |          rd|'|(<    |&D ]5}|                    |j        |j         |j        |j        |j                    6|dk    rtC          ||%|d                    |"                    d |%|&d!i d"|gd#|gd$|gd%|gd&|
gd'|gd(|gd)|gd*|$gd+|gd,|%d-|d.|&d/|gd0|gd1|gd2|gd3|gi| j#        |r|n||r|n||'d4d4| j$        | j%        d56           | j        j&                                        })| '                                 d }*|D ]Y}+|*| (                    |+          }*| (                    |+          },tS          |*          tS          |,          k    s
J d7            Z|*J |)"                    d8i d"|gd#|gd$|gd%|gd9|*gd&|
gd'|gd(|gd)|gd*|$gd+|gd:|%d;|&d1|gd/|gd0|gd2|g|g|g|
g|g|g|g|%|&| j*        g|||| j+        | j+        ng |gd<| j,        | j$        | j%        | j-        | j.        | j/        |||| j0        | j1        | j        | j2        |!d=6          }-|-gS )>NzOnly support dense gradientTfp32_fused_paramfp32_fused_gradfp16_fused_paramr   r   fp16_fused_gradmaster_weightmoment1moment2beta1powbeta2pow
param_infoint32fused_offsetsfp32_partial_fused_offsetsfp16_partial_fused_offsetsparam_orderr0   fp32_acc_fused_gradfp16_acc_fused_gradacc_stepr   r   z2nranks should be exactly divided by nproc_per_nodeFrf   c                     g | ]\  }}|S r   r   )r   rX   _s      r   r   z>DistributedFusedLamb._apply_gradients_impl.<locals>.<listcomp>  s    ---1!---r   c                     g | ]\  }}|S r   r   )r   r   r   s      r   r   z>DistributedFusedLamb._apply_gradients_impl.<locals>.<listcomp>  s    ,,,tq!,,,r   )r   r   rj   r   ri   distributed_fused_lamb_init)ParamGradFP32FusedParamFP32FusedGradFP16FusedParamFP16FusedGradMoment1Moment2Beta1PowBeta2PowGlobalScale	ParamInfoParamOutMasterParamOutGradOutFP32ShardFusedParamOffsetsFP16ShardFusedParamOffsetsFusedParamOffsets
ParamOrderStepre   )r   r   r+   apply_weight_decayr   r   r   r   r#   z7The learning rate for each parameter should be the samerg   LearningRater   r   )FP32FusedParamOutFP16FusedParamOut
Moment1Out
Moment2OutBeta1PowOutBeta2PowOutr   r   FoundInfFP32AccFusedGradFP16AccFusedGradAccStep
StopUpdater   )weight_decayr   r   r   r   r   r   r+   ring_idsr   r   	acc_stepsr   r   )3r   r   r<   r=   DENSE_TENSORr   r   r   r   appendis_distributedr|   r   rB   rC   get_rankget_world_sizer~   intr   r   r   rT   listrangerH   r   r   ry   	enumerater;   rj   ri   rY   r@   rv   ro   rp   r   _create_global_learning_rate_create_param_lridr   r   rr   rq   ru   rw   r{   rx   r}   ).r   r   rX   r   r   r   r   r   master_paramsmaster_pr   r   r   r   r   r   r   r   r   r   r   r   r   r   r+   r   shard_inside_noderP   node_idnode_numr	  r   r"   r   local_group_ranksouter_group_ranksr   paramsgradsr   ir   lrp_gnew_lrlamb_ops.                                                 r   r   z*DistributedFusedLamb._apply_gradients_impl  ss	     	! 	!DAq6T\1>>>>- ?>> !AMM778JKK667HII77i 8 
 
 66Y 7 
 
   	+ 	+DAq33ODDH2:-D'/  ****..y99!%..y99!%//
;;//
;;11,g1NN
$(
!447 5 
 
 &*%A%A( &B &
 &
" 59"1%)%A%A( &B &
 &
" 59"122=2PP%)",q00,,-BCC# ,,) -  #
 44Zw4OOPHH"$"$H''))!**,,#2244'#NN!1N&!+++@ ,++ #^3N*
d^+,,v.//3@@BBA::'tT%--%8%8! G OOG$$$%*"a<<CMMQ..3D. $g.10NOO! ! (t%6 G OOG$$$/ )F^4K4K-1*$($/HH% %! ,!4):HRL1<L  ((())++-----,,|,,,S3v;;.-9!&)) . .155a88 .,-&q) 	 	A$$VVgMg %     A:: DDD.  #3"4/!2 !#3"4  /!2	
 G9 G9 XJ XJ w j\ F !- 5 -/I.J -/I.J  $m_!" {m#$ % * "_&7A

T,=I..6&8	 	5 	  $	
 $	
 $	
L [-::<<
))+++ 	 	Cz**3//..s33"vvF+++M ,+++ ~~~&&) #3"4/!2 !#3"4  /!2	
  G9 G9 XJ XJ w j\   $m_ -/I.J  -/I.J!" {m#( '7%7&6%7&i&i (z (z" !_-$7$7#)-):)FD%%B! & !% 2=(,(B(,(B $)-)D,0,J!>'+'@.H O ' 7
 7
p yr   )r\   r]   r^   r_   r`   NNNTTra   Tr0   TNFNr   )__name__
__module____qualname__rn   r   r   r   r   r   r   r   r   r   r   r   __classcell__)r   s   @r   r[   r[   x   s=        %)!!%"$% #(%G) G) G) G) G) G)RM M M    
  
 
 
  
 ,0t9    *+ + + +&+ + +5 5 5m m m m m m mr   r[   )r6   rB   paddle.baser   r   paddle.base.executorr   paddle.base.frameworkr   r   paddle.base.layer_helperr   	paddle.nnr	   paddle.optimizerr
   rT   rY   r[   r   r   r   <module>r/     s    
			  ) ) ) ) ) ) ) ) - - - - - - 6 6 6 6 6 6 6 6 0 0 0 0 0 0 * * * * * * & & & & & &O O Od	
 	
 	
L L L L L9 L L L L Lr   