
    Αi                         S SK r S SKrS SKJr  S SKJr  S SKJr  S SKJ	r	J
r
  \R                  R                  r " S S5      r " S S	\5      r " S
 S\5      r " S S\5      r " S S\5      rg)    N)unique_name)wait_server_ready)core)default_main_programdefault_startup_programc                   Z    \ rS rSrSrS rS rS rS r SS jr	S r
S	 rS
 rS rS rSrg)
Collective    c                     Xl         S U l        S U l        S U l        S U l        S U l        S U l        S U l        [        R                  nUR                  5       U l        UR                  5       U l        g N)nrings	endpointscurrent_endpointother_endpointsnranksrankstartup_programmain_programr   op_proto_and_checker_makerkOpRoleAttrNameop_role_keykOpRoleVarAttrNameop_role_var_key)selfr   op_makers      h/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/transpiler/collective.py__init__Collective.__init__   si     $#	# 22#335'::<    c                    [        U[        5      (       a  UR                  S5      nXl        Uc  [	        5       U l        X l        Uc  [        5       U l        [        U5      U l        U R                  S:X  a+  U R                  S:w  a  U R                  S:w  a  [        S5      eUS:  a  [        S5      eX0l        XT;  a  [        SU[        U5      5      eX@l        XPl        U(       a'  [        U5      nUS S  nUR                  U5        Xl        X`l        U R                  R#                  5       U R                  l        U R'                  5         U R
                  R#                  5       U R
                  l        U R)                  5         g )	N,   single_process_multi_threadboxz the number of endpoints must > 1r   zrank must >= 0z current endpoint %s is not in %s)
isinstancestrsplitr   r   r   r   lenr   mode
ValueErrorr   r   r   remover   	wait_portclone_origin_program_transpile_startup_program_transpile_main_program)	r   r   r   r   r   r   r-   r   r   s	            r   	transpileCollective.transpile,   sO    i%%!,I."#:#<D ( 4 6D)nKK1		::		U"?@@!8-..	,2 I  # 0^F'lO""#34#2 "/3/C/C/I/I/K,''),0,=,=,C,C,E)$$&r    c                     [        S5      e)Nz'call the inherited method of subclasses)NotImplementedErrorr   s    r   r1   "Collective._transpile_main_programe   s    !"KLLr    c           	          [        U R                  5       HK  nU R                  U R                  U R                  U R
                  U R                  UU R                  5        MM     U R                  5         g r   )	ranger   _init_communicatorr   r   r   r   r-   _broadcast_params)r   ring_ids     r   r0   %Collective._transpile_startup_programh   s[    T[[)G##$$%%		 * 	 r    c                 p   SR                  U5      n[        U5      n	US S  n
U
R                  U5        UR                  5       nUS:X  a  U(       a  [	        U
5        UR                  5       n[
        R                  " 5       (       a  UR                  [        R                  " S5      S[
        R                  R                  R                  S9nUR                  S0 SU0SUS	US
U
U R                  [        R                   0S9  U(       d5  UR                  SSU00 SU	SUSUU R                  [        R                   0S9  g UR                  SSU00 SU	SUSUU R                  [        R                   0S9  g [
        R"                  " 5       (       a  UR                  [        R                  " S5      S[
        R                  R                  R                  S9nUR                  S0 SU0SUS	US
U
U R                  [        R                   0S9  UR                  SSU00 SU	SUSUSUU R                  [        R                   0S9  g [$        R&                  R)                  5       R*                  [$        R,                  R/                  5       ;   a  UR                  [        R                  " S5      S[
        R                  R                  R                  S9nUR                  S0 SU0SUS	US
U
U R                  [        R                   0S9  UR                  SSU00 SU	SUSUSUU R                  [        R                   0S9  g g )Nr"   r   nccl_idT)namepersistabletypec_gen_nccl_idOutr   endpointr   rB   inputsoutputsattrsc_comm_initXr   r<   c_comm_init_multitrainer	ntrainers
trainer_idbkcl_idc_gen_bkcl_idr   xccl_idc_gen_xccl_id)joinr)   r,   global_blockr   r   is_compiled_with_cuda
create_varr   generateVarDescVarTypeRAW	append_opr   OpRoleForwardis_compiled_with_xpupaddledistributedParallelEnvdevice_typedeviceget_all_custom_device_type)r   programr   r   r   r<   r-   has_multitrainerendpoints_strr   r   blocknccl_id_varbkcl_id_varxccl_id_vars                  r   r:   Collective._init_communicatort   sM    +Y#A,/0$$&19o.$$&%%''** )))4 \\))-- + K
 OO$,D 0%$$fnn		  
 $&- &!7((&..		   
 3-#V$d!7((&..		   
 &&((** )))4 \\))-- + K
 OO$,D 0%$$fnn		  
 OO"[)fDw$$fnn	   **,88}}779:  ** )))4 \\))-- + K
 OO$,D 0%$$fnn		  
 OO"[)fDw$$fnn	  ':r    c                    U R                   R                  5       nSnUR                  5        H\  nUR                  (       a  M  US-   U R                  -  nUR                  SSU0SU0SUSSU R                  [        R                  0S	9  M^     [        U R                  5       H4  nUR                  S
SW0SU0SX R                  [        R                  0S	9  M6     g )Nr#   	broadcastxoutr<   rootr   rF   c_sync_comm_streamrK   rD   )
r   rT   iter_parametersis_distributedr   r[   r   r\   r]   r9   )r   rh   r<   params       r   r;   Collective._broadcast_params   s    $$113**,E##{dkk1GOO U|wA$$fnn	  	 -  T[[)GOO)U| '+;+;V^^L	   *r    c                    U R                   UR                  ;  a  g[        UR                  5       U R                      5      nU[        [        R
                  5      -  =(       a    U[        [        R                  5      -  $ )NF)r   
attr_namesint	all_attrsr\   BackwardLoss)r   opop_roles      r   _is_loss_grad_opCollective._is_loss_grad_op  sY    2==0bllnT%5%567V__--L'C<L2LLr    c                     U R                   UR                  ;   =(       a@    [        UR                  5       U R                      5      [        [        R
                  5      -  $ r   )r   ry   rz   r{   r\   r|   r   r~   s     r   _is_backward_opCollective._is_backward_op  J    2==0 !SLLN4++,6
 6! 	!r    c                 x    SUR                   ;   =(       a%    SUR                   ;   =(       a    SUR                   ;   $ )NParamGradLearningRate)input_namesr   s     r   _is_update_opCollective._is_update_op  s5    r~~% 1"..(1"..0	
r    c                     U R                   UR                  ;   =(       a@    [        UR                  5       U R                      5      [        [        R
                  5      -  $ r   )r   ry   rz   r{   r\   Optimizer   s     r   _is_optimizer_opCollective._is_optimizer_op  r   r    )r   r   r   r   r   r   r   r   r   r   r-   N)F)__name__
__module____qualname____firstlineno____doc__r   r2   r1   r0   r:   r;   r   r   r   r   __static_attributes__ r    r   r	   r	      sA    =7'rM
!( zx6M!

!r    r	   c                   4    \ rS rSrSrS	S jrS rS rS rSr	g)
GradAllReducei#  r   c                 <    [         R                  X5        SU l        g )Ngrad_allreduce)r	   r   r*   r   r   s     r   r   GradAllReduce.__init__&  s    D)$	r    c                 D    U R                  5         U R                  5         g r   )_insert_scale_loss_grad_ops_insert_allreduce_opsr6   s    r   r1   %GradAllReduce._transpile_main_program*  s    ((*""$r    c                    U R                   R                  5       n[        [        [	        UR
                  5      5      5       H|  u  p#U R                  U5      (       d  M  UR                  UR                  S      nUR                  US-   SSU0SU0SSU R                  -  U R                  [        R                  0S9  M~     g)	
In order to keep the learning rate consistent in different numbers of
training workers, we scale the loss grad by the number of workers
r   r#   scalerK   rD         ?rF   N)r   rT   reversedlist	enumerateopsr   varsoutput_arg_names
_insert_opr   r   r\   r|   )r   rh   idxr~   loss_grad_vars        r   r   )GradAllReduce._insert_scale_loss_grad_ops.  s    
 !!..0Yuyy%9 :;GC$$R(( %

2+>+>q+A B  !G /"M2t{{!2((&// ! 	 <r    c                    U R                   R                  5       nSnS n[        [        [	        UR
                  5      5      5       GH|  u  pEU R                  U5      (       d  M  U R                  UR                  ;   d  M:  UR                  5       U R                     n[        U5      S:X  a  Mh  [        U5      S-  S:X  d   eUn[        S[        U5      S5       H  nUR                  Xh      n	UR                  XhS-         nU	R                  (       a  M;  Xt:X  a;  US-  nUR                  USSU0SU0U R                  [         R"                  0S9  US-  nUS-   U R$                  -  nUR                  US	S
U0SU0SUS[&        R(                  R*                  R,                  U R                  [         R"                  0S9  M     GM     Uc  g [	        UR
                  5       Hn  u  pEU R/                  U5      (       d  M  [        U R$                  5       H8  nUR                  XB-   SSU0SU0SUU R                  [         R"                  0S9  M:       g    g )Nrn   r      r#   c_sync_calc_streamrK   rD   rF   
all_reducerp   rq   r<   reduce_typers   )r   rT   r   r   r   r   r   r   ry   r{   r)   r9   r   ru   r   r   r\   r|   r   r_   r`   ReduceOpSUMr   )
r   rh   r<   gradr   r~   op_role_varoffsetirv   s
             r   r   #GradAllReduce._insert_allreduce_opsB  s6   !!..0Yuyy%9 :;GC$$R((((BMM9 llnT-A-AB{#q(;'!+q000q#k"2A6A!JJ{~6E ::ka%&89D++ }!(("!5$';%*DM#'#3#3V__"E )  !  '{dkk9G$$) #T{!&%w)6+=+=+F+F+J+J ,,foo % 
) 7 <X < +GC$$R(($T[[1G$$1 #T{!&%w ,,foo % 	  2  ,r    )r*   Nr   )
r   r   r   r   r   r   r1   r   r   r   r   r    r   r   r   #  s    %%(@r    r   c                   4    \ rS rSrSrS	S jrS rS rS rSr	g)
LocalSGDi  r   c                 J    [         R                  X5        SU l        SU l        g )Nz	@SNAPSHOT	local_sgd)r	   r   snapshot_keyr*   r   s     r   r   LocalSGD.__init__  s    D)'	r    c                    [         R                  U 5        U R                  R                  5       n/ nUR	                  5        H'  nUR
                  (       a  M  UR                  U5        M)     U Hj  nUR                  U R                  UR                  5      UR                  SSS9nUR                  SSU/0SU/0U R                  [        R                  0S9  Ml     g )NT)r@   shaperA   stop_gradientassignrK   rD   rF   )r	   r0   r   rT   rt   ru   appendrV   snapshot_namer@   r   r[   r   r\   r]   )r   rh   non_dist_paramsrv   snapshots        r   r0   #LocalSGD._transpile_startup_program  s    --d3$$113**,E'''&&u- - %E''''

3kk "	 ( H OOeW~
+''8	   %r    c                     XR                   -   $ r   )r   )r   
param_names     r   r   LocalSGD.snapshot_name  s    ----r    c                    U R                   R                  5       n/ nSn[        [        [	        UR
                  5      5      5       GH~  u  pEU R                  U5      (       d  M  UR                  UR                  S5      S      nUR                  (       a  MR  UR                  U R                  UR                  5      UR                  SSUR                  S9nUR                  US-   SU/U/S.S	U/0U R                   ["        R$                  0S
9  UR                  US-   SSU0S	U0U R                   ["        R$                  0S
9  US-   U R&                  -  nUR                  US-   SSU/0SU/0SUS[(        R*                  R,                  R.                  U R                   ["        R$                  0S
9  UR1                  Xg45        GM     [3        U R&                  5       H4  nUR5                  SSW0S	U0SX0R                   ["        R$                  0S
9  M6     [        U5       H  nUS   nUS   nUR5                  SSU/0S	U/0SSU R6                  -  U R                   ["        R$                  0S
9  UR5                  SU/U/S.S	U/0U R                   ["        R$                  0S
9  UR5                  SSU/0S	U/0U R                   ["        R$                  0S
9  M     g )Nrn   r   r   T)r@   r   rA   r   dtyper#   elementwise_sub)rK   YrD   rF   r   r   rK      r   rp   rq   r<   r   rs   r   r   r   )r   rT   r   r   r   r   r   r   inputru   rV   r   r@   r   r   r   r   r\   r   r   r_   r`   r   r   r   r9   r[   r   )	r   rh   ordered_param_snapshotr<   r   r~   rv   r   param_snapshots	            r   r1    LocalSGD._transpile_main_program  s   !!..0!#Yuyy%9 :;GC!!"%%

288G#4Q#78'' ++++EJJ7++ $"&++ ,    !G*"*5':"UG,++V__= !    !G-<"EN++V__= !  #Q;$++5  !G%%>"UG,!7%v'9'9'B'B'F'F((&// ! 
 '--u.?@S <V T[[)GOO)U| '+;+;V__M	   * ''=>N"1%E%a(HOOeW~(S4;;.$$foo	   OO&&Zug6(''9	   OOeW~
+''9	  % ?r    )r*   r   Nr   )
r   r   r   r   r   r   r0   r   r1   r   r   r    r   r   r     s     
..Nr    r   c                   <    \ rS rSrSrS rS rS rS rS r	S r
S	rg
)SingleProcessMultiThreadi  z"
single process multi thread mode
c                 :   [         R                  U S5        SU l        [        [        R
                  " SS5      5      U l        [        [        R
                  " SS5      5      U l        [        [        R
                  " SS5      R                  S5      5      U l
        g )	Nr#   r$   PADDLE_FUSE_ALLREDUCE1PADDLE_LOSS_SCALEFLAGS_selected_gpusz0,1,2,3,4,5,6,7r"   )r   r   r*   rz   osgetenvfuse_allreduce
loss_scaler)   r(   gpu_numsr6   s    r   r   !SingleProcessMultiThread.__init__  sq    tQ'1	!")),CS"IJbii(;SABII+->?EEcJ
r    c           
         Sn[        U R                  5      S:  a7  [        U R                   Vs1 s H  o"R                  S5      S   iM     sn5      nUS:  a  Xl        [	        S5        [	        SU R
                  5        [	        SU R                  5        [	        SU R                   SU R                   35        [        U R                  5       HL  nU R                  U R                  U R
                  U R                  U R                  UU R                  S	5        MN     g SU l        [	        S
5        U R                  R                  5       nUR                  SSS0S9  g s  snf )Nr   r#   :2begin to _transpile_startup_program for multi-nodecurrent_endpoint: total endpoints: rank: , ring_id: T3begin to _transpile_startup_program for single-nodecomm_init_allr<   rB   rI   )r)   r   r(   r   printr   r   r   r9   r:   r   r-   rT   r[   )r   	nodes_numrp   r<   rh   s        r   r0   3SingleProcessMultiThread._transpile_startup_program  s   	t~~"dnnEnWWS\!_nEFIq=#KFG&(=(=>%t~~6F499+[>? -''(())NNIINN . DKGH((557EOOAOG- Fs   E c                 "   U R                  5       nU R                  S:X  a  US:X  a  g U R                  (       a  U R                  U5        US:X  a  g U R                  S:  a  [	        SU 35        U R                  5         g U R                  5         g )Nr   z(begin used fuse_allreduce param count = )_get_update_param_countr   r   r   r   _insert_fuse_allreduce_opsr   )r   	param_cnts     r   r1   0SingleProcessMultiThread._transpile_main_program!  sz    002	??aIN??,,Y7>"<YKHI++-&&(r    c                    SnU R                   R                  5       n[        [        [	        UR
                  5      5      5       H  u  p4U R                  U5      (       d  M  U R                  UR                  ;  a  M9  UR                  5       U R                     n[        U5      S:X  a  Mg  [        U5      S-  S:X  d   e[        S[        U5      S5       H,  nUR                  XV      nUR                  (       a  M'  US-   nM.     M     U$ )z
get need update param count
r   r   r#   )r   rT   r   r   r   r   r   r   ry   r{   r)   r9   r   ru   )r   param_countrh   r   r~   r   r   rv   s           r   r   0SingleProcessMultiThread._get_update_param_count4  s     !!..0Yuyy%9 :;GC''++##2==8,,.)=)=>K;1${#a'1,,,1c+.2

;>2'')Ao	 3 <  r    c                    US:  a  SU R                   -  U R                  -  nOSU R                  -  n[        SU 35        U R                  R	                  5       n[        [        [        UR                  5      5      5       Hn  u  pEU R                  U5      (       d  M  UR                  UR                  S      nUR                  US-   SSU0SU0SX R                  [        R                  0S9  Mp     g	)
r   r   r   z*begin _insert_scale_loss_grad_ops scale = r#   r   rK   rD   rF   N)r   r   r   r   rT   r   r   r   r   r   r   r   r   r   r\   r|   )r   r   r   rh   r   r~   r   s          r   r   4SingleProcessMultiThread._insert_scale_loss_grad_opsL  s    
 q=$++%5E$--'E:5'BC!!..0Yuyy%9 :;GC((,,!JJr':':1'=>Ma]+.'7'7I  	 <r    c                    U R                   R                  5       nSnSn/ nSn[        [        [	        UR
                  5      5      5       H  u  pgU R                  U5      (       d  M  U R                  UR                  ;   d  M9  UR                  5       U R                     n[        U5      S:X  a  Mg  [        U5      S-  S:X  d   eUn	[        S[        U5      S5       Ha  n
UR                  X      nUR                  XS-         nUR                  (       a  M;  X:X  d  MB  UR                  U5        [        XYS-   5      nMc     M     Uc  gU R                   S:X  a  UR#                  USSUS   0SUS   0U R$                  [&        R(                  0S	9  US-  nUS-   U R*                  -  nUR#                  US
SU0SU0SX R$                  [&        R(                  0S	9  US-  nUR#                  USSUS   0SUS   0SX R$                  [&        R(                  0S	9  US-  ngUnUR-                  SS/S[.        R0                  R2                  R4                  SS9nSS[.        R0                  R2                  R4                  S.nUR#                  USSU0XS.US	9  US-  nUR#                  USSU0SU0U R$                  [&        R(                  0S	9  US-  nUS-   U R*                  -  nUR#                  USSU0SU0SUU R$                  [&        R(                  S[6        R8                  R:                  R<                  0S	9  US-  nUR#                  USSU0SU0SX R$                  [&        R(                  0S	9  US-  ng)+
insert coalesce_tensor and all reduce ops
rn   Nr   r   r#   r   rK   rD   rF   c_allreduce_xsumr<   rs   fused_outputFTr@   r   rA   r   r   )	copy_dataset_constantr   coalesce_tensorInputOutputFusedOutputr   rp   rq   r   )r   rT   r   r   r   r   r   r   ry   r{   r)   r9   r   ru   r   maxr   r   r   r\   r|   r   rV   r   rX   rY   FP32r_   r`   r   r   )r   rh   r<   r   input_gradsglobal_offsetr   r~   r   r   r   rv   output_gradsr   coalesce_tensor_attrss                  r   r   3SingleProcessMultiThread._insert_fuse_allreduce_opsc  s    !!..0Yuyy%9 :;GC$$R((((BMM9 llnT-A-AB{#q(;'!+q000q#k"2A6A!JJ{~6E ::ka%&89D++ }#**40(+MA:(F 7 <$ <!#)[^,A/''9   QM{dkk1G'[), '+;+;V__M   QM)[^,A/ '+;+;V__M   QM 'L ++#c!ll**//" , L " %--22%!
 &-#/M+   QM)\*-''9   QM{dkk1G!\*-w$$foo!6#5#5#>#>#B#B  
 QM )\*- '+;+;V__M   QMr    )r   r   r   r*   r   N)r   r   r   r   r   r   r0   r1   r   r   r   r   r   r    r   r   r     s'    
H6)&0.rr    r   c                   @    \ rS rSrSrSS jrS rS rS rS r	S r
S	rg
)MultiThreadi  r   c                     [         R                  X5        SU l        X l        SU l        [
        R                  " SS5      R                  S5      n[        U5      U l	        g )Nr%      r   z0,1,2,3,4,5,6,7,8r"   )
r   r   r*   
trans_modefuse_grad_size_in_numr   r   r(   r)   gpu_num)r   r   r  r   s       r   r   MultiThread.__init__  sQ    t,	$%("9924GHNN
 8}r    c                 .   [        U R                  5      S:  a  [        S5        [        SU R                  5        [        SU R                  5        [        SU R                   SU R
                   35        [        U R
                  5       HL  nU R                  U R                  U R                  U R                  U R                  UU R                  S5        MN     g SU R                  ;   as  [        S	5        U R                  R                  5       nUR                  S
[        [        [        [         R"                  " S5      R%                  S5      5      5      SS.S9  g [        S5        U R                  R                  5       nUR                  S
SS0S9  g )Nr#   r   r   r   r   r   Txpuz:begin to _transpile_startup_program for single-node in XPUr   r   r"   r   )devicesr<   r   r   r<   )r)   r   r   r   r   r   r9   r:   r   r-   r  rT   r[   r   maprz   r   r   r(   )r   r<   rh   s      r   r0   &MultiThread._transpile_startup_program  sQ   t~~"FG&(=(=>%t~~6F499+[>? -''(())NNIINN . 'P ,,99;(#' #RYY/D%E%K%KC%P$
 $%   
 KL,,99;_YNKr    c                    U R                  5         U R                  S:X  aJ  [        S5        U R                  U R                  -  U l        U R                  5         U R                  5         g U R                  S:X  a  [        S5        U R                  5         g U R                  S:X  a>  [        [        R                  " S5      R                  S5      5      S:X  a  [        S	5        g [        S
5        U R                  5         g )N
all_gatherz%begin to transpile in all-gather modefuse_all_reducez*begin to transpile in fuse all-reduce modeall_reduce_xpur   r"   r#   zHskip transpile in all-reduce-xpu mode when number of devices is only onez%begin to transpile in all-reduce mode)r   r  r   r   r  allgather_ranks_insert_allgather_ops_update_adam_opsr   r)   r   r   r(   r   r6   s    r   r1   #MultiThread._transpile_main_program  s    ((*??l*9:#';;#=D &&(!!#__ 11>?++-OO//BII34::3?@AEZ 9:&&(r    c                 .   U R                   R                  5       nSnSn[        [        [	        UR
                  5      5      5       GH  u  pEU R                  U5      (       d  M  U R                  UR                  ;   d  M:  UR                  5       U R                     n[        U5      S:X  a  Mh  [        U5      S-  S:X  d   eUn[        S[        U5      S5       GH'  nUR                  Xh      n	UR                  Xh   S-   U R                  /[        U	R                  5      QS[         R"                  R$                  R&                  SS9n
UR                  XhS	-         nU	R(                  (       a  M  Xt:X  a;  US	-  nUR+                  US
SU0SU0U R,                  [.        R0                  0S9  US	-  nUS	-   U R2                  -  nUR+                  USSU0SU
0SU R                  SUU R,                  [.        R0                  0S9  GM*     GM     Uc  g[	        UR
                  5       Hn  u  pEU R5                  U5      (       d  M  [        U R2                  5       H8  nUR+                  XB-   SSU0SU0SUU R,                  [.        R0                  0S9  M:       g   g)z)
insert allgather op to the main_program
rn   Nr   r   
_allgatherFTr  r#   r   rK   rD   rF   r  rp   rq   r   r<   rs   )r   rT   r   r   r   r   r   r   ry   r{   r)   r9   r   rV   r!  r   r   rX   rY   r
  ru   r   r   r\   r|   r   r   )r   rh   r<   r   r   r~   r   r   r   rv   new_grad_vars              r   r"  !MultiThread._insert_allgather_ops!  s    !!..0Yuyy%9 :;GC$$R((((BMM9 llnT-A-AB{#q(;'!+q000q#k"2A6A!JJ{~6E#(#3#3(^l:#33Hd5;;6GH$)"ll2277&* $4 $L !::ka%&89D++ }!(("!5$';%*DM#'#3#3V__"E )  !  '{dkk9G$$) #T{!& 5$d&:&:%w ,,foo % 
7 7 <d < +GC$$R(($T[[1G$$1 #T{!&%w ,,foo % 	  2  ,r    c                    U R                   R                  5       n[        [        [	        UR
                  5      5      5       GH@  u  p#U R                  U5      (       d  M  UnUR                  S:w  a  UR                  S:w  a  MB  UR                  S5      S   nUR                  UR                  S5      S      UR                  UR                  S5      S      UR                  UR                  S5      S      UR                  UR                  S5      S      UR                  UR                  S5      S      UR                  UR                  S	5      S      S
.nUR                  UR                  S5      S      UR                  UR                  S5      S      UR                  UR                  S5      S      UR                  UR                  S5      S      UR                  UR                  S5      S      S.nUR                  S5      UR                  S5      UR                  S5      UR                  S5      UR                  S5      S.n[        U R                  5       V	s/ s Hp  n	UR                  US-   [        U	5      -   UR                  UR                  S5      S      R                   S["        R$                  R&                  R(                  SS9PMr     n
n	UR+                  USSUR                  UR                  S5      S   S-      0SU
0U R                  SS.S 9  US!-  n[        U R                  5       H,  n	X   US"'   UR+                  UUR                  UUUS 9  US!-  nM.     UR-                  U5        GMC     g#s  sn	f )$z3
remove the original adam op, and add new adam ops
adamlambr   r   r   Moment1Moment2Beta1PowBeta2Pow)r   r   r,  r-  r.  r/  ParamOut
Moment1Out
Moment2OutBeta1PowOutBeta2PowOut)r0  r1  r2  r3  r4  epsilonbeta1beta2	lazy_modemin_row_size_to_use_multithread)r5  r6  r7  r8  r9  _FTr  r(   rK   r&  rD   )numaxisrF   r#   r   N)r   rT   r   r   r   r   r   rB   r   r   outputattrr9   r!  rV   r'   r   r   rX   rY   r
  r   
_remove_op)r   rh   r   r~   r   r   rG   rH   rI   r   
split_varss              r   r#  MultiThread._update_adam_opsl  sG    !!..0Yuyy%9 :;GC$$R((GGv%"''V*;XXg.q1
"ZZ(9!(<=$)JJrxx/G/J$K$zz"((9*=a*@A$zz"((9*=a*@A %

288J+?+B C %

288J+?+B C !&

299Z+@+C D"'**RYY|-DQ-G"H"'**RYY|-DQ-G"H#(::bii.Fq.I#J#(::bii.Fq.I#J  "wwy1WWW-WWW-!#!579ww98" #4#7#78	 9 $$'#-A6#jj'):1)=>DD$)"ll2277&* %  9  	    UZZ(9!(<|(KL #J/"&"6"6B !  !t334A%/]F6N$$WW% '# %  aKF 5   (A <@	s   (A7Nc                    U R                   R                  5       nSU R                  -  nSn/ n[        UR                  5       H  nU R                  U5      (       d  M  U R                  UR                  ;   d  M7  UR                  5       U R                     n[        U5      S:X  a  Me  [        U5      S-  S:X  d   S5       e[        S[        U5      S5       HT  nXg   nUR                  U5      n	XgS-      n
UR                  U
5      nU	R                  (       a  MC  UR                  U5        MV     M     Uc  g/ nSnU Hr  n[        U5      S:X  d,  [        US   5      U R                  :X  d  UR                  U:w  a   UR                  U/5        UR                  nM^  US   R                  U5        Mt     / n[!        UR                  5       H  u  pU R#                  U5      (       d  M  U H  nUR%                  [&        R(                  " SUS   R*                   35      US   R                  SS	S
9nUR                  U5        UR-                  USSU0UUS.SS	SS	SUS   R                  U R.                  [0        R2                  0S9  M       O   [!        UR                  5       H  u  pU R#                  U5      (       d  M  U H  nUR-                  USSU0SU0SUS[4        R6                  R8                  R:                  U R.                  [0        R2                  0S9  UR-                  USSU0SU0U R.                  [0        R2                  0S9  M       O   [        U5      S:X  a  UR=                  5         g[!        UR                  5       HV  u  pU R#                  U5      (       d  M  UR-                  USSUS   0SUS   0SUU R.                  [0        R2                  0S9    O   UR=                  5         g)r   r   Nr   zRvars need to be one param var followed by one grad var, but got odd number of varsr#   rn   FusedOutput_FT)r@   r   rA   r   r  r  r  r  	use_alignr   rF   r   rp   rq   r<   r   r   rK   rD   rs   )r   rT   r   r   r   r   r   ry   r{   r)   r9   varru   r   r  r   r   r   rV   r   rW   r@   r   r   r\   r|   r_   r`   r   r   _sync_with_cpp)r   rh   r<   r   param_gradsr~   r   r   r   rv   	grad_namesegments
last_dtyperE  
fused_varsr   segmenttmp_var	fused_vars                      r   r   &MultiThread._insert_fuse_allreduce_ops  s    !!..0dkk/599%B$$R((((BMM9 llnT-A-AB{#q(;'!+q0 10 q#k"2A6A!,J!IIj1E +E 2I 99Y/D++ &&t, 7 &( <
CH"x|$(B(BB99
*& YY
##C(  
 +GC$$R(('G#..(11*71:??*;< &aj..$)&* / G %%g.$$. '1+27 K''#WQZ%5%5 ,,foo	 %   (. 3 ,8 !+GC$$R((!+I$$) #Y/!&	 2%w)6+=+=+F+F+J+J ,,foo % 
 $$1 #Y/!&	 2#//A %  ",& + ,. z?a  " !+GC$$R((  -A/"JqM2!7((&// ! 	  , 	r    )r!  r  r  r*   r  N)r#   r  )r   r   r   r   r   r   r0   r1   r"  r#  r   r   r   r    r   r  r    s*    %%LN)*IVF)Ptr    r  )r   r_   paddle.baser   5paddle.distributed.fleet.base.private_helper_functionr   paddle.frameworkr   paddle.staticr   r   r   r\   r	   r   r   r   r  r   r    r   <module>rT     sv    
  # " G		(	(	/	/D! D!N_J _Dpz pf]} ]@P- Pr    