
    x-j%                     8   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlZ	d dl
Z
d dlmZ d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ d dlm Z  d dl!m"Z# d dl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddZ,d Z- G d de%          Z.dS )    N)OrderedDict)reduce)product)core)no_grad)pir)fleet)new_process_group)get_1D_sub_process_mesh)
split_mesh)OpRole)alignget_current_device_type)AutoParallelStreamType)_current_expected_place_)	Optimizer   )_dtensor_from_local)copy_op_attr_with_new_member)Strategyc                 B   d}|| j         }|D ]@}t          |t          j                  r$|dk    s
J d            |                                }Ad }t          | j                  D ]}||k    rt          j        |          } nt          j        |          }||||<   |S )NzUThe parameter can't be shard twice with sharding strategy even in different mesh now.)	
placements
isinstancedistShardget_dimrangendimcopydeepcopy)paramsharding_axisparam_placements
shard_axis	placementplacement_with_shardingdimnew_placementss           i/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/distributed/auto_parallel/sharding.pyget_placement_with_shardingr+   1   s    J +% - -	i,, 	- ###g $## #**,,J"UZ    *&*joo#E  ]#344N*(?}%    c                    || j         v sJ | j                             |          }g }| j        D ]$}|                    t	          |                     %dg||<   g }t          | D ]{}g }t	          d|                     |                    D ]>}g |d|         |||dz   d          R }	|                    | j        |	                    ?|                    |           ||S )Nr   r   )	dim_namesindex_shapeappendr   r   get_dim_sizemesh)
r3   	axis_name
axis_indexrangesdim_num
all_resultxresulticoords
             r*   get_mesh_comm_listr=   L   s   &&&&%%i00JF; & &eGnn%%%%F:Jf " "q$++I6677 	, 	,A?a*o?q?1Z!^-=-=+>??EMM$)E*++++&!!!!r,   c                       e Zd ZdZddZd Zd Zd Zd Zd Z		 dd
Z
d Zd Z e            d             Zd Z e            d             Zd Zd Z e            d             Zd Zd Zd Zd Zd Zd ZdS )ShardingOptimizerStage1z4
    .. ZeRO: https://arxiv.org/abs/1910.02054

    Nc                 f   |
J d            t          |t          j        j        t          j        j        f          s
J d            || j        d<   || _        |pt                      | _        g | _	        d | _
        t          j                     | j        j        t          j                                        }n| j        j        }t!          |d          }|D ]:}t#          t%          |                    }t          j                    |v r|| _        ;d | _        d|j        v rMt!          |d          }|D ]:}t#          t%          |                    }t          j                    |v r|| _        ;t/                      | _        d|j        v r|                    dt          j                              }	t7          d|                    d                    D ]1}
| j                            |                    d|
                     2|                    d|	          }n| j                            |           |j                            d          | _         |j!        | j                  | _"        d	| _#        d
| _$        d|j        v r|j                            d          | _#        |j!        | j#                 | _$        t/                      }| j        D ]E}|                    |           tK          || j#                  D ]}|                    |           F|| _        t          j&                     d S )Nz)The argument `optimizer` cannot be empty.zR`paddle.distributed.ShardOptimizer` only supports AdamW and SGD optimizer for now.
_inner_optdpmpppr   )r/   r   r   )global_meshsub_mesh_dim)'r   paddle	optimizerAdamWSGD__dict__	_shard_fnr   	_strategy_slice_param_group_info_dy_shard_groupenable_static_meshr   auto_parallelget_meshr=   r
   sortedget_rank_sharding_group	_mp_group
_dim_namesset	pp_meshesr.   get_rank_by_dim_and_process_idr   r2   addget_mesh_with_dimr/   _sharding_axisr0   _sharding_degree_mp_mesh_axis
_mp_degreer   disable_static)selfrH   shard_fnstrategyr3   	dp_groupsgroup
comm_group	mp_groupspp_rankidxrZ   pp_meshsub_pp_meshs                 r*   __init__z ShardingOptimizerStage1.__init__e   s(   $$7 %$$ (.0@0DE
 
 	
 	
 a	
 	
 

 '0l#!!/XZZ')$#>'%..00DD>'D&tT22	 	2 	2E*6%==99J}%'''1$4?""*466I" 0 0.ve}}==
=??e++%/DN4>!!99$PPGQ 1 1$ 7 788 L L""4#9#9$c#9#J#JKKKK))$g)>>DDNt$$$"o33D99 $D,? @4?""!%!6!6t!<!<D"k$*<=DOI> / /g&&&#- 'd6H$ $ $ / /K MM+..../ 'DNr,   c                    t                      }t          |t          j        j                  r@t          j                            t          j                                        j                  }t          j        j	        
                                | _        | j                            |           | j        j        j        }|dk     rd}i }i }d}d}g }|D ]\  }	}
|
	|	                                }|
                                }|J d|	j         d|	 d            |J d|	j         d|
 d            |j        |j        k    sJ d	|	j         d
|	 d|
 d            | j        |j        vr3|                    |	|
f           |	j        
ddi|	_        n
d|	j        d<   |	j        
ddi|	_        n
d|	j        d<   |j        | j        v s#J d|	j         d|j         d| j         d            t1          j                    |j        j        v rct7          |j        | j                  }t9          |j                  | j        j        k    s'J d|	j         d|j         d| j        j                     |j        t?                      k    sJ d|	j         d|	 d            |j         |j         k    sJ d|	j         d
|	 d|
 d            |	j!        |
j!        k    sJ d|	j         d
|	 d|
 d            |	j"        |
j"        k    sJ d|	j         d
|	 d|
 d            | j#        dk    r| j$        |j         v r
d|	_%        d}n	d|	_%        d}|&                    |j        g                               |	           |&                    |j        g                               |
           t          j'        (                                }|)                                }|j*        d         }|dz  dz  }g }|+                                D ]\  }}||         }tY                      }t[          j.        |||g          }t1          j                    |j        v r| /                    ||           ta          |          D ]\  }}g }g }|D ]l}|&                    |g                               ||         j                   |                    ||                    |                    ||                    m| j        j        j1        r| 2                    ||           | 3                    ||          \  }}}} |d         j4        }!tj          j6        j7        j8        ts                               tt          |d         j4                 z  }"|"| j;        z  ty          j=        |!          z  ty          j=        |d         j4                  z  }"t1          j                    |j        v r| >                    ||||"           | j        j        j?        st          j@        A                    ||!ddddd|"dg g           \  }#}$| j        jB        jC        s|D ]	}
d|
_D        
d|$_D        t          j-        E                    |$F                                | j"                  }%|$G                    t[          jH        |%|$                                                     n5d }&d }'|D ]<}
|
I                                }(|j*        J                    |(          }|'||'k     r|}'|(}&=t[          jK        |&           t          j@        L                    | j"        |!| j                  }$t[          jM        |dgi           })|$G                    t[          jH        |$F                                |)                     |$I                                N                    d          }*|*G                    t[          jH        |*F                                |)                     d}+|D ]}
|
I                                }(t          jP        |
j"                  },t[          jK        |(           t          j@        Q                    |$|+|+|,z             }-t          j@        R                    |-|
j"                  }-t[          jS        |(           t          j@        T                    |
|-g           |+|,ty          j=        |!          z  |"z   dz
  |"z  |"z  ty          j=        |!          z  z  }+| j        j        j1        st[          jU                     |$j"        d         | j;        z  }.| j        j        J                    t1          j                              }/|/|.z  }0|0|.z   }1t          j@        Q                    |$|0|1          }2t          j@        V                    |$| j        jW        | j;                  }3| j        j        j1        r6|3I                                X                    t          jZ        j[                   t[          jU                     t          j@        T                    |2|3g           g }4|+                                D ]\  }5}6|4                    |5           |                    |4|| f           |+                                D ]y\  }5}6|6\  }}7}8t          j@        Q                    |3|7|8          }9||                                         j\        }:|:]                    | j                   t[          jM        |9j        dg|:          };|9G                    t[          jH        |9F                                |;                     |9I                                j        ^                                }<|<d         _                                |<d<   t[          jM        |9j        dg|:          |<d<   t          |9I                                j        |<          |9I                                _        |                    |5|9f           {| ja        jb        _d| ja        jb        _c        | j        | ja        jb        _d        | je        | ja        jb        _f        || ja        jb        _g        || ja        jb        _h        | ja        i                    |           t[          jU                     |D ]\  }4}=}>| j        j        j1        rd }?d }|4d         j                                D ](}@|j*        J                    |@          }A|?|A|?k    r|A}?|@})t          j@        k                    |^                                d                   }B|BI                                X                    t          jZ        j[                   t          j@        l                    |=| j        jW        | j;                  }C|CI                                X                    t          jZ        j[                   n0t          j@        l                    |=| j        jW        | j;                  }Ct          j@        T                    |>|Cg           |j*        J                    |          dz   }D|j*        |Dd          S )Nr      Fz5parameter dist attribute must not None. but received z : .z4gradient dist attribute must not None. but received z grad : zDParameter and grad should have same process_mesh. but received name:z, parameter:z, grad: 	no_fusionTzAparameter mesh mush be in pp_meshes. but received parameter name:z, mesh:z, pp_meshes: z? all parameter must have the same sharding group. but received z sharding group is : z, global sharding group is: z?Sharding fusion do not support partial parameter. but received zDParameter and grad should have same dims_mapping. but received name:zDParameter and grad should have same global shape. but received name:zCParameter and grad should have same local shape. but received name:r   r   i           )new_results)m_get_devicer   rG   	framework	CUDAPlacedistributedParallelEnvdev_idbase	libpaddlePlace_place	set_placerM   shardingcomm_buffer_size_MB	dist_attrnameprocess_meshr^   partial_dimsr1   optimize_attrrZ   r   rU   process_idsr   rT   rV   ranksrY   dims_mappingshape_local_shapera   r`   is_distributed
setdefaultstaticdefault_main_programglobal_blockopsitemsr   r   assign_value_group_by_size_cache_slice_param_group_info	enumerateenable_overlap_reduce_scatter_overlap_fuse_group_paramdtyper	   utilstensor_fusion_helper	alignmentr   r   r_   r   size_of_dtype!_cache_slice_param_range_and_sizerelease_gradients_C_opscoalesce_tensor_pipelineenablepersistablecreate_shaped_typetypeset_typecvt_to_dist_typeget_defining_opr/   set_insertion_pointemptycreate_tensor_dist_attributeoperand_sourcenpprod
view_slice
view_shapeset_insertion_point_after	share_varreset_insertion_point_to_endreduce_scatteridset_execution_streamr   SHARDING_STREAMvaluepartial_statuspopresultsas_tensor_dist_attrr   rA   
_grad_clipshould_comm_on_shard_dimsharding_grouprW   mp_grouphas_dist_paramhas_not_dist_paramapply_gradientsall_used_opsnop
all_gather)Erc   params_gradsplacer   parameters_dict
grads_dictr   r   new_params_gradsr"   gradparam_dist_attrgrad_dist_attrsub_meshmain_programtarget_blocklast_op
group_sizeall_gather_param_info_listr3   
parametersgrads
var_groupsgroup_indices	group_idxindicesgroup_param_listgroup_grad_listr/   slice_param_dictpadded_size_dictmain_shard_fused_parammain_fused_paramr   
align_size_
fused_grad
fused_typefirst_grad_opfirst_indexgrad_opr   prev_var
grad_beginsizegrad_buffer
shard_sizerank
rank_beginrank_endview_shard_fused_gradshard_fused_gradslice_param_listslice_param
param_infoparam_begin	param_end
slice_gradpartail_statusslice_grad_dist_attrslice_grad_out_dist_attrshard_paramfused_paramlast_idxoprk   tmpallgather_valuestart_indexsE                                                                        r*   r   z'ShardingOptimizerStage1.apply_gradients   sW   eV-788 	$.."..007 E k+1133e$$$"n5I"""%
"' H	Q H	QKE4|#oo//O!^^--N".._
__W\___ /.. "--buzbb[_bbb .--  ,0KKKK FW\Wa  F  Fot  F  F  C  F  F  F LKK
 ".*EEE ''666&.+6*=E''7;E'4&.+6*>E''7<E'4"/4>AAA dTYT^  d  dgv  hD  d  d  SW  Sa  d  d  d BAA }/">"JJJ2#0$2E  8/00D4H4NNNN FV[V`  F  Fw  xL  F  F  jn  j~  jD  F  F ONN
 #/355888iRWR\iiafiii 988  ,0KKKK FW\Wa  F  Fot  F  F  C  F  F  F LKK ;$*,,, FW\Wa  F  Fot  F  F  C  F  F  F -,, %):::: EV[V`  E  Ens  E  E  ~B  E  E  E ;::
 !##&/*FFF'+$!% (-$%)"&&'CRHHOO   !!/">CCJJ4PPPP}99;;#0022"2&(4/$6
%'" / 5 5 7 7 G	G G	GD*t$E$J:Z4 M }$"22222:}MMM&/&>&> }G }G"	7#% "$$ 9 9E)))R88??"5).   %++Ju,=>>>#**5<8888>*9 P00,OOO **96FGG$$*$ (*0K4>/11 -a0678  +,(//0 )*:1*=*CDDE  =??d&666::!(("	   ~.@ H$*M$B$B'"% %MAz  >29 4$3 4 4D/3D,,-1J*!'!>!>"))+;+H" "J '',Z9M9M9O9OPP    %)M"&K / 4 4"&"6"6"8"8 , 0 6 6w ? ?&.%+2E2E*/K,3M+M:::!'!4!4(5" "J
 !$ @tR P PI'',Z__->->	JJ    *99;;JJ1MMH%%,X]]__iHH   "#J /  "&"6"6"8"8!wt'899/888&,m&>&>&
J4E' ' '-m&>&>'):' ' 5g>>>//{0CDDD" %)4+=e+D+D$D&0%1&'%( $.!. ))  $1%88	9

 ~.= 74666'4Q74;PP
+177HH!J.
%
2(.(@(@
H) )% $*=#?#? 4 79N$ $  >*9 $4466KK.>D   0222''*,<=   $& /?/E/E/G/G 9 9+K$++K8888*11(.(   0@/E/E/G/G $G $G+K4>1E;	!'!9!9(+y" "J (.88::I # #&&t':;;;+.+K"/"~, ,( '',&OO--/C    #2244>FFHH - 3K3))++ -Q/ 8&3bT>  -Q/ 5&6688B(@   ..00: %++[*,EFFFFI$Gs}G~ ?%1BFDO&?8<8LDO&526.DO&/8FDO&5<NDO&9''(8999(***
 (		D 	D 
~&5 *2.;;== % %B&*0044C'3>>#&"$ m''(9(9!(<==##%%::*:@   #)-":":!5!8$:O# #  //11FF*:@    #)-":":!5!8$:O# # M##[/$BCCCC"&,,W559--r,   c                    d t          t          |                    D             | _        t          |          D ]\  }}|D ]}||         }i | j        |         |j        <   |j        | j        |         |j                 d<   d| j        |         |j                 d<   d| j        |         |j                 d<   |j        | j        |         |j                 d<   |j        | j        |         |j                 d<   d S )Nc                     g | ]}i S  r  .0r   s     r*   
<listcomp>zIShardingOptimizerStage1._cache_slice_param_group_info.<locals>.<listcomp>  s    'N'N'Nq'N'N'Nr,   r   r   param_startr   r   r   )r   lenrN   r   r   r   r   r   )rc   r   r   r   r   r/   r"   s          r*   r   z5ShardingOptimizerStage1._cache_slice_param_group_info  s-   'N'NE#m:L:L4M4M'N'N'N$"+M":": 	' 	'Iw  ' '"5)FH,Y7
CK ,Y7
CGL
  ,Y7
C!
  ,Y7
C
 $ ,Y7
C 
 & ,Y7
C" '	' 	'r,   c                    |                                 D ]R\  }}|j                            dd          }|\  }}	}
|	| j        |         |         d<   |
| j        |         |         d<   S|                                 D ]\  }}|| j        |         |         d<   | j        |                                          D ]\  }}|| j        |         |         d<   d S )Nslice@ r	  r   padded_sizer   )r   r   replacerN   )rc   r   r   r   r   r   r   slice_param_namer   r   r   r   r  s                r*   r   z9ShardingOptimizerStage1._cache_slice_param_range_and_size  s    (8'='='?'? 	 	#K*/77"EE(2%A{I  (34DE
  (34DE  "2!7!7!9!9 	 	D+ (3D9-HH 3I>DDFF 	 	GD! (3D9,GG	 	r,   c                    ddd}|D ]}|                                 g}d}g }t          |          dk    r|                                }|j        t	          t
          j                  k    r|}n|                                dk    rr|                    |	                    d                                                      |j        t	          t
          j                  k    r|                    |           nnt          |          dk    ||j
                            |          dz   }	|D ]8}|j
                            |          }
|	|
k    r|                    ||	           9|d         |	|d         k    r*|	|d<   t          |          dk    r|d         |d<   ||d<   |d         t          j        |d                    dS dS )a!  
        In order to overlap computation and reduce_scatter communication, we need to:
          a. place reduce_scatter in communication stream
          b. place reduce_scatter op and its producer ops after the last grad define op
        This function will complete the item b.
        N)rk   r   r   r   rk   r   r   )r   r
  r   op_roleintr   Backwardnum_operandsr1   r   r   r/   move_opr   r   )rc   r   r   insertion_infor   stackr   advance_opsr   new_idxold_idxs              r*   r   z/ShardingOptimizerStage1._reduce_scatter_overlap'  s    "&T22#  	7  	7D))++,EGKe**q..YY[[:V_!5!555 G??$$))LL!2!21!5!5!E!E!G!GHHHzS%9%999#**2... e**q.. "&*0099A=% : :B*.44R88G'))$,,R999 #5)1!666,3N5);''!++/:2t,,/6t, $+).*>????? ,+r,   c                 
   t           j                                        }t           j                                        }t           j                            |          5  d }g }d}|D ]5} |||j                  }	|                    |	           |dz   |j        z   }6|d         j        }
t          j	        j
        j        t                               t          |
         z  }|| j        z  }t           j                            ||
ddddd|dg g           \  }}d}|D ]X}t#          j        |j                  t)          j        |
          z  }||z   d	z
  |z  |z  t)          j        |
          z  }||z  }Yt           j                            |                                |g          }t-          j        |d         j        dgi           }|                    t-          j        ||                     d|_        t           j                            ||           |                                 !                    ||                                          }| j"        |_#        d|_        || j        z  }| j$        j%        &                    tO          j(                              }||z  }t           j        )                    ||||z             }d|_        t           j                            |d
|z              |                                 !                    d
|z   |                                          }| j"        |_#        d|_        d}i }i }tU          |          D ]\  }}t#          j        |j                  t)          j        |
          z  }||z   d	z
  |z  |z  t)          j        |
          z  }|||j        <   tW          ||z
  d          }||z  }tY          ||z
  |          }||k     rft           j        )                    |||          }d|_        d|j        z   } t           j        -                    ||            |.                    |           t           j                            |          5  t-          j/                     t           j        0                    |           }!d|!_        |!                    |                                           |j1        |!_1        |j2        |!_2        |j3        |!_3        |j4        |!_4        |j5        |!_5        |j6        |!_6        |j7        |!_7        |j8        |!_8        d d d            n# 1 swxY w Y   |||f||!<   	 d d d            n# 1 swxY w Y   ||||fS )Nc                    |                                  j        D ]a}|                                dk    rG||                                d         k    r)|                    d                                          c S bt          d| d          )Nzbuiltin.set_parameterparameter_namer   zcan't find param (z) in startup program)r   r   r   attrsoperandsource
ValueError)startupr   r   s      r*   get_param_from_startupzIShardingOptimizerStage1._fuse_group_param.<locals>.get_param_from_startup[  s    !..004 6 6B		%<<< BHHJJ/?$@@@!zz!}}3355555 CCCC  r,   zfused@-r   TFrs   r   r   zshard@r  )9rG   r   default_startup_programr   program_guardr   r1   r   r	   r   r   r   r   r   r_   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _pir_opsset_persistable_valuer   	add_kwargr~   
place_attrrV   r   r/   r   rU   r   r   maxminset_parameterset_parameters_fromreset_insertion_point_to_start	parameter	trainablestop_gradientr   regularizerdo_model_average	need_clipr   is_parameter)"rc   group_indexr   startup_programr   r$  startup_param_listfuse_param_namer"   startup_paramr   r   r   r   r   r   r  r   r   r   r   r   r   shard_fused_paramr   total_buffer_sizer   r   r/   r   r   init_slice_paramr  r   s"                                     r*   r   z)ShardingOptimizerStage1._fuse_group_paramV  sa    -??AA}99;;]((99 @	 @		 	 	 "$&O) E E 6 6#UZ! ! #))-888"1C"7%*"D&q)/E0:+-- <   $d&;;J#];;" NA{ J) * *wu122T5G5N5NNZ'!+
: !)%001 
 k)

66  ""ZL J 8 #02$ I   !5j)!L!LMMM&*K#O11+OOO+88::DD!1!1!3!3    +/+'+/(#t'<<J'-33DMOODDD
*J & 8 8Zj)@! ! -1)O11!8o#=   &2%>%>%@%@%J%J?*,=,B,B,D,D& &" 15"-15". !!! )*: ; ; * *uwu122T5G5N5NNZ'!+
: !)%001 
 0; ,!"3j"@!DD![0! 1J >
KK	**'-}'?'?);	( ($ 48$0'/%*'<$O11(*:   !44_EEE44\BB F F:<<<&,o&?&?,' ' 37/#,,-=-B-B-D-DEEE05-494G1494G1272C/7<7M405-5:5I2383E0F F F F F F F F F F F F F F F" #!5$[1M*m@	 @	 @	 @	 @	 @	 @	 @	 @	 @	 @	 @	 @	 @	 @	D "	
 	
s8   PU.CT;/U;T??UT?UU!$U!r   c                 ,    |                      |          S N)r   )rc   lossr9  r   param_group_idxs        r*   _apply_optimizez'ShardingOptimizerStage1._apply_optimize  s     ##L111r,   c                 ~    d| j         v r.|dk    r| j         |         S t          | j         d         |          S t          )NrA   )rK   getattrAttributeError)rc   items     r*   __getattr__z#ShardingOptimizerStage1.__getattr__  sD    4=((|##}T**4=6===  r,   c                     |dk    r&t          |           j         d}t          |          t          | j        ||          S )NrA   z._inner_opt is READ ONLY)r   __name__rG  setattrrA   )rc   rH  r   msgs       r*   __setattr__z#ShardingOptimizerStage1.__setattr__  sF    <$ZZ(BBBC %%%te444r,   c                    g }g }g }g }|                                 D ]\  }}|                                sd|vrd|v r|                    |           9d|v r|                    |           Sd|v r|                    |           m|                    |           |D ]}||= t          j        j                                         | j        |                                  | j	        D ]j}| 
                    |||           |                     |||           |                     |||           t          j        j                                         kd S )Nr  _moment_pow_acc_master)r   is_distr1   rG   devicecudaempty_cacherO   _create_dy_sharding_grouprN   _all_gather_master_opt_params_all_gather_moment_opt_params_broadcast_pow_acc_opt_params)	rc   
state_dictmaster_opt_param_namesmoment_opt_param_namespow_acc_opt_param_namesslice_param_namesr   tensor
group_infos	            r*   .convert_state_dict_without_tensor_fusion_paramzFShardingOptimizerStage1.convert_state_dict_without_tensor_fusion_param  s   !#!#"$&,,.. 	/ 	/LD&>>## t##D  &--d3333t##'..t4444d""&--d3333!((.... & 	! 	!D4  &&((('**,,,6 	- 	-J..J(>   ..J(>  
 ..J(?   M**,,,,	- 	-r,   c                    | j         j        }|t          j                                        }t          |d          }|D ]?}t          j        t          |                    }t          j                    |v r|| _	        @d S )NrB   )
rL   rQ   r   rR   rS   r=   	new_grouprT   rU   rO   )rc   r3   shard_groupsrg   rh   s        r*   rW  z1ShardingOptimizerStage1._create_dy_sharding_group!  s|    ~#<%..00D)$55! 	2 	2Eu66J}%'''1$	2 	2r,   c                    g }g }g }|                                 D ]}d|v r/|                    |                    d          d                    5d|v r/|                    |                    d          d                    hd|v r.|                    |                    d          d                    t          t	          |                    }t          t	          |                    }t          t	          |                    }| j        |                                  | j        D ]}d}|                                D ]\  }}	t          ||	d                   }| 
                    ||          }
|                     |||
|           |                     |||
|           |                     |||
|           d S )NrP  .distr   rQ  rR  r   r   )keysr1   splitrT   rY   rO   rW  rN   r   r,  _bucket_tensors_with_group_size_re_slicing_opt_param_remove_pow_acc_opt_params)rc   r[  moment_suffixspow_acc_suffixsmaster_suffixsr   ra  r   
param_namer   bucket_infos              r*   +convert_state_dict_with_tensor_fusion_paramzCShardingOptimizerStage1.convert_state_dict_with_tensor_fusion_param,  s   OO%% 	? 	?DD  %%djj&9&9"&=>>>>t##&&tzz'':':2'>????d""%%djj&9&9"&=>>>N 3 344 _!5!566N 3 344'**,,,6 	 	JJ*4*:*:*<*< F F&
J Z-DEE

>>J K &&J^   &&J^   ++J_   	 	r,   c                    |\  }}| j         j                            t          j                              }t          |                                          D ]1\  }\  }	}
|D ]&}|||         v r||	|z            |d|	z   |z   <   ||	|z   = '2d S )Nr  )rV   r   r/   r   rU   r   r   )rc   r[  ra  rq  rn  group_rank_mappingsize_mappingcur_rankrk   rp  r   pow_acc_suffixs               r*   rl  z2ShardingOptimizerStage1._remove_pow_acc_opt_paramsT  s     ,7(L'-33DMOODD-6z7G7G7I7I-J-J 	< 	<)C)*j"1 < <1#666":#>? x*4~EF zN:;;<	< 	<r,   c           	      (   |\  }}| j         j                            t          j                              }|D ]}g }	t          |                                          D ]:\  }
\  }}|||z            }g }t          j        ||                                	                                | j
                   |j        | j                                                 }t          j        ||          }|                    dg          }|	                    |           |j        d         |d         k     rC|	                    t          j        |d         |j        d         z
  g|j                             ~~t          j        j                                         <t          j        |	d          }~	t          j        j                                         d}t          |                                          D ]\  }
\  }}|||
         v r|||z            }|}t          ||
                   D ]\  }}||k    r n|||
         |         z  }||||d         z   |d         z
           }d	 t1          t3          |j        j                            D             }t7          ||j        |          }||d
|z   |z   <   ||d         z  }|||z   = t          j        j                                         ~t          j        j                                         d S )Nrg   axisr   r   r  )r   r   r	  c                 4    g | ]}t          j                    S r  r   	Replicater  s     r*   r  zAShardingOptimizerStage1._re_slicing_opt_param.<locals>.<listcomp>  s0     2 2 2 ((2 2 2r,   r  )rV   r   r/   r   rU   r   r   r   _local_value
contiguousrO   r   r^   r   rG   concatviewr1   r   zerosr   rT  rU  rV  r   r
  r   r   )rc   r[  ra  rq  param_suffixsrt  ru  rv  param_suffixopt_param_listrk   rp  r   	opt_param
param_listparam_sharding_axisglobal_opt_paramfused_opt_paramparam_indexcur_rank_start_indexr;   rank_idshard_opt_paramshard_opt_param_placementss                           r*   rk  z-ShardingOptimizerStage1._re_slicing_opt_paramb  s    ,7(L'-33DMOODD) N	- N	-LN1::;K;K;M;M1N1N 1 1--j*&zL'@A	
**,,7799.   
 '0&:''')) $ $*=%8$ $ $  $4#8#8"#>#> %%&6777 $)!,z-/HHH")) *= 9"2"8";!< #3"8      0"..0000 %mNCCCOM**,,, K1::;K;K;M;M1N1N 1 1--j*1#666 !+:+D EI+6(&/0B30G&H&H E E
7"h..!E,S0A!0DD,,&5,/C$[102$]304 4'O
2 2!&s9+A+G'H'H!I!I2 2 2. ':'!.2' 'O ( x*4|CD z-88zL89"..0000  M**,,,,]N	- N	-r,   c                 L   g }|                                 D ]V\  }}d|z   |z   }||vr||vr|                    ||                                                                                    Wt	          |          dk    rd S t          j        |d          }	g }
t          j        |
|	| j	                   d |
D             }
t          j        |
d          }	|                                 D ]4\  }}d|z   |z   }||vr||vr||                                         }||= 5t
          j
        j                                         d}|                                 D ]$\  }}d|z   |z   }t          j        |d                   }| j        n|d         | j                 }t#          |t          j                  rA|                                }||xx         | j        z  cc<   t+          ||                   ||<   t-          t.          j        |d          }|	|||z            }|                    |          }|d	         }t5          || j        |d                   }t9          d           gt	          |j                  z  }| j        j                             t          j!                              }|| j                                                 }|| j"        z  |j        |         z  }||j        |         | j"        z  z   }t9          t+          |          t+          |                    }|||<   |tG          |                   }tI          |                                |||j                  }||||z   <   |d
         }||z  }&t
          j
        j                                         d S )Nr  r   rz  ry  c                 6    g | ]}|                                 S r  )cpu)r  rH  s     r*   r  zBShardingOptimizerStage1._all_gather_opt_params.<locals>.<listcomp>  s     LLLt

LLLr,   r   r   r   r   r  )%r   r1   r  cloner
  rG   r  r   r   rO   rT  rU  rV  r    r!   rW   r`   r   r   r   ra   r  r   operatormulreshaper+   r^   slicer   rV   r   r/   rU   r_   tupler   )rc   r[  ra  opt_param_names
opt_suffixr  rp  r   opt_param_namer  fused_opt_param_listlocal_tensorr  global_shapemp_placementparam_tensor_parallel_axisglobal_sizeglobal_paramr  opt_param_meshopt_param_placementsshard_indexr   r  shard_slice_start_idxshard_slice_end_idxshard_slicer  s                               r*   _all_gather_opt_paramsz.ShardingOptimizerStage1._all_gather_opt_params  s    
 &0&6&6&8&8 	 	"J
%
2Z?NZ//_44!!>*7799??AA    ~!##F -Q???! /9M	
 	
 	
 	
  ML7KLLL -(<1EEE&0&6&6&8&8 	+ 	+"J
%
2Z?NZ//_44%n5BBDDL>**&&(((&0&6&6&8&8 9	' 9	'"J
%
2Z?N=G)<==L~)),78JKlDJ77 1=1E1E1G1G. !;<<<O<<<?B$%?@@ @L!;< !|Q??K*kK77L +22<@@O'7N#>!4j6N$ $ 
 !;;-#o.C*D*DDK'-33DMOODDD"6##gii  
 t,,%&9:%;! &!'(;<'((    )**C0C,D,D K 0;K+,-eK.@.@AO1$$&&$%	 O 3BJzJ./$]3K;&KK&&(((((r,   c                 |   t          |          dk    rd S i }|D ]A}|                    d          d         }||vrg ||<   ||                             |           Bt          t	          |                                                    }|                                D ]\  }}|                     ||||           d S Nr   rg  r   )r
  ri  r1   dictrT   r   r  )rc   r[  ra  r]  momentsr   moment_suffixmoment_namess           r*   rY  z5ShardingOptimizerStage1._all_gather_moment_opt_params  s     %&&!++F* 	0 	0D JJw//3MG++)+&M"))$////vgmmoo..//+2==?? 	 	'M<''Jm   	 	r,   c                     t          |          dk    rd S |d                             d          d         }|                     ||||           d S r  )r
  ri  r  )rc   r[  ra  r\  master_suffixs        r*   rX  z5ShardingOptimizerStage1._all_gather_master_opt_params2  sh     %&&!++F.q177@@D##"		
 	
 	
 	
 	
r,   c                 P   t          |          dk    rd S g }|D ]2}|                    d          d         }|                    |           3t          t	          |                    }d}|                                D ]\  }}	t          ||	d                   }|                     ||          \  }
}| j        j	        
                    t          j                              }t          |                                          D ]\  }\  }}	|
|         d         }|D ]}d|z   |z   }||k    rf||         }|                                }t          j        || j        j	        |         | j                   ||||z   <   |                    |           w|	d         }d t%          t          |j                            D             }t)          j        d	g          }t          j        || j        j	        |         | j                   t-          |||          }||||z   <   !d S )
Nr   rg  r   r   r  )srcrg   r   c                 4    g | ]}t          j                    S r  r}  r  s     r*   r  zIShardingOptimizerStage1._broadcast_pow_acc_opt_params.<locals>.<listcomp>g  s-     & & &-.((& & &r,   r   )r
  ri  r1   rT   rY   r   r,  rj  rV   r   r/   r   rU   r   r  	broadcastrO   r   r   r   rG   r  r   )rc   r[  ra  r^  rn  r   rw  r   rp  r   rt  r   rv  rk   	root_rankpow_acc_namepow_acc_tensorpow_acc_local_tensortmp_meshtmp_placementstmp_datas                        r*   rZ  z5ShardingOptimizerStage1._broadcast_pow_acc_opt_params@  s    &''1,,F+ 	3 	3D!ZZ004N"">2222 _!5!566
&0&6&6&8&8 	B 	B"J
ZK)@AAJJ !% D D
!
 !
A '-33DMOODD-6z7G7G7I7I-J-J 	M 	M)C)*j*3/2I"1 M M'*4~Ey((%/%=N+9+F+F+H+H(N, 06yA"2   
 ?MJzN:;NN<0000).9H& &27HN8K8K2L2L& & &N  &|QC00HN  06yA"2   
 &9 (N& &N ?MJzN:;;7M	M 	Mr,   c                    d |D             }d |D             }d}d}t          |                                          D ]\  }}|d         }	|	dk    r||z
  }
|	|
k    r>||                             |           ||                             |	           ||	z  }d}	nM|
dk    r@||                             |           ||                             |
           |	|
z  }	||
z  }|dz  }d}|	dk    ||fS )Nc                     g | ]}g S r  r  r  s     r*   r  zKShardingOptimizerStage1._bucket_tensors_with_group_size.<locals>.<listcomp>w  s    000000r,   c                     g | ]}g S r  r  r  s     r*   r  zKShardingOptimizerStage1._bucket_tensors_with_group_size.<locals>.<listcomp>x  s    ///q///r,   r   r  r   )r   valuesr1   )rc   ra  r   group_mappingru  current_sizecurrent_bucket_indexrk   r   tensor_sizeavailable_spaces              r*   rj  z7ShardingOptimizerStage1._bucket_tensors_with_group_sizev  sC   00Z000//J/// ():):)<)<== 	% 	%OC$]3K//",|";/11!#&--.BCCC %,,[999 K/L"#KK '**%c*112FGGG$S)00AAA#6$7(A-(#$L# //& l**r,   c                     t          j                    }t          |                                          }|D ]E}||         }|                                sd|vr$d|v sd|v sd|v r| d| }||         ||<   ||= Fd S )Nr  rP  rQ  rR  _rank)r   rU   listrh  rS  )rc   r[  rv  tensor_namesr   r`  	rank_names          r*   (convert_state_dict_with_rank_unique_namez@ShardingOptimizerStage1.convert_state_dict_with_rank_unique_name  s    =??JOO--..  	! 	!D%F>>## t##D  J$$6$6)t:K:K#44(44	(24(8
9%4  	! 	!r,   c                     t          |                                          }t          |                                          D ]/}d|v r)|                    d          d         }||         ||<   ||= 0d S )Nr  r   )r  rh  ri  )rc   r[  r  r   no_rank_names        r*   #convert_state_dict_with_origin_namez;ShardingOptimizerStage1.convert_state_dict_with_origin_name  s{    JOO--..**++ 	% 	%D$#zz'2215+5d+;
<(t$		% 	%r,   )NN)r   )rK  
__module____qualname____doc__rn   r   r   r   r   r   rD  rI  rN  r   rb  rW  rr  rl  rk  r  rY  rX  rZ  rj  r  r  r  r,   r*   r?   r?   _   s        
<  <  <  < |U. U. U.n
' ' ',  .-@ -@ -@^I
 I
 I
X DE2 2 2 2
! ! !5 5 5 WYY*- *- Y*-X	2 	2 	2 WYY% % Y%N< < <T- T- T-l WYYd) d) Yd)L  &
 
 
4M 4M 4Ml+ + +<! ! !"% % % % %r,   r?   rA  )/r    r  collectionsr   	functoolsr   	itertoolsr   numpyr   rG   paddle.distributedrx   r   r   paddle.autogradr   paddle.base.libpaddler   r	   5paddle.distributed.auto_parallel.static.process_groupr
   Jpaddle.distributed.auto_parallel.static.reshard_funcs.nd_mesh_reshard_funcr   -paddle.distributed.auto_parallel.static.utilsr   /paddle.distributed.fleet.meta_optimizers.commonr   3paddle.distributed.fleet.utils.tensor_fusion_helperr   r   $paddle.distributed.passes.pass_utilsr   paddle.frameworkr   ru   paddle.optimizerr   	moe_utilsr   &static.reshard_funcs.base_reshard_funcr   re   r   r+   r=   r?   r  r,   r*   <module>r     s     # # # # # #                  ! ! ! ! ! !       # # # # # # % % % % % % $ $ $ $ $ $           E D D D D D B B B B B B        H G G G G G D D D D D D & & & & & & * * * * * * P P P P P P         6  &L% L% L% L% L%i L% L% L% L% L%r,   