
    Αi%                     2   S SK r S SKrS SKJr  S SKJr  S SKJr  S SKr	S SK
r
S SKJr  S SK
Jr  S SKJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJr  S SKJr  S SKJrJr  S SKJ r   S SK!J"r#  S SK$J%r%  SSK&J'r'  SSK(J)r)  SSK*J+r+  SS jr,S r- " S S\%5      r.g)    N)OrderedDict)reduce)product)core)no_grad)pir)fleet)new_process_group)get_1D_sub_process_mesh)
split_mesh)OpRole)alignget_current_device_type)AutoParallelStreamType)_current_expected_place_)	Optimizer   )_dtensor_from_local)copy_op_attr_with_new_member)Strategyc                 f   SnUc  U R                   nU HA  n[        U[        R                  5      (       d  M$  US:X  d   S5       eUR	                  5       nMC     S n[        U R                  5       H   nXc:w  d  M
  [        R                  " U5      n  O   [        R                  " U5      nUb  XWU'   U$ )NzUThe parameter can't be shard twice with sharding strategy even in different mesh now.)	
placements
isinstancedistShardget_dimrangendimcopydeepcopy)paramsharding_axisparam_placements
shard_axis	placementplacement_with_shardingdimnew_placementss           i/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/auto_parallel/sharding.pyget_placement_with_shardingr+   1   s    J ++%	i,, # g# #**,J & #UZZ &*jjo# !
 ]]#34N*(?}%    c                    XR                   ;   d   eU R                   R                  U5      n/ nU R                   H  nUR                  [	        U5      5        M     S/X2'   / n[        U6  Hg  n/ n[	        SU R                  U5      5       H2  n/ USU QUPXbS-   S  Q7n	UR                  U R                  U	   5        M4     UR                  U5        Mi     U$ )Nr   r   )	dim_namesindex_shapeappendr   r   get_dim_sizemesh)
r3   	axis_name
axis_indexrangesdim_num
all_resultxresulticoords
             r*   get_mesh_comm_listr=   L   s    &&&%%i0JF;;eGn% FJfq$++I67A?a*o?q?1!^-=+>?EMM$))E*+ 8 	&!  r,   c                       \ rS rSrSrSS jrS rS rS rS r	S	 r
 SS
 jrS rS r\" 5       S 5       rS r\" 5       S 5       rS rS r\" 5       S 5       rS rS rS rS rS rS rSrg)ShardingOptimizerStage1_   z,
.. ZeRO: https://arxiv.org/abs/1910.02054

Nc                    Uc   S5       e[        U[        R                  R                  [        R                  R                  45      (       d   S5       eXR
                  S'   X l        U=(       d
    [        5       U l        / U l	        S U l
        [        R                  " 5         U R                  R                  c  [        R                  R                  5       nOU R                  R                  n[!        US5      nU H8  n[#        [%        U5      5      n[        R&                  " 5       U;   d  M2  Xpl        M:     S U l        SUR,                  ;   aJ  [!        US5      nU H8  n[#        [%        U5      5      n[        R&                  " 5       U;   d  M2  Xpl        M:     [/        5       U l        SUR2                  ;   a  UR5                  S[        R&                  " 5       5      n	[7        SUR9                  S5      5       H,  n
U R0                  R;                  UR=                  SU
S95        M.     UR=                  SU	S9nOU R0                  R;                  U5        UR,                  R?                  S5      U l         URB                  U R@                     U l"        S	U l#        S
U l$        SUR,                  ;   a  UR,                  R?                  S5      U l#        URB                  U RF                     U l$        [/        5       nU R0                   H?  nUR;                  U5        [K        XRF                  S9 H  nUR;                  U5        M     MA     Xl        [        RL                  " 5         g )Nz)The argument `optimizer` cannot be empty.zR`paddle.distributed.ShardOptimizer` only supports AdamW and SGD optimizer for now.
_inner_optdpmpppr   )r/   r   r   )global_meshsub_mesh_dim)'r   paddle	optimizerAdamWSGD__dict__	_shard_fnr   	_strategy_slice_param_group_info_dy_shard_groupenable_static_meshr   auto_parallelget_meshr=   r
   sortedget_rank_sharding_group	_mp_group
_dim_namesset	pp_meshesr.   get_rank_by_dim_and_process_idr   r2   addget_mesh_with_dimr/   _sharding_axisr0   _sharding_degree_mp_mesh_axis
_mp_degreer   disable_static)selfrI   shard_fnstrategyr3   	dp_groupsgroup
comm_group	mp_groupspp_rankidxr[   pp_meshsub_pp_meshs                 r*   __init__ ShardingOptimizerStage1.__init__e   s   $ 	
7	
$ ((..0@0@0D0DE
 
 	
 a	
 

 '0l#!!/XZ')$#>>'%%..0D>>''D&tT2	E*6%=9J}}%''1$ 
 4??"*46I".ve}=
==?e+%/N #
 4>>!99$PGQ 1 1$ 78""4#9#9$c#9#JK 9))$g)>DNNt$"oo33D9 $D,?,? @4??"!%!6!6t!<D"kk$*<*<=DOI>>g&#- '6H6H$K MM+.$ * 'Nr,   c                 4    [        5       n[        U[        R                  R                  5      (       aE  [        R                  R	                  [        R
                  R                  5       R                  5      n[        R                  R                  R                  5       U l        U R                  R                  U5        U R                  R                  R                  nUS:  a  Sn0 n0 nSnSn/ nU GHm  u  pU
c  M  U	R!                  5       nU
R!                  5       nUc   SU	R"                   SU	 S35       eUc   SU	R"                   SU
 S35       eUR$                  UR$                  :X  d   S	U	R"                   S
U	 SU
 S35       eU R&                  UR(                  ;  a:  UR+                  X45        U	R,                  c
  SS0U	l        OSU	R,                  S'   M  U	R,                  c
  SS0U	l        OSU	R,                  S'   UR$                  U R.                  ;   d/   SU	R"                   SUR$                   SU R.                   S35       e[0        R2                  " 5       UR$                  R4                  ;   a  [7        UR$                  U R&                  5      n[9        UR4                  5      U R:                  R<                  :X  d8   SU	R"                   SUR4                   SU R:                  R<                   35       eUR(                  [?        5       :X  d   SU	R"                   SU	 S35       eUR@                  UR@                  :X  d   SU	R"                   S
U	 SU
 S35       eU	RB                  U
RB                  :X  d   SU	R"                   S
U	 SU
 S35       eU	RD                  U
RD                  :X  d   SU	R"                   S
U	 SU
 S35       eU RF                  S:  a$  U RH                  UR@                  ;   a
  SU	l%        SnO	SU	l%        SnURM                  UR$                  / 5      R+                  U	5        URM                  UR$                  / 5      R+                  U
5        GMp     [        RN                  RQ                  5       nURS                  5       nURT                  S   nUS-  S-  n/ nURW                  5        GH  u  nnUU   n[Y        5       n[Z        R\                  " UUU/5      n[0        R2                  " 5       UR4                  ;   a  U R_                  UU5        [a        U5       GH  u  nn/ n/ nU HY  nURM                  U/ 5      R+                  UU   R"                  5        UR+                  UU   5        UR+                  UU   5        M[     U R                  R                  Rb                  (       a  U Re                  UU5        U Rg                  UU5      u  nnnn US   Rh                  n![j        Rl                  Rn                  Rp                  [s        5          [t        US   Rh                     -  n"U"U Rv                  -  [x        Rz                  " U!5      -  [x        Rz                  " US   Rh                  5      -  n"[0        R2                  " 5       UR4                  ;   a  U R}                  UUUU"5        U R                  R                  R~                  (       d  [        R                  R                  UU!SSSSSU"S/ / 5      u  n#n$U R                  R                  R                  (       d  U H
  n
SU
lD        M     SU$lD        [        RZ                  R                  U$R                  5       U RD                  5      n%U$R                  [Z        R                  " U%U$R!                  5       5      5        GOIS n&S n'U H=  n
U
R                  5       n(URT                  R                  U(5      nU'b  UU':  d  M9  Un'U(n&M?     [Z        R                  " U&5        [        R                  R                  U RD                  U!U R                  5      n$[Z        R                  " US/0 5      n)U$R                  [Z        R                  " U$R                  5       U)5      5        U$R                  5       R                  S5      n*U*R                  [Z        R                  " U*R                  5       U)5      5        Sn+U GH  n
U
R                  5       n([        R                  " U
RD                  5      n,[Z        R                  " U(5        [        R                  R                  U$U+U+U,-   5      n-[        R                  R                  U-U
RD                  5      n-[Z        R                  " U(5        [        R                  R                  U
U-/5        U+U,[x        Rz                  " U!5      -  U"-   S-
  U"-  U"-  [x        Rz                  " U!5      -  -  n+GM     U R                  R                  Rb                  (       d  [Z        R                  " 5         U$RD                  S   U Rv                  -  n.U R:                  R<                  R                  [0        R2                  " 5       5      n/U/U.-  n0U0U.-   n1[        R                  R                  U$U0U15      n2[        R                  R                  U$U R:                  R                  U Rv                  5      n3U R                  R                  Rb                  (       a7  U3R                  5       R                  [        R                  R                  5        [Z        R                  " 5         [        R                  R                  U2U3/5        / n4URW                  5        H  u  n5n6U4R+                  U55        M     UR+                  U4UU 45        URW                  5        GHi  u  n5n6U6u  nn7n8[        R                  R                  U3U7U85      n9UU   R!                  5       R                  n:U:R                  U R&                  5        [Z        R                  " U9R$                  S/U:5      n;U9R                  [Z        R                  " U9R                  5       U;5      5        U9R                  5       R                   R                  5       n<U<S   R                  5       U<S'   [Z        R                  " U9R$                  S/U:5      U<S'   [        U9R                  5       R                   U<S9U9R                  5       l        UR+                  U5U945        GMl     GM     GM     U R                  R                  b  SU R                  R                  lc        U R:                  U R                  R                  ld        U R                  U R                  R                  lf        X`R                  R                  lg        XpR                  R                  lh        U R                  R                  U5        [Z        R                  " 5         U GH  u  n4n=n>U R                  R                  Rb                  (       Ga&  S n?S nU4S   R                  5        H-  n@URT                  R                  U@5      nAU?b  WAU?:  d  M)  WAn?W@nM/     [        R                  R                  UR                  5       S   5      nBUBR                  5       R                  [        R                  R                  5        [        R                  R                  U=U R:                  R                  U Rv                  5      nCUCR                  5       R                  [        R                  R                  5        O?[        R                  R                  U=U R:                  R                  U Rv                  5      nC[        R                  R                  U>WC/5        GM     URT                  R                  U5      S-   nDURT                  UDS  $ )Nr      Fz5parameter dist attribute must not None. but received z : .z4gradient dist attribute must not None. but received z grad : zDParameter and grad should have same process_mesh. but received name:z, parameter:z, grad: 	no_fusionTzAparameter mesh mush be in pp_meshes. but received parameter name:z, mesh:z, pp_meshes: z? all parameter must have the same sharding group. but received z sharding group is : z, global sharding group is: z?Sharding fusion do not support partial parameter. but received zDParameter and grad should have same dims_mapping. but received name:zDParameter and grad should have same global shape. but received name:zCParameter and grad should have same local shape. but received name:r   r   i           )new_results)m_get_devicer   rH   	framework	CUDAPlacedistributedParallelEnvdev_idbase	libpaddlePlace_place	set_placerN   shardingcomm_buffer_size_MB	dist_attrnameprocess_meshr_   partial_dimsr1   optimize_attrr[   r   rV   process_idsr   rU   rW   ranksrZ   dims_mappingshape_local_shaperb   ra   is_distributed
setdefaultstaticdefault_main_programglobal_blockopsitemsr   r   assign_value_group_by_size_cache_slice_param_group_info	enumerateenable_overlap_reduce_scatter_overlap_fuse_group_paramdtyper	   utilstensor_fusion_helper	alignmentr   r   r`   r   size_of_dtype!_cache_slice_param_range_and_sizerelease_gradients_C_opscoalesce_tensor_pipelineenablepersistablecreate_shaped_typetypeset_typecvt_to_dist_typeget_defining_opr/   set_insertion_pointemptycreate_tensor_dist_attributeoperand_sourcenpprod
view_slice
view_shapeset_insertion_point_after	share_varreset_insertion_point_to_endreduce_scatteridset_execution_streamr   SHARDING_STREAMvaluepartial_statuspopresultsas_tensor_dist_attrr   rB   
_grad_clipshould_comm_on_shard_dimsharding_grouprX   mp_grouphas_dist_paramhas_not_dist_paramapply_gradientsall_used_opsnop
all_gather)Erd   params_gradsplacer   parameters_dict
grads_dictr   r   new_params_gradsr"   gradparam_dist_attrgrad_dist_attrsub_meshmain_programtarget_blocklast_op
group_sizeall_gather_param_info_listr3   
parametersgrads
var_groupsgroup_indices	group_idxindicesgroup_param_listgroup_grad_listr/   slice_param_dictpadded_size_dictmain_shard_fused_parammain_fused_paramr   
align_size_
fused_grad
fused_typefirst_grad_opfirst_indexgrad_opr   prev_var
grad_beginsizegrad_buffer
shard_sizerank
rank_beginrank_endview_shard_fused_gradshard_fused_gradslice_param_listslice_param
param_infoparam_begin	param_end
slice_gradpartail_statusslice_grad_dist_attrslice_grad_out_dist_attrshard_paramfused_paramlast_idxoprl   tmpallgather_valuestart_indexsE                                                                        r*   r   'ShardingOptimizerStage1.apply_gradients   sG   eV--7788$$..""..077E kk++113e$"nn55II""%
"'KE|#oo/O!^^-N". G

|SVW\V]]^_. "- FuzzlRZ[_Z``ab-  ,,0K0KK WW\WaWaVbbnotnuu}  C  ~D  DE  FK
 "".*E*EE ''6&&.+6*=E'7;E''4&&.+6*>E'7<E''4"//4>>A STYT^T^S__fgv  hD  hD  gE  ER  SW  Sa  Sa  Rb  bc  dA }}/">">"J"JJ2#00$2E2E 8//0D4H4H4N4NN VV[V`V`Uaavw  xL  xL  wM  Mi  jn  j~  j~  jD  jD  iE  FN
 #//358 QRWR\R\Q]]`af`gghi8  ,,0K0KK WW\WaWaVbbnotnuu}  C  ~D  DE  FK ;;$**, VW\WaWaVbbnotnuu}  C  ~D  DE  F, %%):):: UV[V`V`Uaamnsmtt|  ~B  }C  CD  E:
 !#&&/*F*FF'+$!% (-$%)"&&'C'CRHOO !!/">">CJJ4PQ (T }}99;#002""2&(4/$6
%'" / 5 5 7D*t$E$J::Z4M }}$"2"2222:}M&/&>"	7#% "$$E)))R8??"5).. %++Ju,=>#**5<8 % >>**9900,O **96FG$$*$ (*00KK44>>/1 -a06678  ++,((/0 ))*:1*=*C*CDE  ==?d&6&66::!(("	 ~~..@@$*MM$B$B'"%MAz  >>2299$3D/3D, %4-1J*!'!>!>")+;+H+H"J '',,Z9M9M9OP %)M"&K /"&"6"6"8 , 0 0 6 6w ?&.%+2E*/K,3M !0 ++M:!'!4!4(55"J
 !$ @ @tR PI'',,Z__->	J  *99;JJ1MH%%,,X]]_iH "#J /"&"6"6"8!wwt'8'89//8&,mm&>&>&
J4E' '-mm&>&>'):):' 55g>//{0CD" %)4+=+=e+D$D&0%1&'%( $.!. ))  $11%8	9
 !02 ~~..==446'44Q74;P;PP
++1177H!J.
%
2(.(@(@
H)% $*==#?#? 4 4 7 79N9N$  >>**99$446KK.>>DD 002''*,<= $& /?/E/E/G+K$++K8 0H +11(.( 0@/E/E/G+K4>1E;	!'!9!9(+y"J (.88:II # #&&t':':;+.+K+K"//"~,( '',,&OO-/C #224>>FFH - 3K3))+ -Q/ 88&33bT> -Q/ 5&668BB(@ ..0: %++[*,EFI 0Hs '? !8R ??%%1BFDOO&&?8<8L8LDOO&&526..DOO&&/8FOO&&5<NOO&&9''(89((*
 (	
~~&&555*2.;;=B&**004C'3>#&"$	 > mm''(9!(<=##%::*::@@ #)--":":!5!5!8!8$:O:O#  //1FF*::@@ #)--":":!5!5!8!8$:O:O# MM##[/$BC5 (8 #&&,,W59--r,   c                 N   [        [        U5      5       Vs/ s H  n0 PM     snU l        [        U5       H  u  pEU H  nX   n0 U R                  U   UR                  '   UR
                  U R                  U   UR                     S'   SU R                  U   UR                     S'   SU R                  U   UR                     S'   UR                  U R                  U   UR                     S'   UR                  U R                  U   UR                     S'   M     M     g s  snf )Nr   r   param_startr   r   r   )r   lenrO   r   r   r   r   r   )rd   r   r   r   r   r   r/   r"   s           r*   r   5ShardingOptimizerStage1._cache_slice_param_group_info  s1   49#m:L4M'N4Mq4M'N$"+M":I ")FH,,Y7

CKK ,,Y7

CGL
  ,,Y7

C!
  ,,Y7

C
 $$ ,,Y7

C 
 && ,,Y7

C" ! #; (Os   D"c                    UR                  5        HP  u  pVUR                  R                  SS5      nUu  pn
U	U R                  U   U   S'   U
U R                  U   U   S'   MR     UR                  5        H  u  pUU R                  U   U   S'   M     U R                  U   R                  5        H  u  pUU R                  U   U   S'   M     g )Nslice@ r  r   padded_sizer   )r   r   replacerO   )rd   r   r   r   r   r   r   slice_param_namer   r   r   r   r  s                r*   r   9ShardingOptimizerStage1._cache_slice_param_range_and_size  s     (8'='='?#K*//77"E(2%AI  ((34DE
  ((34DE (@ "2!7!7!9D ((3D9-H ":
 33I>DDFGD ((3D9,G Gr,   c                 ^   SSS.nU GH  nUR                  5       /nSn/ n[        U5      S:  a  UR                  5       nUR                  [	        [
        R                  5      :X  a  UnOUR                  5       S:X  ag  UR                  UR                  S5      R                  5       5        UR                  [	        [
        R                  5      :w  a  UR                  U5        OO[        U5      S:  a  M  Uc  M  UR                  R                  U5      S-   n	U H6  nUR                  R                  U5      n
X:w  d  M%  UR                  X5        M8     US   b  XS   :  d  GM_  XS'   [        U5      S:  a  US   US'   GM}  XcS'   GM     US   b  [        R                  " US   5        gg)z
In order to overlap computation and reduce_scatter communication, we need to:
  a. place reduce_scatter in communication stream
  b. place reduce_scatter op and its producer ops after the last grad define op
This function will complete the item b.
N)rl   r  r   r   rl   r   r  )r   r  r   op_roleintr   Backwardnum_operandsr1   r   r   r/   move_opr   r   )rd   r   r   insertion_infor   stackr   advance_opsr  new_idxold_idxs              r*   r   /ShardingOptimizerStage1._reduce_scatter_overlap'  s    "&T2#D))+,EGKe*q.YY[::V__!55 G??$)LL!2!21!5!E!E!GHzzS%99#**2. e*q. "&**009A=%B*..44R8G)$,,R9 & #5)1!66,35);'!+/:2t,/6t,A $F $+)).*>? ,r,   c                 d   [         R                  R                  5       n[         R                  R                  5       n[         R                  R	                  U5         S n/ nSnU H8  nU" X8R
                  5      n	UR                  U	5        US-   UR
                  -   nM:     US   R                  n
[        R                  R                  R                  [        5          [        U
   -  nXR                  -  n[         R                  R!                  UU
SSSSSUS/ / 5      u  pSnU Hc  n["        R$                  " UR&                  5      [(        R*                  " U
5      -  nX-   S	-
  U-  U-  [(        R*                  " U
5      -  nUU-  nMe     [         R,                  R/                  UR1                  5       U/5      n[,        R2                  " US   R4                  S/0 5      nUR7                  [,        R8                  " UU5      5        SUl        [         R<                  R?                  X5        URA                  5       RC                  X}R1                  5       5      nU RD                  Ul#        SUl        XR                  -  nU RH                  RJ                  RM                  [N        RP                  " 5       5      nUU-  n[         R                  RS                  UUUU-   5      nSUl        [         R<                  R?                  US
U-   5        URA                  5       RC                  S
U-   UR1                  5       5      nU RD                  Ul#        SUl        Sn0 n0 n[U        U5       GH  u  nn["        R$                  " UR&                  5      [(        R*                  " U
5      -  nX-   S	-
  U-  U-  [(        R*                  " U
5      -  nUUUR
                  '   [W        UU-
  S5      nUU-  n[Y        UU-
  U5      nUU:  d  M  [         R                  RS                  UUU5      nSUl        SUR
                  -   n [         R<                  R[                  UU 5        UR]                  U5        [         R                  R	                  U5         [,        R^                  " 5         [         R<                  Ra                  U 5      n!SU!l        U!R7                  UR1                  5       5        URb                  U!l1        URd                  U!l2        URf                  U!l3        URh                  U!l4        URj                  U!l5        URl                  U!l6        URn                  U!l7        URp                  U!l8        S S S 5        UUU4UW!'   GM     S S S 5        WWWW4$ ! , (       d  f       N)= f! , (       d  f       N%= f)Nc                     U R                  5       R                   HP  nUR                  5       S:X  d  M  XR                  5       S   :X  d  M1  UR	                  S5      R                  5       s  $    [        SU S35      e)Nzbuiltin.set_parameterparameter_namer   zcan't find param (z) in startup program)r   r   r   attrsoperandsource
ValueError)startupr   r  s      r*   get_param_from_startupIShardingOptimizerStage1._fuse_group_param.<locals>.get_param_from_startup[  so    !..044B	%<< HHJ/?$@@!zz!}3355 5 !(.BC r,   zfused@-r   TFru   r   r   zshard@r  )9rH   r   default_startup_programr   program_guardr   r1   r   r	   r   r   r   r   r   r`   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _pir_opsset_persistable_valuer   	add_kwargr   
place_attrrW   r   r/   r   rV   r   r   maxminset_parameterset_parameters_fromreset_insertion_point_to_start	parameter	trainablestop_gradientr   regularizerdo_model_average	need_clipr   is_parameter)"rd   group_indexr   startup_programr   r%  startup_param_listfuse_param_namer"   startup_paramr   r   r   r   r   r   r  r   r   r   r   r   r   shard_fused_paramr   total_buffer_sizer   r   r/   r   r   init_slice_paramr  r   s"                                     r*   r   )ShardingOptimizerStage1._fuse_group_paramV  sC    --??A}}99;]]((9	 "$&O) 6#ZZ! #))-8"1C"7%**"D * 'q)//E00::+- <   $&;&;;J#]];;"NA J)wwu112T5G5G5NN'!+
: !))%01 
 k)
 *  66  "ZLJ 88 #002$I   !5!5j)!LM&*K#OO11+O+88:DD!1!1!3  +/++'+/(#'<'<<J''--33DMMODD
*J & 8 8Zj)@! -1)OO11!8o#= &2%>%>%@%J%J?*,=,B,B,D&" 15"-15". !!! )*: ;uwwu112T5G5G5NN'!+
: !))%01 
 0; ,!"3j"@!D![0! 1J >
K	*'-}}'?'?);	($ 48$0'/%**'<$OO11(*: !44_E44\B::<&,oo&?&?,' 37/#,,-=-B-B-DE05-494G4G1494G4G1272C2C/7<7M7M405-5:5I5I2383E3E0 C" #!5$[1M !<m :D "	
 	
+ CBY :9s-   N#V!BV!
C#V-V!
VV!!
V/c                 $    U R                  U5      $ N)r   )rd   lossr;  r   param_group_idxs        r*   _apply_optimize'ShardingOptimizerStage1._apply_optimize  s     ##L11r,   c                     SU R                   ;   a.  US:X  a  U R                   U   $ [        U R                   S   U5      $ [        e)NrB   )rL   getattrAttributeError)rd   items     r*   __getattr__#ShardingOptimizerStage1.__getattr__  sA    4==(|#}}T**4==6==  r,   c                     US:X  a#  [        U 5      R                   S3n[        U5      e[        U R                  X5      $ )NrB   z._inner_opt is READ ONLY)r   __name__rK  setattrrB   )rd   rL  r   msgs       r*   __setattr__#ShardingOptimizerStage1.__setattr__  s>    <$Z(())ABC %%t44r,   c                    / n/ n/ n/ nUR                  5        H  u  pgUR                  5       (       d  M  SU;  a  M$  SU;   a  UR                  U5        M=  SU;   a  UR                  U5        MV  SU;   a  UR                  U5        Mo  UR                  U5        M     U H  nX	 M     [        R                  R
                  R                  5         U R                  c  U R                  5         U R                   Ha  nU R                  XU5        U R                  XU5        U R                  XU5        [        R                  R
                  R                  5         Mc     g )Nr  _moment_pow_acc_master)r   is_distr1   rH   devicecudaempty_cacherP   _create_dy_sharding_grouprO   _all_gather_master_opt_params_all_gather_moment_opt_params_broadcast_pow_acc_opt_params)	rd   
state_dictmaster_opt_param_namesmoment_opt_param_namespow_acc_opt_param_namesslice_param_namesr   tensor
group_infos	            r*   .convert_state_dict_without_tensor_fusion_paramFShardingOptimizerStage1.convert_state_dict_without_tensor_fusion_param  sB   !#!#"$&,,.LD>>##t#D &--d3t#'..t4d"&--d3!((. /" &D  &&&('**,66J..(> ..(>
 ..(? MM**, 7r,   c                    U R                   R                  nUc  [        R                  R	                  5       n[        US5      nU HC  n[        R                  " [        U5      5      n[        R                  " 5       U;   d  M=  X@l	        ME     g )NrC   )
rM   rR   r   rS   rT   r=   	new_grouprU   rV   rP   )rd   r3   shard_groupsrh   ri   s        r*   r]  1ShardingOptimizerStage1._create_dy_sharding_group!  sf    ~~##<%%..0D)$5!Eu6J}}%''1$ "r,   c                    / n/ n/ nUR                  5        H  nSU;   a%  UR                  UR                  S5      S   5        M.  SU;   a%  UR                  UR                  S5      S   5        MY  SU;   d  Ma  UR                  UR                  S5      S   5        M     [        [	        U5      5      n[        [	        U5      5      n[        [	        U5      5      nU R
                  c  U R                  5         U R                   Hs  nSnUR                  5        H  u  p[        XyS   5      nM     U R                  Xg5      n
U R                  XX5        U R                  XX5        U R                  XX5        Mu     g )NrV  .distr   rW  rX  r   r   )keysr1   splitrU   rZ   rP   r]  rO   r   r.  _bucket_tensors_with_group_size_re_slicing_opt_param_remove_pow_acc_opt_params)rd   ra  moment_suffixspow_acc_suffixsmaster_suffixsr   rg  r   
param_namer   bucket_infos              r*   +convert_state_dict_with_tensor_fusion_paramCShardingOptimizerStage1.convert_state_dict_with_tensor_fusion_param,  sc   OO%DD %%djj&9"&=>t#&&tzz'':2'>?d"%%djj&9"&=> &  N 34 _!56N 34'**,66JJ*4*:*:*<&
 -DE
 += >>K && && ++ 7r,   c                    Uu  pVU R                   R                  R                  [        R                  " 5       5      n[        UR                  5       5       H.  u  nu  pU H   nXuU   ;   a  XU-      USU	-   U-   '   XU-   	 M"     M0     g )Nr  )rW   r   r/   r   rV   r   r   )rd   ra  rg  ry  rv  group_rank_mappingsize_mappingcur_rankrl   rx  r   pow_acc_suffixs               r*   rt  2ShardingOptimizerStage1._remove_pow_acc_opt_paramsT  s     ,7(''--33DMMOD-6z7G7G7I-J)C)*"1#66"#>? x*4~EF N:; #2 .Kr,   c           	      .   Uu  pVU R                   R                  R                  [        R                  " 5       5      nU GH  n/ n	[        UR                  5       5       GH.  u  n
u  pXU-      n/ n[        R                  " UUR                  5       R                  5       U R                  S9  UR                  U R                     R                  5       n[        R                  " XS9nUR!                  S/5      nU	R#                  U5        UR$                  S   US   :  aB  U	R#                  [        R&                  " US   UR$                  S   -
  /UR(                  S95        AA[        R*                  R,                  R/                  5         GM1     [        R                  " U	SS9nA	[        R*                  R,                  R/                  5         Sn[        UR                  5       5       H  u  n
u  pXuU
   ;   a  XU-      nUn[        XZ   5       H  u  nnUU:X  a    OUXj   U   -  nM     UUUUS   -   US   -
   n[1        [3        UR4                  R$                  5      5       Vs/ s H  n[        R6                  " 5       PM     nn[9        UUR4                  U5      nUUS	U-   U-   '   UUS   -  nXU-   	 [        R*                  R,                  R/                  5         M     A[        R*                  R,                  R/                  5         GM     g s  snf )
Nrh   axisr   r   r  )r   r   r  r  )rW   r   r/   r   rV   r   r   r   _local_value
contiguousrP   r   r_   r   rH   concatviewr1   r   zerosr   rZ  r[  r\  r   r  r   	Replicater   )rd   ra  rg  ry  param_suffixsr}  r~  r  param_suffixopt_param_listrl   rx  r   	opt_param
param_listparam_sharding_axisglobal_opt_paramfused_opt_paramparam_indexcur_rank_start_indexr;   rank_idshard_opt_paramr   shard_opt_param_placementss                            r*   rs  -ShardingOptimizerStage1._re_slicing_opt_paramb  s    ,7(''--33DMMOD)LN1::;K;K;M1N--j&L'@A	
**,779..
 '0&:&:'''') $ $*==$  $4#8#8"#> %%&67 $))!,z-/HH")) *= 9"2"8"8";!< #3"8"8  0""..0= 2OB %mmNCOMM**, K1::;K;K;M1N--j#66 !++D EI+6(&/0B0G&H
7"h.!,0A!0DD, 'I
 '6,/C$[102$]304'O "'s9+A+A+G+G'H!I2!IA (!I / 2 ':'!..2'O ( x*4|CD z-88L89""..0? 2OD  MM**,] *v2s   'Lc                 .   / nUR                  5        HJ  u  pgSU-   U-   nX;  a  M  X;  a  M  UR                  X   R                  5       R                  5       5        ML     [	        U5      S:X  a  g [
        R                  " USS9n	/ n
[        R                  " XU R                  S9  U
 Vs/ s H  oR                  5       PM     n
n[
        R                  " U
SS9n	UR                  5        H/  u  pgSU-   U-   nX;  a  M  X;  a  M  X   R                  5       nX	 M1     [
        R                  R                  R                  5         SnUR                  5        GH  u  pgSU-   U-   n[        R                  " US   5      nU R                   bi  US   U R"                     n[%        U[        R&                  5      (       a8  UR)                  5       nUU==   U R*                  -  ss'   [-        UU   5      UU'   [/        [0        R2                  US5      nU	XU-    nUR5                  U5      nUS   n[7        UU R8                  US   5      n[;        S 5      /[	        UR<                  5      -  nU R>                  R@                  RC                  [        RD                  " 5       5      nUU R8                     R)                  5       nUU RF                  -  UR<                  U   -  nUUR<                  U   U RF                  -  -   n[;        [-        U5      [-        U5      5      nUUU'   U[I        U5         n[K        UR                  5       UUUR<                  5      nUXU-   '   US	   nUU-  nGM     [
        R                  R                  R                  5         g s  snf )
Nr  r   r  r  r   r   r   r   r  )&r   r1   r  cloner  rH   r  r   r   rP   cpurZ  r[  r\  r    r!   rX   ra   r   r   r   rb   r  r   operatormulreshaper+   r_   slicer   rW   r   r/   rV   r`   tupler   )rd   ra  rg  opt_param_names
opt_suffixr  rx  r   opt_param_namer  fused_opt_param_listrL  local_tensorr  global_shapemp_placementparam_tensor_parallel_axisglobal_sizeglobal_paramr  opt_param_meshopt_param_placementsshard_indexr   r  shard_slice_start_idxshard_slice_end_idxshard_slicer  s                                r*   _all_gather_opt_params.ShardingOptimizerStage1._all_gather_opt_params  s   
 &0&6&6&8"J%
2Z?N/4!!*779??A '9 ~!# --Q?! 9M9M	
 8LL7Kt
7KL --(<1E&0&6&6&8"J%
2Z?N/4%5BBDL* '9 	&&(&0&6&6&8"J%
2Z?N==G)<=L~~)),78J8JKlDJJ771=1E1E1G. !;<O<?B$%?@@L!;< !|Q?K*K7L +22<@O'7N#>!4!4j6N$ 
 !;-#o.C.C*DDK''--33DMMODD"6###gi  
 t,,,%%&9:%;! &!''(;<''((    )*C0C,DK 0;K+,-eK.@AO1$$&$%%	O 3BJJ./$]3K;&Ks '9v 	&&(W  Ms   +Nc                 4   [        U5      S:X  a  g 0 nU H3  nUR                  S5      S   nXd;  a  / XF'   XF   R                  U5        M5     [        [	        UR                  5       5      5      nUR                  5        H  u  pgU R                  XXv5        M     g Nr   ro  r   )r  rq  r1   dictrU   r   r  )rd   ra  rg  rc  momentsr   moment_suffixmoment_namess           r*   r_  5ShardingOptimizerStage1._all_gather_moment_opt_params  s     %&!+*D JJw/3M+)+&"))$/	 + vgmmo./+2==?'M'' ,;r,   c                 z    [        U5      S:X  a  g US   R                  S5      S   nU R                  UUUU5        g r  )r  rq  r  )rd   ra  rg  rb  master_suffixs        r*   r^  5ShardingOptimizerStage1._all_gather_master_opt_params2  sI     %&!+.q177@D##"		
r,   c                 D   [        U5      S:X  a  g / nU H(  nUR                  S5      S   nUR                  U5        M*     [        [	        U5      5      nSnUR                  5        H  u  p[        XyS   5      nM     U R                  X'5      u  pU R                  R                  R                  [        R                  " 5       5      n[        UR                  5       5       GH5  u  nu  pX   S   nU GH  nSU-   U-   nX:X  ae  X   nUR                  5       n[        R                  " UU R                  R                  U   U R                   S9  UXU-   '   UR#                  U5        Mv  U	S   n[%        [        UR&                  5      5       Vs/ s H  n[        R(                  " 5       PM     nn[*        R,                  " S/5      n[        R                  " UU R                  R                  U   U R                   S9  [/        UUU5      nUXU-   '   GM!     GM8     g s  snf )	Nr   ro  r   r   r  )srcrh   r   r   )r  rq  r1   rU   rZ   r   r.  rr  rW   r   r/   r   rV   r   r  	broadcastrP   r   r   r   r  rH   r  r   )rd   ra  rg  rd  rv  r   r  r   rx  r   r}  r   r  rl   	root_rankpow_acc_namepow_acc_tensorpow_acc_local_tensortmp_meshtmp_placementstmp_datas                        r*   r`  5ShardingOptimizerStage1._broadcast_pow_acc_opt_params@  s    &'1,+D!ZZ04N"">2 , !_!56
&0&6&6&8"JZK)@AJ '9
 !% D D!
 ''--33DMMOD-6z7G7G7I-J)C)**/2I"1'*4~E(%/%=N+9+F+F+H(NN, 0066yA"22
 ?MJN:;NN<0).9H27HNN8K2L&2LQ(2L # &  &||QC0HNN  0066yA"22
 &9 (N&N ?MJN:;7 #2 .K &s   Hc                    U Vs/ s H  n/ PM     nnU Vs/ s H  n/ PM     nnSnSn[        UR                  5       5       H  u  pU	S   n
U
S:  d  M  X&-
  nX::  a-  XH   R                  U5        XX   R                  U
5        Xj-  nSn
O;US:  a.  XH   R                  U5        XX   R                  U5        X-  n
Xk-  nUS-  nSnU
S:  a  My  M     XE4$ s  snf s  snf )Nr   r  r   )r   valuesr1   )rd   rg  r   r   group_mappingr~  current_sizecurrent_bucket_indexrl   r   tensor_sizeavailable_spaces               r*   rr  7ShardingOptimizerStage1._bucket_tensors_with_group_sizev  s   %/0ZZ0$./JqJ/ ():):)<=OC$]3K/",";1!&--.BC %,,[9 /L"#K '*%*112FG$)00A#6$7(A-(#$L# /  >, **7 1/s
   CCc                     [         R                  " 5       n[        UR                  5       5      nU HG  nX   nUR	                  5       (       d  M  SU;  a  M&  SU;   d  SU;   d  SU;   a  U SU 3nX   X'   X	 MI     g )Nr  rV  rW  rX  _rank)r   rV   listrp  rY  )rd   ra  r  tensor_namesr   rf  	rank_names          r*   (convert_state_dict_with_rank_unique_name@ShardingOptimizerStage1.convert_state_dict_with_rank_unique_name  s    ==?JOO-. D%F>>##t#D J$$6)t:K#fE(4	(2(8
%  !r,   c                     [        UR                  5       5      n[        UR                  5       5       H'  nSU;   d  M  UR                  S5      S   nX   X'   X	 M)     g )Nr  r   )r  rp  rq  )rd   ra  r  r   no_rank_names        r*   #convert_state_dict_with_origin_name;ShardingOptimizerStage1.convert_state_dict_with_origin_name  sT    JOO-.*+D$#zz'215+5+;
($	 ,r,   )rP   rb   rX   ra   r   rM   r_   r`   rW   rO   rN   r[   )NN)r   )rP  
__module____qualname____firstlineno____doc__ro   r   r   r   r   r   rG  rM  rS  r   rh  r]  rz  rt  rs  r  r_  r^  r`  rr  r  r  __static_attributes__ r,   r*   r?   r?   _   s    
< |U.n
',.-@^I
X DE2
!5 Y*- *-X	2 Y% %N<T-l Yd) d)L&
4Ml+<!"%r,   r?   rD  )/r    r  collectionsr   	functoolsr   	itertoolsr   numpyr   rH   paddle.distributedrz   r   r   paddle.autogradr   paddle.base.libpaddler   r	   5paddle.distributed.auto_parallel.static.process_groupr
   Jpaddle.distributed.auto_parallel.static.reshard_funcs.nd_mesh_reshard_funcr   -paddle.distributed.auto_parallel.static.utilsr   /paddle.distributed.fleet.meta_optimizers.commonr   3paddle.distributed.fleet.utils.tensor_fusion_helperr   r   $paddle.distributed.passes.pass_utilsr   paddle.frameworkr   rw   paddle.optimizerr   	moe_utilsr   &static.reshard_funcs.base_reshard_funcr   rf   r   r+   r=   r?   r  r,   r*   <module>r     sp      #     !  # % $ E B H D & * P 6&L%i L%r,   