
    Αi5                       S SK r S SKJr  S SKrS SKJr  S SKJrJ	r	J
r
  S SKJr  S SKJrJrJrJrJrJrJrJrJrJrJr  S SKJr  S SKJr  S SKJrJr  S S	K J!r!  S
SK"J#r#  S
SK$J%r%J&r&  S
SK'J(r(  \RR                  RT                  r*\RR                  RW                  5       r,/ SQr-/ SQr.\" \ R^                  5      r0\Rb                  Rd                  Rf                  r4Sr5S r6\&" S5       " S S\%5      5       r7S r8S r9\*Rt                  4S jr;S r<S r=S r>S r?S r@\Rb                  Rd                  R                  \44S jrBS rCS rDS  rES! rF S,S" jrGS# rHS$ rIS-S% jrJS& rKS' rL " S( S)5      rM " S* S+5      rNg).    N)reduce)ParallelModeis_data_parallel_reduce_opis_parameter_related)new_process_group)_get_comm_group
get_loggerget_var_numelinsert_dependencies_for_varsis_backward_opis_dep_skip_opis_forward_opis_optimize_op*naive_set_dist_op_attr_for_program_by_mesh6naive_set_dist_op_attr_for_program_by_mesh_and_mappingset_var_dist_attr)get_var_size)core)default_main_programdefault_startup_program)unique_name   )_is_master_grad_cast_op)PassBaseregister_pass)AutoParallelStreamType)create_py_readercreate_double_buffer_readerreadslicesplitassignsend_v2)
adamadamaxadamwdecayed_adagradmomentumdgc_momentumlars_momentummerged_momentumlambsgdfloat16c                     U R                   R                  S5      =(       a    SU R                   R                  S5      ;   $ )Nop_namescopez/auto_parallel/reshard)deschas_attrattrops    p/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/passes/auto_parallel_sharding.py_is_reshard_opr7   Q   s8    77 C
"bggll>&B
BC    auto_parallel_shardingc                      ^  \ rS rSrU 4S jrS rS rS rS rS r	S r
S	 rS
 rS rS rS rS rS rS rS rS rS rS rS rS rS rS rSrU =r$ )ShardingPass[   c                   > [         TU ]  5         U R                  SS 5        U R                  SS 5        U R                  SS 5        U R                  SS 5        U R                  SS 5        U R                  SS 5        U R                  SS 5        U R                  SS 5        U R                  S	S 5        U R                  S
S 5        U R                  SS 5        U R                  S/ 5        U R                  SS5        U R                  SS5        U R                  SS5        [        5       U l        / U l        0 U l        SU l        S U l        / U l	        g )Ndist_contextstagesharding_degreedegreeenable_overlapparam_comm_stream_numgrad_comm_stream_numparam_bucket_size_numelgrad_bucket_size_numelpartition_algorenable_hierarchical_commparams_gradsglobal_rank	amp_dtyper.   gradient_sync_after_accumulateF)
super__init__set_attrset	dp_groupssharding_infosvarname_to_sharding_infosharding_hybrid_dpouter_dp_groupshared_params_grads)self	__class__s    r6   rO   ShardingPass.__init__]   s   nd+gt$'.h%&--t4,d3/6.5'.0$7nb)mR(k9-6> (*%"'"#% r8   c                 H   U R                  S5      c  gU R                  S5      S;  a  gU R                  S5      b;  [        U R                  S5      [        5      (       a  U R                  S5      S::  a  gONU R                  S5      b;  [        U R                  S5      [        5      (       a  U R                  S5      S::  a  gOg[        U R                  S5      5      S	::  a  g[        U R                  S
5      [        5      (       a  U R                  S
5      S	:  a  gU R                  S5      c  gU R                  S5      c  gU R                  S5      c  gU R                  S5      c  gU R                  S5      c  gU R                  S5      c  gU R                  S5      c  gg)Nr>   Fr?   )r         r@   r   rA   rI   r   rJ   rB   rC   rD   rE   rF   rG   rH   T)get_attr
isinstanceintlenrX   s    r6   _check_selfShardingPass._check_selfu   s   ==(0==!2==*+7t}}->?EE01Q6 7]]8$0t}}X6<<BB B
 t}}^,-24==7==$--C
C ==)*2==019==/08==23;==12:==*+3==34<r8   c                     g)NT )rX   
other_passs     r6   _check_conflictShardingPass._check_conflict   s    r8   c                    U R                  S5      U l        [        U R                  S5      =(       d    U R                  S5      5      U l        [        U R                  S5      5      U l        [        U R                  S5      5      U l        U R                  S5      U l        [        U R                  S5      5      U l        [        U R                  S5      5      U l        U R                  S	5      U l	        U R                  S
:  d  U R                  S
:  a  U R                  (       d   S5       e[        U R                  S5      5      U l
        [        U R                  S5      5      U l        U R                  S5      U l        U R                  S5      nUR                  5       UR                  5       peU R                  S5      U l        U R                  S:X  a&  [        R                   R"                  R$                  nSnU R'                  XT5        UR(                   H6  n	U R+                  X5        U R-                  U	5        U R/                  X5        M8     UR1                  SU R2                  5        U R5                  X5        g )Nr>   r@   rA   r?   rJ   rB   rC   rD   rH   r   z3multiple comm stream need enable_overlap to be TruerE   rF   rG   rI   rL   bfloat16)r^   _dist_contextr`   sharding_world_sizer?   rJ   rB   rC   rD   rH   rE   rF   rG   global_blockrL   r   VarDescVarTypeBF16_build_sharding_groupsblocks_shard_optimizer_shard_gradient_synchronization_shard_parameterrP   rW   _optimization_pass)
rX   main_programstartup_programcontextrI   
main_blockstartup_block__amp_target_dtype____amp_target_dtype_name__blocks
             r6   _apply_single_implShardingPass._apply_single_impl   s   !]]>:#&MM+,Gh0G$
  w/0
t}}];<"mm,<=%(7N)O%P"$'6L(M$N!(,&)
% %%)T-F-F-J&& E& (+MM34(
$ '*MM23'
#  $}}->?}}^4%%'((* "
 {3>>Z'#'<<#7#7#<#< (2% 	##J=!((E!!%7007!!%7 )
 	)A)AB>r8   c                 H    U R                  U5        U R                  X5        g N) _collective_data_parallel_groups_build_sharding_infos)rX   r{   rI   s      r6   rr   #ShardingPass._build_sharding_groups   s    --j9"":<r8   c                    UR                    H|  n[        U5      (       a  UR                  [        ;   a  M)  [	        U5      (       a  M;  [        U R                  X R                  S5      nUc  Ma  U R                  R                  U5        M~     [        U R                  5      S:w  a"  [        S[        U R                  5       S35      eg )Nr   r   zSSo far Only and Exactly one data parallel group in network are supported, but got [z ] different data parallel groups)opsr   type	_skip_opsr7   +_inference_data_parallel_group_for_operatorrJ   rl   rR   addra   NotImplementedError)rX   r{   r5   groups       r6   r   -ShardingPass._collective_data_parallel_groups   s    ..B $$9(< b!!?  "&8&8!E  ""5) ! t~~!#%efijnjxjxfyez  {[  \  $r8   c                    [        XU R                  5      nU R                   GH  nUR                  U R                  :  d"   SU R                   SUR                   S35       eUR                  U R                  -  S:X  d"   SU R                   SUR                   S35       eU R
                  UR                  ;   d"   SU R
                   SUR                   S35       e[        U5      U R                  :  d!   S[        U5       S	U R                   S
35       eUR                  U R                  :  a  SU l        U R                  S:  d   eU R                  S:  d   e[        U R                  5      S:X  d   S5       e[        UR                  U R                  U R
                  5      u  pE[        U5      n[        U5      U l        OUnXPR                  l        [        UU R
                  UU R                   5      nU R"                  R%                  U5        UR&                   H  nX`R(                  UR*                  '   M     GM      g )Nzsharding world size [z(] should not larger than dp world size []r   z(] should be divisible by dp world size [zcurrent ranks [z.] does NOT belong to the data parallel group [znumber of parameters [z#] is not enough to be shard among [z] ranksTr\   r   zthybrid sharding and data parallelism are supported only when there is exactly one data parallel group in the network)re_order_programrl   rR   nranksrm   rJ   ranksra   rU   rC   rD   _get_dp_and_sharding_groupsr   rV   _sharding_groupShardingInforG   rS   appendparamsrT   name)rX   r{   rI   dp_grouprV   sharding_groupsharding_infoparams           r6   r   "ShardingPass._build_sharding_infos   sQ   'd&8&8

 H??d&>&>> '(@(@'AAijrjyjyizz{|> ??T%=%==B '(@(@'AAijrjyjyizz{|B ##x~~5 !$"2"2!33abjbpbpaqqrs5 |$(@(@@ (\):(;;^_c_w_w^xx  A@
 !9!99*.'11A5550014444>>*a/  K/ 2MNND$<$<d>N>N2. "3>!B&7&G#!)1?.(  $$	M &&}5&--<I--ejj9 .O 'r8   c                     U R                  U5        U R                  U5        U R                  X5        U R                  X5        g)z
sharding all optimizer related ops and vars, include:
gradient clip ops & vars
weight decay ops & vars
optimizer ops and states
N)_shard_amp_related_op_and_vars_shard_weight_decay_shard_optimizer_ops_and_states_insert_optimizer_broadcasts)rX   r{   r|   s      r6   rt   ShardingPass._shard_optimizer  s<     	++J7  ,,,ZG))*Dr8   c                    U R                   S:  a  g [        [        [        UR                  5      5      5       GH+  u  p#[        X5      (       am  U R                   S:  a]  UR                  S   nUS UR                  S5       nU R                  U5      (       d"  UR                  USS9  UR                  USS9  M  M  UR                  S;   d  M  / nUR                  R                  S5       H@  nUS UR                  S5       nU R                  U5      (       d  M/  UR                  U5        MB     U(       a;  UR                  R                  SU5        UR                  R!                  SU5        GM8  UR                  S	:X  a  UR#                  S
5      nUR                  S   n	UR$                  U	   n
UR                  USS9  UR'                  USSU
0SU
R(                  SU
R*                  SS[,        U0S9  U R.                  R1                  U
5      n[3        UR                  U   UR4                  UR6                  U R.                  UR8                  S9  GM  UR                  USS9  GM.     UR;                  5         g )Nr   r   @Fsync)check_finite_and_unscaleupdate_loss_scalingXOutr   op_rolefill_constantshapedtypevaluer   outputsattrschunk_id)r?   reversedlist	enumerater   _is_param_grad_fp32_cast_opoutput_arg_namesfind_is_parameter_in_local_shard
_remove_op_remove_varr   r1   inputr   	set_input
set_outputr3   vars_insert_op_without_syncr   r   OP_ROLE_KEYrl    get_tensor_dist_attr_for_programr   process_meshdims_mappingr   _sync_with_cpp)rX   r{   idxr5   output_name
param_name
reversed_x
input_namer   out_nameout_var	dist_attrs               r6   r   +ShardingPass._shard_amp_related_op_and_vars,  sC   ::>Yz~~%> ?@GC*:::tzzA~ 11!4()@;+;+;C+@A
88DD))#E):**;U*C E
 OO
"$''--"4J!+,Bjooc.B!CJ88DD"))*5	 #5 GG%%c:6GG&&uj9ww"<<"$'')"4#%#6#6q#9",//(";"--c->"::!0%*G$4 ' ' ' +W	#	 ; 
 !..OO ' " O&NN3/%22%22 ..%.%7%7 #--c->o Ar 	!!#r8   c                    U R                   S:  a  g / SQn[        5       n[        5       n[        [        UR                  5      5       H  u  pV[        U5      (       d  M  UR                  U;   d  M)  UR                  S5      S   nUS UR                  S5       nU R                  U5      (       a  Mi  UR                  U5        UR                  S;   d  M  UR                   H  n	UR                  U	5        M     M     [        [        [        UR                  5      5      5       H.  u  pV[        U5      (       d  M  XS;   d  M  UR                  USS9  M0     U H  n
UR                  U
SS9  M     [        [        UR                  5      5       GH7  u  pV[        U5      (       d  M  UR                  S	:X  d  M*  / nUR                   H  nXt;  d  M
  UR!                  U5        M     UR"                  R%                  SU5        UR                  S   n[        U R&                  5       H  u  pUR)                  X]-   S
-   SSU/0SU/0SUR*                  R,                  SSS[.        R0                  R2                  R4                  [6        [8        R:                  0S9nU R<                  R?                  URA                  U5      5      nM       O   URC                  5         g )Nr\   )elementwise_mulsquared_l2_normclip_by_normr   r   @GRAD)r   r   Fr   sumr   
all_reducexoutring_idr0   z /gradient_clip_model_parallelismreduce_typer   inputsr   r   )"r?   rQ   r   r   r   _is_gradient_clip_opr   r   r   r   r   r   r   r   r   input_arg_namesr   r1   r   rS   
_insert_opr   idpaddledistributedReduceOpSUMr   OpRoleOptimizerl   r   varr   )rX   r{   removed_op_typeremoved_op_idxremoved_tmp_varr   r5   r   r   r   varnamereserved_varssum_op_outputir   new_opr   s                    r6   _shard_gradient_clip!ShardingPass._shard_gradient_clipk  s   ::> Q%Ijnn56GC'++ww/)XXc]1-
'(B*//'*BC
88DD"&&s+ww"EE+-+>+>K+//< ,? 7  Yz~~%> ?@GC'++$%%c%6	 A 'G""7"7 ' Ijnn56GC'++ww% ""$"4"4J!8%,,Z8 #5 !!#}5 " 3 3A 6(1$2E2E(F$A'22!) #m_5!& 8%}':':'='=*,N)6+=+=+F+F+J+J'	 3 F **KK&NN=9  )G, C 7F 	!!#r8   c                     U R                   S:  a  g [        [        [        UR                  5      5      5       H   u  p#[        U5      (       d  M  [        S5      e   UR                  5         g )Nr\   z$weight decay is NOT supported by now)r?   r   r   r   r   _is_weight_decay_opr   r   )rX   r{   r   r5   s       r6   r    ShardingPass._shard_weight_decay  s\    ::>Yz~~%> ?@GC&r**): 	 A 	!!#r8   c                 &   / n[        [        [        UR                  5      5      5       H  u  pE[	        U5      (       d    OUR
                  [        ;   d  M-  SUR                  ;   d   e[        UR                  S5      5      S:X  d   eUR                  S5      S   nU R                  U5      (       dE  UR                  UR                   Vs/ s H  nXv:w  d  M
  UPM     sn5        UR                  USS9  M  U R                  R                  U R!                  U5      5        M     [        [        [        UR                  5      5      5       HE  u  pE[        UR                  5      S:X  d  M   UR                  S   U;   d  M5  UR                  USS9  MG     U HQ  nUR#                  U5      (       a  UR%                  USS9  UR#                  U5      (       d  MA  UR%                  USS9  MS     UR'                  5         UR'                  5         g s  snf )NParamr   r   Fr   )r   r   r   r   r   r   _supported_optimizer_typeinput_namesra   r   r   extendr   r   rW   r   _get_param_gradhas_varr   r   )rX   r{   r|   should_removed_optimizer_statesr   r5   r   r   s           r6   r   ,ShardingPass._shard_optimizer_ops_and_states  s   *,'Yz~~%> ?@GC!"%%ww33"..000288G,-222XXg.q1
88DD3:: ,.+>+>+>&4 $+> ))#E):,,33,,Z8% A,  Y}/@/@%A BCGCB''(A-''*.MM((5(9 D 7G!!'**&&wU&;$$W--))')>	 7 	!!#$$&5s   	H
H
c                    U R                   S:  d  U R                  S:  a  g U R                   GH0  nUR                   GH  nUR	                  UR
                  5      (       d   eUR	                  UR
                  5      (       d   eUR                  SSU0SU0SUR                  R                  SUR                  UR
                  5      [        [        R                  0S9nUR                  S	S
[        R                  -   5        U R                   R#                  U5      nUc   e[%        UUR&                  UR(                  U R                   UR*                  S9  GM     GM3     UR-                  5         g )Nr\   r   	broadcastr   r   r   rootr   r0   /r   )r?   rE   rS   r   r   r   	append_opr   r   get_var_rankr   r   r   	_set_attrr   DataParallelrl   r   r   r   r   r   r   )rX   r{   r|   r   r   r   param_dist_attrs          r6   r   )ShardingPass._insert_optimizer_broadcasts  sR   ::>T99A=!00M&--!))%**5555$,,UZZ8888#--$<"EN!=#6#6#9#9 : :5:: F#V__	 . 	   "C,*C*C$C &&GGN   '222F#00#00&&,55+ . 1: 	!!#r8   c                 d    XR                   ;   d   eU R                   U   nUR                  U5      $ r   )rT   is_in_local_shard)rX   r   r   s      r6   r   )ShardingPass._is_parameter_in_local_shard  s5    :::::55jA..z::r8   c                 r    XR                   ;   d   eU R                   U   nUR                  U5      nUc   eU$ r   )rT   get_param_grad)rX   r   r   p_gs       r6   r   ShardingPass._get_param_grad  sC    :::::55jA**:6
r8   c                 (   U R                   S:  a  g U R                   Vs/ s H  o"R                  PM     nn[        [	        [        UR                  5      5      5       GH  u  pE[        XQ5      (       Ga  UR                  S:X  a-  UR                  S5      [        R                  R                  :X  d=  UR                  S:X  aJ  UR                  S5      [        R                  R                  :X  a  Sn[        R                  R                  nOSn[        R                  R                  nUR                  S   n[        U5      n	U R                   U	   n
[#        UUUUU
R$                  R                  U
R'                  U	5      U R(                  U5      nU R*                  (       a  U
R-                  U	5      (       d  UR/                  US-   SS9  OIUR1                  S	U R2                  R                  5        UR1                  S
S[4        R6                  -   5        [9        XQ5      (       d  GM  UR:                  S   n[        U5      n	U R                   U	   n
U
R-                  U	5      (       a  GM  UR/                  USS9  GM     UR=                  5         g s  snf )Nr\   r   r   r   r   r   Fr   r   r0   r  )r?   rR   r   r   r   r   r   _is_param_grad_allreduce_opr   r3   distr   r   AVGr   _get_base_name_from_grad_namerT   _insert_reduce_opr   r  rl   rU   r  r   r  rV   r   r	  _is_param_grad_sum_opr   r   )rX   r{   r   dp_ring_idsr   r5   reduce_op_typer   r   	base_namer   	reduce_opr   s                r6   ru   ,ShardingPass._shard_gradient_synchronization  s   ::>-1^^<^Exx^<Yz~~%> ?@GC*2::GG|+.$--2C2CCGGx'.$--2C2CC%-N"&--"3"3K%-N"&--"3"3K//2
9*E	 $ = =i H-"!''**!..y9&&		 //(::9EE))#')>LLD,?,?,B,BCLL&l.G.G(G %R44..q19(C	 $ = =i H$66yAA))#E):_ Ab 	!!#e =s   Jc                 

   U R                   S:  a  g U R                   Vs/ s H  o3R                  PM     nnU R                   GH  nUR	                  U5      u  nn/ nU H?  n	Xy   S:X  d  M  UR                  U	5      UR                  :w  d  M.  UR                  U	5        MA     [        [        [        UR                  5      5      5       GH  u  p[        U5      (       a  M  UR                   GH  n[        XUR                  5      (       a)  U R!                  U5      (       d  UR                  U5        MH  X;  a  MO  UR                  U5      nXR                  :X  a  UnO["        R$                  " US-   5      nUR'                  U5      nUR)                  UUR*                  UR,                  SS9nU R.                  R1                  U5      n[3        U R.                  UUR4                  UR6                  UR8                  S9  U R.                  R;                  U5      nUR=                  U5      nUR?                  X5        URA                  UU5        [C        UU
UUR                  UURD                  R                  URG                  S5      U R.                  5        GM     GM     [        [        [        UR                  5      5      5       H\  u  pURH                  S:w  a  M  UR                  S   nURJ                  S   nX;   d  M<  URM                  U
SS	9  URO                  USS	9  M^     [        [        [        UR                  5      5      5       GH<  u  p[Q        URJ                  5      S
:X  d   eURJ                  S   nURH                  S:X  a  URG                  S5      U;   ah  U RR                  (       aF  UR                  U5      UR                  :X  a'  URU                  SU RR                  R                  5        O@URM                  U
SS	9  O/UR                  U5      UR                  :w  a  URM                  U
SS	9  M  URH                  S:w  d  GM  UU;   d  GM
  UR                  U5      UR                  :w  d  GM,  URM                  U
SS	9  GM?     U HD  n	UR                  U	5      UR                  :w  d  M$  URO                  U	SS	9  URO                  U	SS	9  MF     GM     URW                  5         URW                  5         g s  snf )Nr]   r   z
@BroadCastF)r   r   r   persistabler   r   castr   r   r  r   ),r?   rR   r   rS   "get_broadcast_vars_and_param_usager  
local_rankr   r   r   r   r   r   r   _is_param_fp16_cast_opparam_namesr   r   generater   
create_varr   r   rl   r   r   r   r   r   get_op_dist_attr_for_programget_input_dist_attr_rename_inputset_input_dist_attr_insert_init_and_broadcast_opr   r3   r   r   r   r   ra   rV   r  r   )rX   r{   r|   r   r  r   need_broadcast_varsparam_usagenot_used_param_namer   r   r5   r   	root_rankbroadcast_varname	input_varnew_varref_dist_attrop_dist_attrinput_dist_attrr   s                        r6   rv   ShardingPass._shard_parameterK  s   ::>-1^^<^Exx^<!00M @@L#"$)
+q0%22:>$//0 (..z: * $D:>>)B$CD!"%%"$"4"4J ."(A(A   $@@LL/66zB !<  - : :: FI $<$<<,6),7,@,@&5-) %/NN:$>	","7"7!2"+//"+//(-	 #8 # !..OO ) &
 * ..#)66)66%2%;%; !..KKBO % +7*J*J&+ ((G$88- 2")%00!%++..	***	g #5	 ED $D:>>)B$CD77f$//2
 11!44))#E):**;U*C E $D=3D3D)E$FG2../1444 11!477k)wwy)[8 // - : :; G,77!8 LLD4G4G4J4JK)44Su4E *66{C,778 *44Su4E GG{*#{2%22;?$//0 ",,Su,=9 H< *
!..z:$//0 **:E*B!--ju-E *o 1~ 	!!#$$&C =s   T c                    U R                   S::  a  g SU l        SU l        SU l        [	        U R
                  5      S:X  d   S[	        U R
                  5       S35       eU R
                  S   n[        R                  R                  X5         U R                  U5        U R                  S:  aC  U R                   S:X  a  U R                  U5        O!U R                   S	:X  a  U R                  U5        S S S 5        g ! , (       d  f       g = f)
Nr   sharding_coalesce_grad_sharding_coalesce_param_rK   zZgradient synchronization optimization only support one sharding group right now, but got [z].r   r\   r]   )r?   grad_coalesce_prefixparam_coalesce_prefixcomm_op_scheduling_priorityra   rS   r   staticprogram_guard_gradient_sync_optimizationrE   &_fuse_overlap_parameter_comm_stage_two(_fuse_overlap_parameter_comm_stage_three)rX   rx   ry   r   s       r6   rw   ShardingPass._optimization_pass  s    ::?$=!%?"+-( 4&&'1, 	
hilmq  nA  nA  jB  iC  CE  F	
, ++A.]]((G,,]; ++a/::???NZZ1_AA-P HGGs   A%C::
Dc                     U R                   S::  a  U R                  (       d  g [        5       R                  5       n[	        5       R                  5       nU R                  UU5      u  pEU R                  UUUU5        g )Nr   )rF   rB   r   rn   r   _group_grads_overlap_grad_comm)rX   r   r{   r|   coalesce_to_group_mapgrad_name_to_group_maps         r6   r@  (ShardingPass._gradient_sync_optimization  ss    &&!+T5H5H)+88:
/1>>@8<8I8I9
5 	!"		
r8   c                 R   [        5       R                  5       n[        5       R                  5       n[        XR                  5      u  pE[
        R                  S5        [
        R                  SU R                   S[        UR                  5       5       S[        UR                  5       5       S35        0 nU R                  (       a  / U l
        UR                  R                  n[        U R                  5       HV  nUS:X  a  UR                  n	O
[        USS9n	U R                  R!                  U	["        R$                  R&                  S	.5        MX     [
        R                  S
U R                   S35        0 U l        [+        UR                  5       5       GH  u  p[        U
5      S:  d   e[        U
5      S:  a  [,        R.                  " U R0                  [3        U5      -   5      nUR5                  UU
R6                  SSS9  UR5                  UU
R6                  SSS9U
l        UR;                  SSU
R<                  0U
R<                  U
R8                  S.SSSSSU
R6                  [>        [@        RB                  0S9  OU
R<                  S   U
l        [
        R                  SU S[E        U
R<                   Vs/ s H  n[G        U5      PM     sn5       S35        [
        RI                  SU SU
R<                   Vs/ s H  oRJ                  PM     sn S35        U
XjR8                  RJ                  '   XR                  -  nU R                  U   S   nU R                  U   S   nUR;                  SSU
R8                  0SU
R8                  0SURL                  S U
RN                  [>        [@        RP                  0S9nXR(                  U'   URS                  S!S"[T        RV                  -   5        U R                  (       d  GMk  UURX                  l-        U R\                  URX                  l/        GM     0 n[+        UR`                  5       GH  u  nn[c        U5      (       d  M  URe                  S#5      S   nUR<                  U   nUU   n
S nU R                  (       a  URX                  RZ                  n[        UR                  5       5      U R                  :  a-  [g        U5      nUR<                  URe                  S$5      S      nOSUR`                  XR                  -
     n[c        U5      (       d   S%5       eUR<                  URe                  S#5      S      nUU/U/U4/UU'   [        U
R<                  5      S:  d  GM9  U
Rh                  (       a)  U
R<                  S&   nUU   R!                  UU/U/U45        UU   R!                  US-   U/U
R<                  U45        GM     [k        UR                  5       SS'9nU H  nUU   S S S&2    Hw  u  nnnn[m        UUUUU Rn                  [@        RP                  S&/S(S(S)S*9
nU R                  (       d  MF  Uc  MK  UURX                  l-        U R\                  URX                  l/        My     M     URq                  5         g s  snf s  snf )+NzSharding Stage2 Optimization:zParam Bucket size is [], [z] Parameters are fused into [z	] Bucketsr   Tforce_new_group
comm_groupcomm_streamz#Parameter Communication would use [z
] streams.r   r   r   r   stop_gradientcoalesce_tensorInputOutputFusedOutput	copy_data	use_alignr   r   zBucket[z] size [z]MB.z] parameters: .rO  rP  r  r   r   r   r  r0   r  r   ParamOutz:Unexpected: sharding broadcast pre op should be broadcast.rK   reverseFsharding_stage2_broadcast_depr   is_recomputer   r0   )9r   rn   r   group_paramrE   _loggerinfora   keysrB   param_comm_group_stream_pairsr   r   rangerC   r   r   r   SHARDING_STREAMr   op_to_stream_idxr   r   r&  r<  strr'  r   coalesce_varr  r   r   r   Forwardr   r   debugr   r   rankr   r  r   r	  r   execution_streamr=  scheduling_priorityr   is_sharding_param_broadcast_opoutput_get_broadcast_first_depend_opr  sortedr   rl   r   )rX   r   r{   r|   group_to_param_mapparam_to_group_mapbroadcast_var_to_group_mapr   r   r   param_groupcoalesce_var_namepcomm_stream_idxrO  rP  r   dep_mapr5   r1  broadcast_var	prior_varpre_op	last_gradindicer   
prior_vars	post_vars	depend_ops                                r6   rA  3ShardingPass._fuse_overlap_parameter_comm_stage_two  s   )+88:
/1>>@1<772
. 	45$T%A%A$B$sK]KbKbKdGeFf  gD  EH  I[  I`  I`  Ib  Ec  Dd  dm  n	
 &(" 24D.!''--E45566)//E-eTJE2299&+'='M'M'S'S 7 LL5d6P6P5QQ[\ %'D!'(:(?(?(ABNA{#q(((;!#$/$8$8..Q7%! ((*%++ $"&	 )  ,6+@+@*%++ $"&	 ,A ,( ''*#[%5%56"-"2"2'2'?'?
 $T#T!2!2#V^^	 (  ,7+;+;A+>(LL!HS;CSCS)TCSa,q/CS)T%U$VVZ[ MM!NK<L<L+M<LqFF<L+M*NaP
  ''?'?'D'DE
  "<"<<O;;OLJ <<_MK  )) [556 8 89z}}K,,	 * 	F -<!!&)^S<3L3L-LM"""4?  144   4G C\ z~~.EAr-b11$&IIe$4Q$7! *0A B89JK"&&"$,,"?"?K w||~&)C)CC7
CB *		*0Ea0H II'^^A0J0J,JKF9&AA TA !+e0DQ0G HI 9+LM
{''(1,"44$/$4$4R$8	
))m_kJ AJ%%Q1A1A;O= /F 5A;B1:dd;K7ZK8&&OO" "'!@	 &&&9+@;FI''888 '';# <L , 	!!#M *U ,Ns   9Z=Z$c                     g r   rf   )rX   r   s     r6   rB  5ShardingPass._fuse_overlap_parameter_comm_stage_three  s    r8   c                 
   UR                   nU R                  S:  a  SU l        SnU H  n[        U5      (       d  M  Un  O   Uc  gUR                  S   n[	        U R                  5      n/ n[        5       n	S n
SnU[        U5      :  Ga  X;   n[        U5      (       Gax  UR                  S:H  =(       aF    UR                  S5      [        R                  R                  [        R                  R                  4;   nU(       d   S5       eUR                  S   n[        U5      nUR                  U5      nUR!                  U5      nUR#                  UU5      (       a  X;  d   eUR%                  UU5        O8UR'                  U5        [	        U R                  5      nUR%                  UU5        [        UR(                  5      S:X  aZ  US-
  Ul        Sn[-        X;U-
     5      (       a  US-  n[-        X;U-
     5      (       a  M  X;U-
     nUR                  S   nUUl        U	R1                  U5        UR2                  R'                  U5        U R4                  (       a  UR7                  U5      (       a  S	Ul        X;S-      R                  S
:X  a=  X;S-      R                  S5      [8        R:                  R                  R                  4;   d   S5       eX;S-      R                  S   U:X  d   S5       eUR<                  R'                  US-   5        US-  nO3U
" XW5      (       a&  UR'                  U5        [	        U R                  5      nUS-  nU[        U5      :  a  GM  [        UR(                  5      S:  a  UR'                  U5        [>        RA                  S5        [>        RA                  SU R                   S[        U	5       S[        U5       S35        0 n0 n0 n0 n/ n[C        U5       GH  u  nn[        UR(                  5      S:  GaG  URE                  [F        RH                  " U RJ                  [M        U5      -   5      URN                  SS	S9Ul(        U RR                  RU                  UR(                  S   5      n[W        U RR                  URP                  URX                  URZ                  UR\                  S9  UUUR*                  '   UR2                  R_                  5       nUUU'   URa                  UR2                  5        UR6                  (       a:  UR<                  R_                  5       nUUU'   URa                  UR<                  5        OUR(                  S   Ul(        UR(                   H  nUUURb                  '   M     UUURP                  Rb                  '   GM     [        URe                  5       5      n[        URe                  5       5      n[        U5      n URg                  U5      n![        U!5      S:X  d   eURg                  U 5      n![        U!5      S:X  d   eURg                  U 5      n![        U!5      S:X  d   e[i        [k        [C        UR                   5      5      5       GHo  u  n"nU"U;   a  UU"   nUR                  S   nUUR(                  S   Rb                  :X  d%   SUR(                  S   Rb                   SU S35       eURm                  UURP                  Rb                  5        URo                  UURP                  Rb                  5        U"U;   a  URq                  U"SS9  U"U;   d  M  UU"   nUR(                  S   Rb                  n#U#UR                  ;   d   SU# SU S35       eUR(                   Vs/ s H  nURb                  PM     n$n/ n%/ n&UR(                   H:  n'U'Rr                  n(U%Ra                  U(5        U&R'                  [        U(5      5        M<     URu                  U"SSU$0U$URP                  S.SSSS	SURN                  S U%S!U&[v        [x        Rz                  0S"9n)U RR                  RU                  URP                  5      n[}        U)URZ                  URX                  U RR                  UR\                  S9  [        UU"UR!                  UR.                  5      URP                  U RR                  [x        Rz                  S/SSS#S$9
n*GMr     UR                  5         UU4$ s  snf )%a:  
conditions for gradients to be grouped:
    1. group size < grad_bucket_size_numel
    2. same dp group (TODO)
    3. same src rank
    4. same dtype
    5. dependency: grad would NOT be used by other ops within group segment

main logic:
    1. record coalesce group
    2. record all dp allreduce/reduce op idx

    3. insert coalesce op
    4. insert coalesce dependency (avoid allocate memory too early)
    5. modify and remove allreduce/reduce op
    6. ensure sharding-dp hybrid parallel logic

gradients inside same group would be fuse into one coalesce tensor
r   Nr   c                     [        U R                  U R                  -   5      nUR                   Vs1 s H  o3R                  iM     nn[        UR                  U5      5      S:  $ s  snf )Nr   )rQ   r   r   r   r   ra   intersection)r5   r   vars_r   	var_namess        r6   op_depend_on_group5ShardingPass._group_grads.<locals>.op_depend_on_group  sW    **R-@-@@AE-2ZZ8ZcZI8u)))4599 9s   A%r   r   zZSharding should reduce grad first and than allreduce if Hybrid Sharding with Data-Parallelr\   Tr   z@Hybrid Sharding with Data-Parallel should sync same gradient varz-Sharding Gradient Communication Optimization:zGradient Bucket size is [rK  z] Gradients are fused into [z
] Buckets.FrQ  r   rK   z$Unexpected: it is supposed to sync [z] but got [r   r   z-Unexpected: op is supposed to generate grad [rS  rT  rU  rX  rY  r   concated_shapesconcated_ranksr   sharding_grad_coalesce_depr_  )Ar   rF   r   r   VarGrouprQ   ra   r   r   r3   r  r   r  r   r  r  r   
acceptablecollectr   r   coalesce_op_idxr   coalesce_dep_varnamer   reduce_op_indicesrU   r  r   r   allreduce_op_indicesrb  rc  r   r'  r   r&  r;  ri  r   rj  rl   r   r   r   r   r   popr   r   rd  r  r   r   r*  _rename_outputr   r   r   r   r   Backwardr   r   r   )+rX   r   r   r   first_backward_opr5   first_backward_varname	cur_groupgrad_groupsgrouped_grad_namesr  r   	is_reduce	grad_namer   rm  grad_varjdep_opdep_varnamerH  rG  modify_reduce_op_mapcoalesce_op_mapremove_reduce_op_indicesr   r4  last_reduce_op_idxlast_allreduce_op_idxgradcoalesce_op_setmodify_op_setremove_op_setconflictr   first_grad_name
grad_namesr  r  grad_r   coalesce_opr  s+                                              r6   rE  ShardingPass._group_grads  s   0 ii&&* +,D' Bb!!$&! 
 $!2!C!CA!FT889	 U	: #c(lB)"--GGx/ BGGM4JMM%%MM%%O 5	 ! py //2	:9E
$11*= 99Y/''$77$>>>%%h5&&y1 ()D)D EI%%h5y~~&!+01AI- A(U44Q )U44 QZF"("9"9!"<K5@I2"&&y1++2215**}/N/N0 0 37I/1u:??l:sq5z%@ **3377@ 
 u  1u:66q9YF ZF 2299!a%@FA#B22""9-$T%@%@A	FAu #c(lx y~~!#y)DE'(C(C'DDM_I`Haa}  B  CN  O  ~P  PZ  [	

 "$ "!#% !+.HAu5::"%*%5%5$--11CF:  ++ %"& &6 &" &&GG

1 
 "&&&&!..!..*33 :? 5 56%*%<%<%@%@%B";@$%78(//0G0GH**,1,F,F,J,J,L)BG()>?,33E4N4NO%*ZZ]"

49&tyy1 #=B!%"4"4"9"9:E /H o22450557845"//>8}!!!"//>8}!!! --m<8}!!!  Yuyy%9 :;GC**,S1//2	 EJJrN$7$77 :5::b>;N;N:O{[dZeefg7   E,>,>,C,CD!!)U-?-?-D-DE..  5 1o%',"'**Q-"4"4&"*=*== COCTT_`b_ccde= 5:JJ?JDdiiJ
?"$!#"ZZE!KKE#**51"))#e*5 (
 $;;*#Z0",','9'9
 $U#T)?(.#V__ < & &&GG** 
 G!..!..&&*33 9IIe889&&&&OO" "'!=	w <R 	$&<<<q @s   d c                 j   U R                   (       d  g/ U l        UR                  R                  n[	        U R
                  5       HC  nUS:X  a  UR                  nO
[        USS9nSU 3nU R                  R                  UUS.5        ME     UR                  n	0 n
Sn0 n[        U	5       GH+  u  p[        U5      (       Ga  UR                  S:X  a:  UR                  S5      [        R                  R                  R                   4;   a  Ma  XR
                  -  nXU'   U R                  U   S	   nU R                  U   S
   nUR#                  S5      S   nUU   nUR$                  R&                  U:X  d   e[)        UR*                  5      S:  a  UUR*                  S   UR$                  USUR,                  4/X'   US-   nU R.                  (       a  UR0                  (       a  US-  nX   R                  UUR$                  UR*                  USUR,                  45        UUR,                  l        U R4                  UR,                  l        UR9                  SUR:                  5        U R.                  (       a  UR0                  (       a  XS-      nUR                  S:X  a8  UR                  S5      [        R                  R                  R                   4;   d   eUR#                  S5      S   U:X  d   eUUR,                  l        U R4                  UR,                  l        US-  nUR                  S:X  a  UR                  S5      [<        R                  R>                  :X  aj  UR0                  (       dY  U RA                  S5      (       dC  X;  a  / X'   X   R                  US-   UR$                  UR$                  SSUR,                  45        US-  nUS-  nGM.     [C        U
RE                  5       SS9nU H  nX   SSS2    H  u  nnnnnnUS:X  a  SOSn[G        UUUUU RH                  [J        RL                  S/SSUUS9nUc  ME  [O        UURP                  U RH                  URR                  S9  Uc  Mt  UUR,                  l        U R4                  UR,                  l        M     M     U RT                  (       Ga`  SnU RV                  U-  S:X  d   eUR                  nUR                  nU RX                  U-  n U RX                  U-  n!U V"s/ s H  n"U"U-  U :X  d  M  U"PM     n#n"[Z        R]                  S5        [Z        R]                  SU RX                   S35        [Z        R]                  SU# S35        [)        U#5      U RV                  U-  :X  d   eU V"s/ s H  n"U"U-  U!:X  d  M  U"PM     n$n"[)        U$5      U:X  d   e[Z        R]                  SU$ S35        / n%/ n&[	        U R
                  5       H5  n'U%R                  [        U#SS95        U&R                  [        U$SS95        M7     [_        [a        [        UR                  5      5      5       GH  u  p[        U5      (       d  M  UR                  S:X  a-  UR                  S5      [<        R                  R                   :X  d   eX   n(U%U(   n)U&U(   n*UR#                  S5      S   nU R                   (       a  UR,                  R2                  n[c        UR                  S5      5      n+Sn,U+U-  U :X  a  Sn,U+U-  n-UR9                  SU*R:                  5        UR9                  SU-5        U,(       d  GM  U+U-  n.URe                  US-   SSU0SU0SU)R:                  SU.S[<        R                  R                   [f        [J        RL                  0S 9n/U/R9                  S!S"[h        Rj                  -   5        U R                   (       d  GM  WU/R,                  l        U R4                  U/R,                  l        GM     URm                  5         gs  sn"f s  sn"f )#a  
overlap gradient communication with backward & optimizer computation.

1. assign gradient communications to grad comm stream
2. for coalesce gradient communication:
    2.1 insert before communication dependencies
    2.2 insert after communication dependencies only when need
3. there is not need to add explicit dependencies for non-coalesce gradient communication

P.S. this overlap pass is ONLY adapted for standalone executor (graph based) and stream award allocator.
Nr   TrL  sharding_grad_comm_streamrN  r   r   rO  rP  r   r   rK   sharding_grad_comm_depr   r   rM   sharding_reduce_avg_depr\  F)r   r`  r   r0   skip_insert_when_sequential_run)r   ctxr      z:Sharding Gradient Hierarchical Communication Optimization.zcurrent global rank idx: rZ  zlocal inter node ranks idx: zlocal intra node ranks idx: root_idr   r   r   r0   r  )7rB   grad_comm_group_stream_pairsr   r   rf  rD   r   r   r   r   r   r   r3   r   r   r   r   rq  rj  r   ra   r   r   rU   r  rn  r=  ro  r  r   r  r  r^   rs  rd  r   rl   r   r  r   r   r   rH   rm   rJ   rb  rc  r   r   r`   r   r   r   r	  r   )0rX   r   r   rG  rH  r   r   r   streamr   r{  reduce_op_countgrad_comm_op_to_stream_idxr   r5   
stream_idxrO  rP  reduce_varname
grad_grouppost_idxnext_opr  r  r  r0   r   r  r  nranks_per_nodeglobal_groupglobal_ranksrelative_idx_in_nodenode_idxrm  inter_node_ranksintra_node_ranksinter_node_groupsintra_node_groups_grad_comm_stream_idxinter_node_groupintra_node_groupdst_rankin_peerintra_node_dstinter_node_dstr   s0                                                   r6   rF  ShardingPass._overlap_grad_comm  s;   & "",.)##)) t001AAv%++)%F 14F--44"'#) 2 ii%'" ~GC)"--77l*rww}/E&&//33J 0 ,/H/HH
1;2.!>>zJ 
 #??
K! "$5!1!!42>B
!..33~EEE z'!+  &OOB/&33'4LL	$GL  #QwH..:3O3O AL''$&33&OO'4LL	 1<-44 0 Y
6**z/K/K!'lG"<<<7GLL%= **3377=  
 #>>%03~EEE :EG%%688 %%9 1HC GGx'.$--2C2CC&88 MM*JKK)')L''!G&33&33 5LL	  1$1HCO &T 5A DbD! *-FFED 0 9&&OO" "'!-4S	  (>!%.%;%; ..!*!3!3	 #.?J	++< << "++?= " V (((  O++o=BBB(..L'--L#'#3#3o#E ''?:H ) (D/)-AA (   
 LLL LL4T5E5E4FaHILL78H7IKL$%++>??
 ) (D?*h6 (   
 '(O;;;LL78H7IKL " "4445!((%&6M "((%&6M 6 $D599)=$>?-b118+GGM2dmm6G6GGH ,F+I('89M'N$'89M'N$%'YYu%5a%8N**&(ll&C&C"2779#56H#G/15II"&%-%?NLL,<,?,?@LLN;w)1_)D!&!>!>!G!)$'#8 %~% !*+;+>+> )> -t}}/@/@ +V__	# "? " ((*C,2K2K,K  ...@KF,,= $ @ @ #,,@[ @b 	e  s   `+`+`0(`0)rl   rL   r=  rR   rH   rB   rJ   rF   r;  r  rD   rh  rV   rE   r<  re  rC   rG   rU   rS   rm   rW   r?   rT   )__name__
__module____qualname____firstlineno__rO   rc   rh   r   rr   r   r   rt   r   r   r   r   r   r   r   ru   rv   rw   r@  rA  rB  rE  rF  __static_attributes____classcell__)rY   s   @r6   r;   r;   [   s    &0&P3?j=*/JbE=$~B$H$&'P!$F;
6$pE'NQ2
"o$b|=|` `r8   r;   c                 n    U R                    H  nUR                  [        ;   d  M  Us  $    [        S5      e)NzCould not find optimizer op.)r   r   r   	Exception)r   r5   s     r6   rr  rr    s0    ii77//I  2
33r8   c                     U R                  U5      nUR                  U5      n	U R                  USSU0SU0SUSU[        U0S9n
U
R	                  SS[
        R                  -   5        [        U
U	R                  U	R                  UU	R                  S	9  X4:w  ah  U R                  US
SUR                  0SUR                  SUR                  [        U0S9n
[        U
U	R                  U	R                  UU	R                  S	9  gg)z
empty op for initialization
r  r   r   r   r  r   r0   r  r   emptyr   r   r   r   N)r   r   r   r   r  r   r	  r   r   r   r   r   r   r   )r   
insert_idxr   r#  r0  r   r   r>   r|  broadcast_var_dist_attrr   s              r6   r,  r,    s-    IIg&M*KK **W~ wI
 + 
F ^S<+D+D%DE:,,,,(11 ..M../,,,,W	 / 	
 	?#00#00,55	
 r8   c	                 L   US:  d
   SU 35       eU R                  UUSU/0SU/0SUSUSU[        U0S9n	UR                  U R                  U5      5      n
[	        U	U
R
                  U
R                  UU
R                  S	9  U	R                  S
S[        R                  -   5        U	$ )Nr   z5root id should be a positive int, but now root id is r   r   r   r  r   r   r   r0   r  )r   r   r   r   r   r   r   r   r  r   r	  )r   op_typer  
reduce_varr   r  r>   r   r   r   r   s              r6   r  r    s     a< 
?yI< **j\"%ww;	
 + F ==		*I ;## ^S<+D+D%DEMr8   c                 `    SnSn[        U 5      U-  U/n[        XX25      n[        XXB5      nXg4$ Nr   r   )ra   r   )origin_groupsharding_group_sizerm  dp_axissharding_axisr   r   r   s           r6   r   r   ;  sC    GM"557JKE|GBH$\-NN##r8   c                     U R                   R                  S5      =(       a*    U R                   R                  S5      R                  S5      $ )Nr0   z/gradient_clipr1   r2   r3   
startswithr4   s    r6   r   r   F  s;    77N+ #1j!"#r8   c                     U R                   R                  S5      =(       a*    U R                   R                  S5      R                  S5      $ )Nr0   z/regularizationr  r4   s    r6   r   r   L  s;    77N+ $1j"#$r8   c                 b   [        U5      (       d  g[        X[        [        R                  R
                  R                  5      (       d  g[        X5      (       a  gUR                  S   nUS UR                  S5       nU R                  U5      (       d  gU R                  U5      R                  $ )NFr   r   )r   _is_desired_cast_opr}   r   ro   rp   FP32r   r   r   r   r   is_parameter)r   r5   r   r  s       r6   r   r   R  s    "')=)=)B)B  u))%%a(K3k..s34I==##99Y,,,r8   c                 r    [        U5      (       a  g[        X5      (       d  gUR                  S   nX2;  a  gg)NFr   T)r   r  r   )r   r5   r   r   s       r6   r$  r$  b  s:    bu))##A&Jr8   c                 L   UR                   S:w  a  g[        UR                  5      S:X  d   e[        UR                  5      S:X  d   eU R	                  UR                  S   5      nU R	                  UR                  S   5      nUR
                  U:w  d  UR
                  U:w  a  gg)Nr!  Fr   r   T)r   ra   r   r   r   r   )r   r5   src_var_typedst_var_typer2  
output_vars         r6   r  r  m  s     
ww&r!!"a'''r""#q(((		",,Q/0I2..q12J,&**:*:l*Jr8   c                     S nSU ;   a  U S U R                  S5       nU$ SU ;   a  U S U R                  S5       nU$ SU ;   a  U S U R                  S5       nU$ )Nz.cast_fp16@GRADz.cast_bf16@GRADr   )r   )r  r  s     r6   r  r    s}    II%A	/@ AB	
 	 
i	'A	/@ AB	  
I	7	w 78	r8   c                     [        U 5      (       d  gU R                  S   n[        U5      nUR                  U5      (       d  gUR	                  U5      R
                  $ )NFr   )r   r   r  r   r   r  r5   r   r   r  s       r6   r  r    sQ    %b))%%a(K-k:I==##99Y,,,r8   c                     [        U 5      (       d  gU R                  S:w  a  gU R                  S   n[        U5      nUR	                  U5      (       d  gUR                  U5      R                  $ )NFr   r   )r   r   r   r  r   r   r  r  s       r6   r  r    s`    "	ww%%%a(K-k:I==##99Y,,,r8   c                     U R                   S:H  =(       aN    U R                  R                  S5      =(       a,    [        R                  U R                  R                  S5      ;   $ )Nr  r0   )r   r1   r2   r   r	  r3   r4   s    r6   rp  rp    sL    
; 	FGG^,	F%%n)EEr8   c                    S nUR                    H  n[        XQR                  U5      (       a  M   UR                  U5      nUR                  nUR                  U5      nUR                  n	[        U5      S:X  a  Mk  US   n
U
S:  d  Mx  X   S:  d  M  Ub  X:X  d  M  [        UR                  UR                  U
U 5      n[        U5      n  U$    U$ )Nr   rK   r   )r   r   r   r(  r   get_input_dims_mappingr   ra   r   process_idsr   )rank_idr5   r>   r  r   r   r   r   input_dim_mapping
mesh_shapebatch_size_axisgroup_rankss               r6   r   r     s     H((
 $J,GG$AA"EI$11L ) @ @ L%++J$%*/2O#
(Ca(G?o&@"1$00$**'	#K  1=HO1 )0 Or8   c                    0 nSn/ nU  H$  n[        U5      nX6-  nUR                  XV45        M&     [        U5       Vs0 s H  ow/ _M     nnSnSn	U H2  u  pVXS-  US-   -  U-  :  a  US-  nX(   R                  U5        X-  n	M4     U$ s  snf )z
shard the continuous param into same rank and divide the forward&backward computation into segment,
which will favor the fuse pass in later.

we assume that the params is already sorted by utilization order.
g        r   g      ?r   )r   r   rf  )
r   
group_sizemappingtotal_param_mem	param2memr   memr   cur_rankmem_accus
             r6   partition_by_use_orderr    s     GOI5!%&  $J/0/"u/G0HH
+x!|<zIIMH  '	   N 1s   B
c                 2   0 n[        U5       H  n/ X#'   M	     S/U-  nU  Hr  nUR                  [        U5      5      nX&   R                  U5        [	        S UR
                  S5      nUS:  d   SUR                   SU S35       eXF==   U-  ss'   Mt     U$ )zB
use greedy algorithm to partition parameter as even as possible.
r   c                 
    X-  $ r   rf   )r   ys     r6   <lambda>*partition_by_greedy_even.<locals>.<lambda>  s    AEr8   r   zparam [z#] should larger than 0, but it is [r   )rf  indexminr   r   r   r   )r   r  r	  rank_sizesr   rm  numels           r6   partition_by_greedy_evenr    s     Gz" #C*E{{3u:&U#)5;;:qy 	
ejj\!DUG1M	
y 	u  Nr8   c                    US:X  a  [        X5      nO[        X5      n[        R                  S5        UR	                  5        Hw  u  pE[        R                  SU S[        U Vs/ s H  n[        U5      PM     sn5       S35        [        R                  SU Vs/ s H  ofR                  PM     sn S35        My     U$ s  snf s  snf )Ngreedy_evenzSharding Parameter Partition:zRank:z, Parameter Size:z MB.zParams in this rank: rZ  )r  r  rb  rc  itemsr   r   r   )r   r  algorrank_to_paramskvr   s          r6   partition_parametersr"    s    1&E/CLL01$$&A3'!,L!3\#->!,L(M'NdS	
 	,!-D!3hh!-D,EQGH	 ' 	 -M-Ds   $C Cc                    0 nU H  u  pEXE4X4R                   '   M     / nU R                   HO  nUR                   H"  nX;   d  M
  X;  d  M  UR                  U5        M$     [	        U5      [	        U5      :X  d  MO    O   U R                  S   n	0 n
[	        U R                  5      n/ n[        U	5      (       Gab  U	R                  [        ;   GaM  [        [        [        U R                  5      5      5       Hb  u  pUR                  [        ;   d  M  [	        UR                  S5      5      S:X  d   eXzUR                  S5      S   '   UR                  U5        Md     [	        U5      [	        U
5      :X  d   eU H\  nU R                  SS9nUR                  R                  X   R                  5        UR                  UUR!                  X   5      5        M^     U H  nU R#                  USS9  M     U R%                  5         [	        U R                  5      U:X  d   e[&        R)                  S	U S
35        U Vs/ s H  oCU   PM	     sn$ s  snf )NrK   r   r   r   nop)r   Fr   z(Sharding the Order of param being used: rZ  )r   r   r   r   ra   r   r   r   r   r   r   r   r  r1   	copy_fromset_op_dist_attr_for_programr(  r   r   rb  rc  )r   param_gradsr>   pname_to_pg_pairsry  g	use_orderr5   r   last_oppname_to_opnum_opsremove_op_indicesr   pnamer   s                   r6   r   r     s   %&F&&!  Iii,,J/+  ,	 -
 y>S!233  iimGK%))nGg7<<3L#LYuyy%9 :;GCww33288G,-22246BHHW-a01!((-	 <
 9~[!1111 E__%_0FKK!!+"4"9"9:5599+:LM  %CSu- % 	599~((( LL;I;aHI*34)Qa )444s   6Ic                    0 n0 n/ n[        U5      nU R                   H  nU R                  UR                  5      nUR	                  Xg5      (       a  UR                  Xg5        O[        U5      nUR                  Xg5        U R                  UR                  5      Ul        XR;   a  X%   R                  UR                  5        OUR                  /X%'   XSUR                  '   M     X#4$ )z-
param are group by:
rank id
fuse_size
dtype
)r  r   r  r   r  r  r  r   )r   	fuse_sizert  ru  bucketr  r   rm  s           r6   ra  ra  @  s     F#I%%))%**5,,e* +Ie*&3&E&EJJ'
	# *)00<-2ZZL))25::&% &( 11r8   c                   8    \ rS rSrS rS rS rS rS rS r	Sr
g	)
r   ib  c                    Xl         U VVs0 s H  u  pVUR                  XV4_M     snnU l        [        U R                  5      [        [	        U R                  5      5      :X  d   S5       eU VVs/ s H  u  pWUPM	     snnU l        U R
                   Vs/ s H  oUR                  PM     snU l        UR                  U l        X l	        UR                  R                  U R                  5      U l        X@l        [        U R
                  U R                  U R                  5      U l        0 U l        U R#                  5         g s  snnf s  snnf s  snf )Nz&found duplicated param in params_grads)r   r   rI   ra   rQ   r   r%  r   r  rJ   r   r  r#  rG   r"  r  param_to_rank_map_param_to_rank)rX   r   rm  rI   rG   ry  r)  r  s           r6   rO   ShardingInfo.__init__c  s   
8DEQVVaV^E4$$%S1B1B-C)DD 	
4	
D &22\TQq\2,0KK8KqFFK8,,++++D,<,<=.2KK$*>*>
  !# F
 38s   E0EEc                     U R                   R                  5        H&  u  pU H  nXR                  UR                  '   M     M(     g)z0
mapping parameters to the rank which holds it.
N)r  r  r5  r   )rX   rm  r   r   s       r6   r6  ShardingInfo._map_param_to_rankx  s;     !//557LD15""5::.   8r8   c                 @    XR                   ;   a  U R                   U   $ g)NrK   )r5  )rX   r   s     r6   r  ShardingInfo.get_var_rank  s"    (((%%g..r8   c                 >    U R                  U5      U R                  :H  $ r   )r  r#  rX   r   s     r6   r  ShardingInfo.is_in_local_shard  s      ,??r8   c                    [        5       n[        5       n0 n[        R                  U R                  S5      nUR                   HE  n[        U5      (       a  M  UR                   H   nXpR                  ;   d  M  XW==   S-  ss'   M"     MG     UR                   H  n[        XU R                  5      (       d  M   UR                  S   nUR                  S   nUR                  U5        UR                  U5        XtU'   XW==   S-  ss'   U R                  U   U R                  U'   M     UR                  5        H  u  pU
S:  d  M  UR                  U	5        M      X%4$ r  )rQ   dictfromkeysr%  r   r   r   r$  r   r   r5  r  )rX   r   broadcast_varsfp16_paramsfp16_to_fp32r.  r5   r   r   r   usages              r6   r"  /ShardingInfo.get_broadcast_vars_and_param_usage  s9   emmD$4$4a8))Bb!! 00
!1!11+q0+ 1  ))B)%T5E5EFF++A.J--a0K{+OOK((2%#q(#.2.@.@.LD{+  (--/LEqy""5) 0 **r8   c                     U R                  U5      (       d  [        SU S35      eXR                  ;  a  [        SU S35      eU R                  R                  US 5      $ )Nzparam[z] not in current rank.z] not in params_grads)r  
ValueErrorrI   getr=  s     r6   r  ShardingInfo.get_param_grad  sb    %%j11vj\1GHII...vj\1FGHH  $$Z66r8   )
rJ   r   r  r#  r%  r5  r   rI   rG   r  N)r  r  r  r  rO   r6  r  r  r"  r  r  rf   r8   r6   r   r   b  s!    "*6@+:7r8   r   c                   ,    \ rS rSrS rS rS rS rSrg)r  i  c                     Xl         S U l        SU l        SU l        / U l        S U l        S U l        S U l        / U l        / U l	        SU l
        g )NrK   r   F)max_sizer   rm  r  r   rj  r  r  r  r  r  )rX   rM  s     r6   rO   VarGroup.__init__  sS     
	
	 $(!#!#$&!!&r8   c                     U R                   S:X  a  gUR                  U R                  :w  a  gX R                  :w  a  gU R                   [        U5      -   U R                  :  a  gg)Nr   TF)r  r   rm  r
   rM  rX   r   rm  s      r6   r  VarGroup.acceptable  sP    ::?{{djj(yy zzM%004==@r8   c                     UR                   U l         X l        U =R                  [        U5      -  sl        U R                  R                  U5        g r   )r   rm  r  r
   r   r   rP  s      r6   r  VarGroup.collect  s7    [[
	

mE**
		r8   c                 ,    [        U R                  5      $ r   )ra   r   rb   s    r6   __len__VarGroup.__len__  s    499~r8   )r  r  r  rj  r   r  rM  r  rm  r  r   N)	r  r  r  r  rO   r  r  rU  r  rf   r8   r6   r  r    s    '
 r8   r  r   )r  )Ologging	functoolsr   r   paddle.distributedr   r  8paddle.distributed.auto_parallel.static.operators.commonr   r   r   5paddle.distributed.auto_parallel.static.process_groupr   -paddle.distributed.auto_parallel.static.utilsr   r	   r
   r   r   r   r   r   r   r   r   7paddle.distributed.fleet.meta_optimizers.sharding.utilsr   paddle.frameworkr   paddle.staticr   r   paddle.utilsr   auto_parallel_master_gradr   	pass_baser   r   
pass_utilsr   op_proto_and_checker_makerr   kOpRoleAttrNamer   r   r   INFOrb  ro   rp   FP16r}   r~   r7   r;   rr  r,  r  r  r   r   r   r   r$  r  r  r  r  r  rp  r   r  r  r"  r   ra  r   r  rf   r8   r6   <module>rh     sy      ! 
    Q ! G $ > . .		(	(	/	/--==?	  W\\
"||++00 % C '(r8 r )rj+46
D OO&R$#$-  %%**%	&
-- (,>4( 15h2DM7 M7`! !r8   