
    x-j5                       d dl Z d dlmZ d dlZd dlmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZmZmZmZmZmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZmZ d d	l m!Z! d
dl"m#Z# d
dl$m%Z%m&Z& d
dl'm(Z( ej)        j*        Z*ej)        +                                Z,g dZ-g dZ. ee j/                  Z0ej1        j2        j3        Z4dZ5d Z6 e&d           G d de%                      Z7d Z8d Z9e*j:        fdZ;d Z<d Z=d Z>d Z?d Z@ej1        j2        jA        e4fdZBd ZCd ZDd  ZEd! ZF	 d-d"ZGd# ZHd$ ZId.d&ZJd' ZKd( ZL G d) d*          ZM G d+ d,          ZNdS )/    N)reduce)ParallelModeis_data_parallel_reduce_opis_parameter_related)new_process_group)_get_comm_group
get_loggerget_var_numelinsert_dependencies_for_varsis_backward_opis_dep_skip_opis_forward_opis_optimize_op*naive_set_dist_op_attr_for_program_by_mesh6naive_set_dist_op_attr_for_program_by_mesh_and_mappingset_var_dist_attrget_var_size)core)default_main_programdefault_startup_program)unique_name   )_is_master_grad_cast_op)PassBaseregister_pass)AutoParallelStreamType)create_py_readercreate_double_buffer_readerreadslicesplitassignsend_v2)
adamadamaxadamwdecayed_adagradmomentumdgc_momentumlars_momentummerged_momentumlambsgdfloat16c                 n    | j                             d          od| j                             d          v S )Nop_namescopez/auto_parallel/reshard)deschas_attrattrops    p/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/distributed/passes/auto_parallel_sharding.py_is_reshard_opr8   Q   s<    7  C
"bgll>&B&B
BC    auto_parallel_shardingc                        e Zd Z fdZd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Z xZS )ShardingPassc                 H   t                                                       |                     dd            |                     dd            |                     dd            |                     dd            |                     dd            |                     dd            |                     dd            |                     dd            |                     d	d            |                     d
d            |                     dd            |                     dg            |                     dd           |                     dd           |                     dd           t                      | _        g | _        i | _        d| _        d | _        g | _	        d S )Ndist_contextstagesharding_degreedegreeenable_overlapparam_comm_stream_numgrad_comm_stream_numparam_bucket_size_numelgrad_bucket_size_numelpartition_algorenable_hierarchical_commparams_gradsglobal_rank	amp_dtyper/   gradient_sync_after_accumulateF)
super__init__set_attrset	dp_groupssharding_infosvarname_to_sharding_infosharding_hybrid_dpouter_dp_groupshared_params_grads)self	__class__s    r7   rO   zShardingPass.__init__]   s   nd+++gt$$$'...h%%%&----t444,d333/666.555'...0$777nb)))mR(((k9---6>>> (*%"'"#%   r9   c                    |                      d          dS |                      d          dvrdS |                      d          Dt          |                      d          t                    r|                      d          dk    rdS n[|                      d          Dt          |                      d          t                    r|                      d          dk    rdS ndS t          |                      d                    d	k    rdS t          |                      d
          t                    r|                      d
          d	k     rdS |                      d          dS |                      d          dS |                      d          dS |                      d          dS |                      d          dS |                      d          dS |                      d          dS dS )Nr>   Fr?   )r         r@   r   rA   rI   r   rJ   rB   rC   rD   rE   rF   rG   rH   T)get_attr
isinstanceintlenrX   s    r7   _check_selfzShardingPass._check_selfu   s   ==((05==!!225==*++7t}}->??EE011Q66u 7]]8$$0t}}X66<< B BB B uB
 5t}}^,,--2254==77== 	$--C
 C
C C 5==)**25==01195==/0085==233;5==122:5==*++35==344<5tr9   c                     dS )NT )rX   
other_passs     r7   _check_conflictzShardingPass._check_conflict   s    tr9   c                    |                      d          | _        t          |                      d          p|                      d                    | _        t          |                      d                    | _        t          |                      d                    | _        |                      d          | _        t          |                      d                    | _        t          |                      d                    | _        |                      d	          | _	        | j        d
k    s| j        d
k    r| j        s
J d            t          |                      d                    | _
        t          |                      d                    | _        |                      d          | _        |                      d          }|                                |                                }}|                      d          | _        | j        dk    rt          j        j        j        }d}|                     ||           |j        D ]C}	|                     |	|           |                     |	           |                     |	|           D|                    d| j                   |                     ||           d S )Nr>   r@   rA   r?   rJ   rB   rC   rD   rH   r   z3multiple comm stream need enable_overlap to be TruerE   rF   rG   rI   rL   bfloat16)r]   _dist_contextr_   sharding_world_sizer?   rJ   rB   rC   rD   rH   rE   rF   rG   global_blockrL   r   VarDescVarTypeBF16_build_sharding_groupsblocks_shard_optimizer_shard_gradient_synchronization_shard_parameterrP   rW   _optimization_pass)
rX   main_programstartup_programcontextrI   
main_blockstartup_block__amp_target_dtype____amp_target_dtype_name__blocks
             r7   _apply_single_implzShardingPass._apply_single_impl   s   !]]>::#&MM+,,Gh0G0G$
 $
  w//00
t}}];;<<"mm,<==%(7N)O)O%P%P"$'6L(M(M$N$N!(,&)
 )
% %))T-F-J-J&  E & (+MM344(
 (
$ '*MM233'
 '
#  $}}->??}}^44%%''((** "

 {33>Z''#'<#7#< (2% 	##J===!( 	8 	8E!!%77700777!!%7777)ABBBo>>>>>r9   c                 \    |                      |           |                     ||           d S N) _collective_data_parallel_groups_build_sharding_infos)rX   rx   rI   s      r7   ro   z#ShardingPass._build_sharding_groups   s2    --j999"":|<<<<<r9   c                 `   |j         D ]h}t          |          r|j        t          v r t	          |          r0t          | j        || j        d          }|| j        	                    |           it          | j                  dk    r%t          dt          | j                   d          d S )Nr   r   zSSo far Only and Exactly one data parallel group in network are supported, but got [z ] different data parallel groups)opsr   type	_skip_opsr8   +_inference_data_parallel_group_for_operatorrJ   ri   rR   addr`   NotImplementedError)rX   rx   r6   groups       r7   r   z-ShardingPass._collective_data_parallel_groups   s    . 	* 	*B $$ 9(<(< b!! ? "d&8! E  ""5))) t~!##% \fijnjxfyfy  \  \  \   $#r9   c                    t          ||| j                  }| j        D ]}|j        | j        k    sJ d| j         d|j         d            |j        | j        z  dk    sJ d| j         d|j         d            | j        |j        v sJ d| j         d|j         d            t          |          | j        k    s#J dt          |           d	| j         d
            |j        | j        k    rd| _        | j	        dk     sJ | j
        dk     sJ t          | j                  dk    s
J d            t          |j        | j        | j                  \  }}t          |          }t          |          | _        n|}|| j        _        t          || j        || j                  }| j                            |           |j        D ]}|| j        |j        <   d S )Nzsharding world size [z(] should not larger than dp world size []r   z(] should be divisible by dp world size [zcurrent ranks [z.] does NOT belong to the data parallel group [znumber of parameters [z#] is not enough to be shard among [z] ranksTr[   r   zthybrid sharding and data parallelism are supported only when there is exactly one data parallel group in the network)re_order_programri   rR   nranksrj   rJ   ranksr`   rU   rC   rD   _get_dp_and_sharding_groupsr   rV   _sharding_groupShardingInforG   rS   appendparamsrT   name)rX   rx   rI   dp_grouprV   sharding_groupsharding_infoparams           r7   r   z"ShardingPass._build_sharding_infos   s|   'd&8
 

  (	J (	JH?d&>>>>|(@||jrjy||| ?>> ?T%==BBB|(@||jrjy||| CBB #x~555s$"2ssbjbpsss 655 |$$(@@@@ A\):):  A  A_c_w  A  A  A A@@
 !999*.'1A55550144444>**a/// K 0// 2MND$<d>N2 2. "3>!B!B&7&G&G##!)1?D.( $	 M &&}555&- J J<I-ej99JO(	J (	Jr9   c                     |                      |           |                     |           |                     ||           |                     ||           dS )z
        sharding all optimizer related ops and vars, include:
        gradient clip ops & vars
        weight decay ops & vars
        optimizer ops and states
        N)_shard_amp_related_op_and_vars_shard_weight_decay_shard_optimizer_ops_and_states_insert_optimizer_broadcasts)rX   rx   ry   s      r7   rq   zShardingPass._shard_optimizer  s^     	++J777  ,,,,,ZGGG))*mDDDDDr9   c                    | j         dk     rd S t          t          t          |j                                      D ]"\  }}t          ||          ry| j         dk    rn|j        d         }|d |                    d                   }|                     |          s.|	                    |d           |
                    |d           |j        dv rg }|j                            d          D ]I}|d |                    d                   }|                     |          r|                    |           J|r8|j                            d|           |j                            d|           9|j        d	k    r|                    d
          }|j        d         }	|j        |	         }
|	                    |d           |                    |dd|
id|
j        d|
j        ddt,          |i           | j                            |
          }t3          |j        |         |j        |j        | j        |j                   |	                    |d           $|                                 d S )Nr   r   @Fsync)check_finite_and_unscaleupdate_loss_scalingXOutr   op_rolefill_constantshapedtypevaluer   outputsattrschunk_id)r?   reversedlist	enumerater   _is_param_grad_fp32_cast_opoutput_arg_namesfind_is_parameter_in_local_shard
_remove_op_remove_varr   r2   inputr   	set_input
set_outputr4   vars_insert_op_without_syncr   r   OP_ROLE_KEYri    get_tensor_dist_attr_for_programr   process_meshdims_mappingr   _sync_with_cpp)rX   rx   idxr6   output_name
param_name
reversed_x
input_namer   out_nameout_var	dist_attrs               r7   r   z+ShardingPass._shard_amp_related_op_and_vars,  s   :>>FYz~%>%> ? ?@@ 7	? 7	?GC*:r:: 5?tzA~~ 1!4()@;+;+;C+@+@)@A
88DD D))#E):::**;U*CCC OOO
"$'--"4"4 6 6J!+,Bjooc.B.B,B!CJ88DD 6"))*555  #?G%%c:666G&&uj9999w"<<<"$'')"4"4#%#6q#9",/(";"--c->>>"::!0%*G$4 ' ' ' +W	#	 ; 
 
 
 !.OO '  " O&N3/%2%2 .%.%7     #--c->>>!!#####r9   c                 |   | j         dk     rd S g d}t                      }t                      }t          t          |j                            D ]\  }}t          |          s|j        |v r|                    d          d         }|d |                    d                   }| 	                    |          s=|
                    |           |j        dv r|j        D ]}	|
                    |	           t          t          t          |j                                      D ]0\  }}t          |          s||v r|                    |d           1|D ]}
|                    |
d           t          t          |j                            D ]\  }}t          |          s|j        d	k    rg }|j        D ]}||vr|                    |           |j                            d|           |j        d         }t          | j                  D ]\  }}|                    ||z   d
z   dd|gid|gid|j        j        dddt.          j        j        j        t6          t8          j        i          }| j                            |                     |                    } n|!                                 d S )Nr[   )elementwise_mulsquared_l2_normclip_by_normr   r   @GRAD)r   r   Fr   sumr   
all_reducexoutring_idr1   z /gradient_clip_model_parallelismreduce_typer   inputsr   r   )"r?   rQ   r   r   r   _is_gradient_clip_opr   r   r   r   r   r   r   r   r   input_arg_namesr   r2   r   rS   
_insert_opr   idpaddledistributedReduceOpSUMr   OpRoleOptimizeri   r   varr   )rX   rx   removed_op_typeremoved_op_idxremoved_tmp_varr   r6   r   r   r   varnamereserved_varssum_op_outputir   new_opr   s                    r7   _shard_gradient_clipz!ShardingPass._shard_gradient_clipk  s   :>>F QPP%%Ijn5566 	= 	=GC'++ w/))XXc]]1-
'(B*//'*B*B(BC
88DD ="&&s+++w"EEE+-+> = =K+//<<<<Yz~%>%> ? ?@@ 	7 	7GC'++ n$$%%c%666& 	8 	8G""7"7777Ijn5566 !	 !	GC'++ w% ""$"4 9 9J!88%,,Z888!!#}555 " 3A 6(1$2E(F(F  $A}'22a!) #m_5!& 8%}':'=*,N)6+=+F+J'	 3  F *KK&NN=99  I =  @ 	!!#####r9   c                     | j         dk     rd S t          t          t          |j                                      D ]#\  }}t          |          st          d          |                                 d S )Nr[   z$weight decay is NOT supported by now)r?   r   r   r   r   _is_weight_decay_opr   r   )rX   rx   r   r6   s       r7   r   z ShardingPass._shard_weight_decay  s    :>>FYz~%>%> ? ?@@ 	 	GC&r** ):   	!!#####r9   c                 <   g }t          t          t          |j                                      D ]\  }}t	          |          s n|j        t          v rd|j        v sJ t          |	                    d                    dk    sJ |	                    d          d         | 
                              s>|                    fd|j        D                        |                    |d           | j                            |                                          t          t          t          |j                                      D ]C\  }}t          |j                  dk    r&|j        d         |v r|                    |d           D|D ]Z}|                    |          r|                    |d           |                    |          r|                    |d           [|                                 |                                 d S )NParamr   r   c                      g | ]
}|k    |S rd   rd   ).0r   r   s     r7   
<listcomp>z@ShardingPass._shard_optimizer_ops_and_states.<locals>.<listcomp>  s.        '&*44 $444r9   Fr   )r   r   r   r   r   r   _supported_optimizer_typeinput_namesr`   r   r   extendr   r   rW   r   _get_param_gradhas_varr   r   )rX   rx   ry   should_removed_optimizer_statesr   r6   r   r   s          @r7   r   z,ShardingPass._shard_optimizer_ops_and_states  sU   *,'Yz~%>%> ? ?@@ 	 	GC!"%% w333".0000288G,,--2222XXg..q1
88DD 3::   +-+>     ))#E)::::,33,,Z88    Y}/@%A%A B BCC 	: 	:GCB'((A--'*.MMM((5(9996 	? 	?G!!'** <&&wU&;;;$$W-- ?))')>>>!!###$$&&&&&r9   c                 t   | j         dk    s| j        dk    rd S | j        D ]}|j        D ]}|                    |j                  sJ |                    |j                  sJ |                    dd|id|id|j        j        d|	                    |j                  t          t          j        i          }|                    d	d
t          j        z              | j                            |          }|J t%          ||j        |j        | j        |j                   |                                 d S )Nr[   r   	broadcastr   r   r   rootr   r1   /r   )r?   rE   rS   r   r   r   	append_opr   r   get_var_rankr   r   r   	_set_attrr   DataParallelri   r   r   r   r   r   r   )rX   rx   ry   r   r   r   param_dist_attrs          r7   r   z)ShardingPass._insert_optimizer_broadcasts  sx   :>>T9A==F!0 	 	M&-  !))%*55555$,,UZ88888#--$<"EN!=#6#9 : :5: F F#V_	 . 	 	   "C,*C$C   &GGNN   '222F#0#0&,5    +8 	!!#####r9   c                 \    || j         v sJ | j         |         }|                    |          S r   )rT   is_in_local_shard)rX   r   r   s      r7   r   z)ShardingPass._is_parameter_in_local_shard  s7    T:::::5jA..z:::r9   c                 h    || j         v sJ | j         |         }|                    |          }|J |S r   )rT   get_param_grad)rX   r   r   p_gs       r7   r   zShardingPass._get_param_grad  sD    T:::::5jA**:66
r9   c                    | j         dk     rd S d | j        D             }t          t          t	          |j                                      D ]\  }}t          ||          rh|j        dk    r(|                    d          t          j
        j        k    s3|j        dk    r<|                    d          t          j
        j        k    rd}t          j
        j        }nd}t          j
        j        }|j        d         }t          |          }| j        |         }	t!          |||||	j        j        |	                    |          | j        |          }
| j        r|	                    |          s|                    |dz   d	           nC|                    d
| j        j                   |                    ddt4          j        z              t9          ||          rU|j        d         }t          |          }| j        |         }	|	                    |          s|                    |d	           |                                 d S )Nr[   c                     g | ]	}|j         
S rd   r   r   r   s     r7   r   z@ShardingPass._shard_gradient_synchronization.<locals>.<listcomp>      <<<Eux<<<r9   r   r   r   r   r   Fr   r   r1   r   )r?   rR   r   r   r   r   _is_param_grad_allreduce_opr   r4   distr   r   AVGr   _get_base_name_from_grad_namerT   _insert_reduce_opr   r   r   ri   rU   r  r   r   rV   r   r   _is_param_grad_sum_opr   r   )rX   rx   dp_ring_idsr   r6   reduce_op_typer   r   	base_namer   	reduce_opr   s               r7   rr   z,ShardingPass._shard_gradient_synchronization  sB   :>>F<<T^<<<Yz~%>%> ? ?@@ /	; /	;GC*2z:: #G|++..$-2CCCGx''..$-2CCC%-N"&-"3KK%-N"&-"3K/2
9*EE	 $ =i H-"!'*!..y99&	 		 /	(::9EE	 ))#')>>>>LLD,?,BCCCLL&l.G(G   %R44 ;.q19(CC	 $ =i H$66yAA ;))#E):::!!#####r9   c                 	   | j         dk     rd S d | j        D             }| j        D ]y}|                    |          \  }}g }|D ]A}||         dk    r3|                    |          |j        k    r|                    |           Bt          t          t          |j
                                      D ]\  }	}
t          |
          r|
j        D ]}t          ||
|j                  r+|                     |          s|                    |           D||vrI|                    |          }||j        k    r|}nt!          j        |dz             }|                    |          }|                    ||j        |j        d          }| j                            |          }t1          | j        ||j        |j        |j                   | j                            |
          }|                    |          }|
                    ||           |                    ||           tA          ||	||j        ||j!        j"        |
#                    d          | j                   t          t          t          |j
                                      D ]]\  }	}
|
j$        d	k    r|
j        d         }|
j%        d         }||v r.|&                    |	d
           |'                    |d
           ^t          t          t          |j
                                      D ]'\  }	}
tQ          |
j%                  dk    sJ |
j%        d         }|
j$        dk    r|
#                    d          |v r^| j)        r?|                    |          |j        k    r!|
*                    d| j)        j"                   nM|&                    |	d
           n5|                    |          |j        k    r|&                    |	d
           |
j$        dk    r9||v r5|                    |          |j        k    r|&                    |	d
           )|D ]N}|                    |          |j        k    r.|'                    |d
           |'                    |d
           O{|+                                 |+                                 d S )Nr\   c                     g | ]	}|j         
S rd   r	  r
  s     r7   r   z1ShardingPass._shard_parameter.<locals>.<listcomp>O  r  r9   r   z
@BroadCastF)r   r   r   persistabler   r   castr   r   r   r   ),r?   rR   rS   "get_broadcast_vars_and_param_usager   
local_rankr   r   r   r   r   r   r   _is_param_fp16_cast_opparam_namesr   r   generater   
create_varr   r   ri   r   r   r   r   r   get_op_dist_attr_for_programget_input_dist_attr_rename_inputset_input_dist_attr_insert_init_and_broadcast_opr   r   r4   r   r   r   r   r`   rV   r   r   )rX   rx   ry   r  r   need_broadcast_varsparam_usagenot_used_param_namer   r   r6   r   	root_rankbroadcast_varname	input_varnew_varref_dist_attrop_dist_attrinput_dist_attrr   s                       r7   rs   zShardingPass._shard_parameterK  s   :>>F<<T^<<<!0 }	F }	FM @@LL#"$) ; ;

+q00%22:>>$/0 0 (..z:::#D:>)B)B$C$CDD @ @R!"%% "$"4 < <J ."B(A  !  $@@LL C/66zBBB !)<<<  - : :: F FI M$<<<,6)),7,@&5- -) %/NN:$>$>	","7"7!2"+/"+/(-	 #8 # # !.OO )  &
 * .#)6)6%2%;    !.KKBOO % +7*J*J&+ + ((5FGGG$88-   2")%0!%+.	***	 	 	 	g<| $D:>)B)B$C$CDD D DR7f$$/2
 1!4!444))#E):::**;U*CCC#D=3D)E)E$F$FGG > >R2.//14444 1!47k))wwy))[88 /F - : :; G G,7!8 !8 LLD4G4JKKKK)44Su4EEEE *66{CC,78 8 *44Su4EEE G{**#{22%22;??$/0 0 ",,Su,===) F F
!..z::$/0 0 **:E*BBB!--ju-EEEF 	!!###$$&&&&&r9   c                    | j         dk    rd S d| _        d| _        d| _        t	          | j                  dk    s J dt	          | j                   d            | j        d         }t          j                            ||          5  | 	                    |           | j
        dk    rA| j         dk    r|                     |           n | j         d	k    r|                     |           d d d            d S # 1 swxY w Y   d S )
Nr   sharding_coalesce_grad_sharding_coalesce_param_rK   zZgradient synchronization optimization only support one sharding group right now, but got [z].r   r[   r\   )r?   grad_coalesce_prefixparam_coalesce_prefixcomm_op_scheduling_priorityr`   rS   r   staticprogram_guard_gradient_sync_optimizationrE   &_fuse_overlap_parameter_comm_stage_two(_fuse_overlap_parameter_comm_stage_three)rX   ru   rv   r   s       r7   rt   zShardingPass._optimization_pass  s   :??F$=!%?"+-( 4&''1,,, Filmq  nA  jB  jB  F  F  F -,, +A.]((GG 	Q 	Q,,];;; +a//:????NNNNZ1__AA-PPP	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Q 	Qs   A"C77C;>C;c                    | j         dk    r	| j        sd S t                                                      }t	                                                      }|                     ||          \  }}|                     ||||           d S )Nr   )rF   rB   r   rk   r   _group_grads_overlap_grad_comm)rX   r   rx   ry   coalesce_to_group_mapgrad_name_to_group_maps         r7   r7  z(ShardingPass._gradient_sync_optimization  s    &!++T5H+F)++88::
/11>>@@8<8I8I9
 9
55 	!"		
 	
 	
 	
 	
r9   c                 T   t                                                      }t                                                      }t          || j                  \  }}t
                              d           t
                              d| j         dt          |                                           dt          |                                           d           i }| j	        rg | _
        |j        j        }t          | j                  D ]M}|dk    r|j        }	nt          |d          }	| j
                            |	t"          j        j        d	           Nt
                              d
| j         d           i | _        t+          |                                          D ]7\  }}
t          |
          dk    sJ t          |
          dk    rt-          j        | j        t3          |          z             }|                    ||
j        dd           |                    ||
j        dd          |
_        |                    dd|
j        i|
j        |
j        dddddd|
j        t>          t@          j!        i           n|
j        d         |
_        t
                              d| dtE          d |
j        D                        d           t
          #                    d| dd |
j        D              d           |
||
j        j$        <   || j        z  }| j
        |         d         }| j
        |         d         }|                    dd|
j        id |
j        id!|j%        d"|
j&        t>          t@          j'        i          }|| j        |<   |(                    d#d$tR          j*        z              | j	        r||j+        _,        | j-        |j+        _.        9i }t+          |j/                  D ]\  }}ta          |          rz|1                    d%          d         }|j        |         }||         }
d }| j	        r|j+        j,        }t          |                                          | j        k     r6te          |          }|j        |1                    d&          d                  }nT|j/        || j        z
           }ta          |          s
J d'            |j        |1                    d%          d                  }||g|g|fg||<   t          |
j                  dk    r]|
j3        r.|
j        d(         }||                             ||g|g|f           ||                             |dz   |g|
j        |f           ti          |                                d)          }|D ]j}||         d d d(         D ]V\  }}}}tk          ||||| j6        t@          j'        d(gd*d*d+,
  
        }| j	        r|||j+        _,        | j-        |j+        _.        Wk|7                                 d S )-NzSharding Stage2 Optimization:zParam Bucket size is [], [z] Parameters are fused into [z	] Bucketsr   Tforce_new_group
comm_groupcomm_streamz#Parameter Communication would use [z
] streams.r   r   r   r  stop_gradientcoalesce_tensorInputOutputFusedOutput	copy_data	use_alignr   r   zBucket[z] size [c                 ,    g | ]}t          |          S rd   r   r   ps     r7   r   zGShardingPass._fuse_overlap_parameter_comm_stage_two.<locals>.<listcomp>E  s    )T)T)Ta,q//)T)T)Tr9   z]MB.z] parameters: c                     g | ]	}|j         
S rd   r   rP  s     r7   r   zGShardingPass._fuse_overlap_parameter_comm_stage_two.<locals>.<listcomp>H  s    +M+M+MqAF+M+M+Mr9   .rD  rE  r   r   r   r   r   r1   r   r   ParamOutz:Unexpected: sharding broadcast pre op should be broadcast.rK   reverseFsharding_stage2_broadcast_depr   is_recomputer   r1   )8r   rk   r   group_paramrE   _loggerinfor`   keysrB   param_comm_group_stream_pairsr   r   rangerC   r   r   r   SHARDING_STREAMr   op_to_stream_idxr   r   r  r3  strr  r   coalesce_varr   r   r   r   Forwardr   debugr   r   rankr   r   r   r   r   execution_streamr4  scheduling_priorityr   is_sharding_param_broadcast_opoutput_get_broadcast_first_depend_opr  sortedr   ri   r   )rX   r   rx   ry   group_to_param_mapparam_to_group_mapbroadcast_var_to_group_mapr   r   r   param_groupcoalesce_var_namecomm_stream_idxrD  rE  r   dep_mapr6   r)  broadcast_var	prior_varpre_op	last_gradindicer   
prior_vars	post_vars	depend_ops                               r7   r8  z3ShardingPass._fuse_overlap_parameter_comm_stage_two  s@   )++88::
/11>>@@1<472
 2
.. 	4555 nT%A  n  nsK]KbKbKdKdGeGe  n  n  EH  I[  I`  I`  Ib  Ib  Ec  Ec  n  n  n	
 	
 	
 &(" 	' 24D.!'-E4566  66)/EE-eTJJJE299&+'='M'S     LL\d6P\\\   %'D!'(:(?(?(A(ABB E	 E	NA{{##q((((;!##$/$8.Q7% %! ((*%+ $"&	 )    ,6+@+@*%+ $"&	 ,A , ,( ''*#[%56"-"2'2'? 
 $T#T!2#V^	 (     ,7+;A+>(LL[![[S)T)T;CS)T)T)T%U%U[[[   MMP!PP+M+MK<L+M+M+MPPP  
  '{'?'DE
  $"<<O;OLJ <_MK  )) [56 89z}K,	 * 	 	F -<D!&)^S<3L-LMMM" 4? 14  4 z~..  	  	EAr-b11 $&IIe$4$4Q$7! *0A B89JK"& @"$,"?K w||~~&&)CCC7
CCB *		*0E0Ea0H III'^A0J,JKF9&AA  T A !+e0D0DQ0G HI 9+LM
{'((1,,"4 $/$4R$8	
))m_kJ   AJ%%Q1A;O  
 555 	 	A;B1:ddd;K  7ZK8&O" "'!@  	 & 9+@;FI'88 ';#* 	!!#####r9   c                     d S r   rd   )rX   r   s     r7   r9  z5ShardingPass._fuse_overlap_parameter_comm_stage_three  s    r9   c                 0   |j         }| j        dk     rd| _        d}|D ]}t          |          r|} n|dS |j        d         }t	          | j                  }g }t                      }	d }
d}|t          |          k     r||         }t          |          rj|j        dk    o6|	                    d          t          j        j        t          j        j        fv }|s
J d            |j        d         }t          |          }|                    |          }|                    |          }|                    ||          r||	vsJ |                    ||           n?|                    |           t	          | j                  }|                    ||           t          |j                  dk    r`|dz
  |_        d}t-          |||z
                     r|dz  }t-          |||z
                     |||z
           }|j        d         }||_        |	                    |           |j                            |           | j        r|                    |          rd	|_        ||dz            j        d
k    r5||dz            	                    d          t8          j        j        j        fv s
J d            ||dz            j        d         |k    s
J d            |j                            |dz              |dz  }n5 |
||          r)|                    |           t	          | j                  }|dz  }|t          |          k     t          |j                  dk    r|                    |           t>                               d           t>                               d| j         dt          |	           dt          |           d           i }i }i }i }g }tC          |          D ]r\  }}t          |j                  dk    r|"                    tG          j$        | j%        tM          |          z             |j'        dd	          |_(        | j)        *                    |j        d                   }tW          | j)        |j(        |j,        |j-        |j.                   |||j        <   |j        /                                }|||<   |0                    |j                   |j        r8|j        /                                }|||<   |0                    |j                   n|j        d         |_(        |j        D ]}|||j1        <   |||j(        j1        <   tt          |2                                          }t          |2                                          }t          |          } |3                    |          }!t          |!          dk    sJ |3                    |           }!t          |!          dk    sJ |3                    |           }!t          |!          dk    sJ ti          tk          tC          |j                                       D ]\  }"}|"|v r||"         }|j        d         }||j        d         j1        k    s!J d|j        d         j1         d| d            |6                    ||j(        j1                   |7                    ||j(        j1                   |"|v r|8                    |"d           |"|v rb||"         }|j        d         j1        }#|#|j        v sJ d|# d| d            d |j        D             }$g }%g }&|j        D ]@}'|'j9        }(|%0                    |(           |&                    t          |(                     A|:                    |"dd|$i|$|j(        ddddd	d |j'        d!|%d"|&tv          tx          j=        i#          })| j)        *                    |j(                  }t}          |)|j-        |j,        | j)        |j.                   t          ||"|                    |j                  |j(        | j)        tx          j=        dgddd$%
  
        }*|@                                 ||fS )&a  
        conditions for gradients to be grouped:
            1. group size < grad_bucket_size_numel
            2. same dp group (TODO)
            3. same src rank
            4. same dtype
            5. dependency: grad would NOT be used by other ops within group segment

        main logic:
            1. record coalesce group
            2. record all dp allreduce/reduce op idx

            3. insert coalesce op
            4. insert coalesce dependency (avoid allocate memory too early)
            5. modify and remove allreduce/reduce op
            6. ensure sharding-dp hybrid parallel logic

        gradients inside same group would be fuse into one coalesce tensor
        r   Nr   c                     t          | j        | j        z             }d |j        D             }t	          |                    |                    dk    S )Nc                     h | ]	}|j         
S rd   rS  r   r   s     r7   	<setcomp>zHShardingPass._group_grads.<locals>.op_depend_on_group.<locals>.<setcomp>  s    888c888r9   r   )rQ   r   r   r   r`   intersection)r6   r   vars_	var_namess       r7   op_depend_on_groupz5ShardingPass._group_grads.<locals>.op_depend_on_group  sP    *R-@@AAE88UZ888Iu)))445599r9   r   r   zZSharding should reduce grad first and than allreduce if Hybrid Sharding with Data-Parallelr[   Tr   z@Hybrid Sharding with Data-Parallel should sync same gradient varz-Sharding Gradient Communication Optimization:zGradient Bucket size is [r@  z] Gradients are fused into [z
] Buckets.FrF  r   rK   z$Unexpected: it is supposed to sync [z] but got [r   r   z-Unexpected: op is supposed to generate grad [c                     g | ]	}|j         
S rd   rS  )r   grads     r7   r   z-ShardingPass._group_grads.<locals>.<listcomp>t  s    ???Ddi???r9   rH  rI  rJ  rM  rN  r   concated_shapesconcated_ranksr   sharding_grad_coalesce_deprY  )Ar   rF   r   r   VarGrouprQ   r`   r   r   r4   r  r   r  r   r  r   r   
acceptablecollectr   r   coalesce_op_idxr   coalesce_dep_varnamer   reduce_op_indicesrU   r  r   r   allreduce_op_indicesr\  r]  r   r  r   r  r2  rc  r   rd  ri   r   r   r   r   r   popr   r   r^  r  r   r   r"  _rename_outputr   r   r   r   r   Backwardr   r   r   )+rX   r|   r   r   first_backward_opr6   first_backward_varname	cur_groupgrad_groupsgrouped_grad_namesr  r   	is_reduce	grad_namer   rg  grad_varjdep_opdep_varnamer>  r=  modify_reduce_op_mapcoalesce_op_mapremove_reduce_op_indicesr   r,  last_reduce_op_idxlast_allreduce_op_idxr  coalesce_op_setmodify_op_setremove_op_setconflictr   first_grad_name
grad_namesr  r  grad_r   coalesce_opr|  s+                                              r7   r;  zShardingPass._group_grads  s	   0 i&** +,D'  	 	Bb!! $&! $F!2!CA!FT899	 UU	: 	: 	: #c((llQB)"-- 6BGx/ BGGM4J4JM%M%O 5	 !  p y /2	:9EE
$11*== 99Y//''$77 6$,>>>>>%%h5555&&y111 ()D E EI%%h555y~&&!++01AI- A(QU44 Q )QU44  QZF"("9!"<K5@I2"&&y111+221555* }/N/N0 0  37I/q1u:?l::s1q5z%@ @ *37@ @ @
 u@ @  q1u:6q9YFFFZ GFF 299!a%@@@FA##B	22 B""9---$T%@AA	FAu #c((llx y~!##y)))DEEE [(C  [  [M_I`I`  [  [  B  CN  O  O  [  [  [	
 	
 	

 "$ "!#% !+.. "	C "	CHAu5:""%*%5%5$-1CFF:   + %"& &6 & &" &GG
1  
 "&&!.!.*3    :? 56%*%<%@%@%B%B";@$%78(//0GHHH* P,1,F,J,J,L,L)BG()>?,33E4NOOO%*Z]"
 : :49&ty11=B!%"4"9::o2244550557788455"//>>8}}!!!!"//>>8}}!!!! --m<<8}}!!!!  Yuy%9%9 : :;; H	 H	GC***,S1/2	 EJrN$7777g5:b>;Ngg[dggg 877   E,>,CDDD!!)U-?-DEEE...  5 111o%%',"'*Q-"4&"*====eOee`beee >== @?EJ???
"$!#"Z 6 6E!KE#**5111"))#e**5555#;;*#Z0",','9 
 $U#T)?(.#V_ <  & &GG*  
 G!.!.&*3    9IIe899&&O" "'!=  	 	$&<<<r9   c                 R  ,-. | j         sdS g | _        |j        j        }t	          | j                  D ]C}|dk    r|j        }nt          |d          }d| }| j                            ||d           D|j        }	i }
d}i }t          |	          D ]\  }}t          |          r|j        dk    r-|                    d          t          j        j        j        fv rN|| j        z  }|||<   | j        |         d	         }| j        |         d
         }|                    d          d         }||         }|j        j        |k    sJ t)          |j                  dk    ri||j        d         |j        |d|j        fg|
|<   |dz   }| j        r|j        r|dz  }|
|                             ||j        |j        |d|j        f           ||j        _        | j        |j        _        |                    d|j                   | j        r|j        r|	|dz            }|j        dk    r,|                    d          t          j        j        j        fv sJ |                    d          d         |k    sJ ||j        _        | j        |j        _        |dz  }|j        dk    r|                    d          t<          j        j        k    rX|j        sQ|                      d          s<||
vrg |
|<   |
|                             |dz   |j        |j        dd|j        f           |dz  }|dz  }tC          |
"                                d          }|D ]}|
|         ddd         D ]\  }}}}}}|dk    rdnd}tG          ||||| j$        tJ          j&        dgdd||          }|AtO          ||j(        | j$        |j)                   |||j        _        | j        |j        _        | j*        r d-| j+        -z  dk    sJ |j        }|j        }| j,        -z  .| j,        -z  ,-.fd|D             }tZ          .                    d           tZ          .                    d| j,         d           tZ          .                    d| d           t)          |          | j+        -z  k    sJ ,-fd|D             } t)          |           -k    sJ tZ          .                    d|  d           g }!g }"t	          | j                  D ]J}#|!                    t          |d                     |"                    t          | d                     Kt_          ta          t          |j                                      D ]\  }}t          |          rz|j        dk    r(|                    d          t<          j        j        k    sJ ||         }$|!|$         }%|"|$         }&|                    d          d         }| j         r|j        j        }tc          |                    d                    }'d}(|'-z  .k    rd}(|'-z  })|                    d|&j                   |                    d|)           |(r|'-z  }*|2                    |dz   dd |id!|id|%j        d|*dt<          j        j        tf          tJ          j&        i"          }+|+                    d#d$th          j5        z              | j         r||+j        _        | j        |+j        _        |6                                 dS )%a  
        overlap gradient communication with backward & optimizer computation.

        1. assign gradient communications to grad comm stream
        2. for coalesce gradient communication:
            2.1 insert before communication dependencies
            2.2 insert after communication dependencies only when need
        3. there is not need to add explicit dependencies for non-coalesce gradient communication

        P.S. this overlap pass is ONLY adapted for standalone executor (graph based) and stream award allocator.
        Nr   TrA  sharding_grad_comm_streamrC  r   r   rD  rE  r   r   rK   sharding_grad_comm_depr   r   rM   sharding_reduce_avg_deprV  F)r   rZ  r   r1   skip_insert_when_sequential_run)r   ctxr      c                 &    g | ]}|z  k    |S rd   rd   )r   rg  nranks_per_noderelative_idx_in_nodes     r7   r   z3ShardingPass._overlap_grad_comm.<locals>.<listcomp>|  s4          /)-AAA AAAr9   z:Sharding Gradient Hierarchical Communication Optimization.zcurrent global rank idx: rT  zlocal inter node ranks idx: c                 &    g | ]}|z  k    |S rd   rd   )r   rg  node_idxr  s     r7   r   z3ShardingPass._overlap_grad_comm.<locals>.<listcomp>  s3          ?*h66 666r9   zlocal intra node ranks idx: root_idr   r   r   r1   r   )7rB   grad_comm_group_stream_pairsr   r   r`  rD   r   r   r   r   r   r   r4   r   r   r   r   rk  rd  r   r`   r   r   rU   r  rh  r4  ri  r   r   r  r  r]   rm  r^  r   ri   r   r  r   r   r   rH   rj   rJ   r\  r]  r   r   r_   r   r   r   r   r   )/rX   r|   r   r=  r>  r   r   r   streamr   rt  reduce_op_countgrad_comm_op_to_stream_idxr   r6   
stream_idxrD  rE  reduce_varname
grad_grouppost_idxnext_opry  rz  r{  r1   r   r  r|  global_groupglobal_ranksinter_node_ranksintra_node_ranksinter_node_groupsintra_node_groups_grad_comm_stream_idxinter_node_groupintra_node_groupdst_rankin_peerintra_node_dstinter_node_dstr   r  r  r  s/                                               @@@r7   r<  zShardingPass._overlap_grad_comm  s	   & " 	F,.)#) t011 	 	AAvv%+)%FFF 544F-44"'#)     i%'" ~~ g	 g	GC)"-- d%7l**rww}/E/E&/3J 0 0 ,t/HH
1;*2.!>zJ 
 #?
K! "$5!1!1!!42>B
!.3~EEEE z''!++  &OB/&3'4L	$GCL  #QwH. &:3O & ACL''$&3&O'4L	 	 	 1<-4 0 Y
666* z/K !#'lG"<<77GLL%= = *37= = = 
 #>>%003~EEEE :EG%68 %9 1HC Gx''..$-2CCC&8 D MM*JKK D '))')CL''!G&3&3 5L	 	 	  1$1HCC 555 (	 (	A DDbD!' '  *-FFFEED 0 9&O" "'!-4S  	  (>!%.%; .!*!3	    #.?J	+< < "+?K'T ( Z	  O+o=BBBB(.L'-L#'#3o#E '?:H         (     
 LLL   LLHT5EHHHIIILLK8HKKKLLL$%%+>? ? ? ?         (     
 '((O;;;;LLK8HKKKLLL " "4455  !((%&6MMM   "((%&6MMM   
 $D59)=)=$>$>?? / /R-b11 .8++GGM22dm6GGGGH ,Fb+I('89M'N$'89M'N$%'YYu%5%5a%8N* D&(l&C"2779#5#566H#G/15III"&%-%?NLL,<,?@@@LLN;;; )1_)D!&!>!>!G!)$'#8 %~% !*+;+> )> -t}/@ +V_	# "? " " ((*C,2K,K    . @KF,= $ @ #,@ 	r9   )__name__
__module____qualname__rO   rb   rf   r}   ro   r   r   rq   r   r   r   r   r   r   r   rr   rs   rt   r7  r8  r9  r;  r<  __classcell__)rY   s   @r7   r<   r<   [   s       & & & & &0& & &P  3? 3? 3?j= = =  */J /J /JbE E E=$ =$ =$~B$ B$ B$H$ $ $&' &' &'P!$ !$ !$F; ; ;
  6$ 6$ 6$pE' E' E'NQ Q Q2
 
 
"o$ o$ o$b  |= |= |=|` ` ` ` ` ` `r9   r<   c                 X    | j         D ]}|j        t          v r|c S t          d          )NzCould not find optimizer op.)r   r   r   	Exception)r|   r6   s     r7   rl  rl    s@    i  7///III 0 2
3
33r9   c                    |                      |          }|                    |          }	|                     |dd|id|id|d|t          |i          }
|
                    ddt
          j        z              t          |
|	j        |	j	        ||	j
        	           ||k    rZ|                     |d
d|j        id|j        d|j        t          |i          }
t          |
|	j        |	j	        ||	j
        	           dS dS )z%
    empty op for initialization
    r   r   r   r   r   r   r1   r   r   emptyr   r   r   r   N)r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r|   
insert_idxr   r  r(  r   r   r>   ru  broadcast_var_dist_attrr   s              r7   r$  r$    sW    IIg&&M*KK  **W~ wI
 + 
 
F ^S<+D%DEEE:,,(1    Y..M./,,W	 / 	
 	
 	?#0#0,5	
 	
 	
 	
 	
 	
 r9   c	                 b   |dk    sJ d|             |                      ||d|gid|gid|d|d|t          |i          }	|                    |                     |                    }
t	          |	|
j        |
j        ||
j        	           |	                    d
dt          j
        z              |	S )Nr   z5root id should be a positive int, but now root id is r   r   r   r  r   r   r   r1   r   )r   r   r   r   r   r   r   r   r   r   r   )r|   op_typer  
reduce_varr   r  r>   r   r   r   r   s              r7   r  r    s     a<<<III << **j\"%ww;	
 +  F ==		* I ;#    ^S<+D%DEEEMr9   c                     d}d}t          |           |z  |g}t          | |||          }t          | |||          }||fS Nr   r   )r`   r   )origin_groupsharding_group_sizerg  dp_axissharding_axisr   r   r   s           r7   r   r   ;  sV    GM"557JKE|UGTBBH$\5-NNN^##r9   c                     | j                             d          o,| j                             d                              d          S )Nr1   z/gradient_clipr2   r3   r4   
startswithr5   s    r7   r   r   F  sB    7N++ #1 1j!""#r9   c                     | j                             d          o,| j                             d                              d          S )Nr1   z/regularizationr  r5   s    r7   r   r   L  sB    7N++ $1 1j"##$r9   c                 X   t          |          sdS t          | |t          t          j        j        j                  sdS t          | |          rdS |j        d         }|d |	                    d                   }| 
                    |          sdS |                     |          j        S )NFr   r   )r   _is_desired_cast_oprz   r   rl   rm   FP32r   r   r   r   r   is_parameter)r|   r6   r   r  s       r7   r   r   R  s    " ur')=)B   uub)) u%a(K3k..s3334I==## u99Y,,r9   c                 r    t          |          rdS t          | |          sdS |j        d         }||vrdS dS )NFr   T)r   r  r   )r|   r6   r   r   s       r7   r  r  b  sP    b uub)) u#A&Ju4r9   c                 8   |j         dk    rdS t          |j                  dk    sJ t          |j                  dk    sJ |                     |j        d                   }|                     |j        d                   }|j        |k    s|j        |k    rdS dS )Nr  Fr   r   T)r   r`   r   r   r   r   )r|   r6   src_var_typedst_var_typer*  
output_vars         r7   r  r  m  s     
w&ur!""a''''r"##q((((		",Q/00I2.q122J,&&**:l*J*Ju4r9   c                     d }d| v r| d |                      d                   }nCd| v r| d |                      d                   }n!d| v r| d |                      d                   }|S )Nz.cast_fp16@GRADz.cast_bf16@GRADr   )r   )r  r  s     r7   r  r    s    II%%A	/@ A AAB			i	'	'A	/@ A AAB			I		7	w 7 778	r9   c                     t          |           sdS | j        d         }t          |          }|                    |          sdS |                    |          j        S )NFr   )r   r   r  r   r   r  r6   r|   r   r  s       r7   r  r    s_    %b)) u%a(K-k::I==## u99Y,,r9   c                     t          |           sdS | j        dk    rdS | j        d         }t          |          }|                    |          sdS |                    |          j        S )NFr   r   )r   r   r   r  r   r   r  r  s       r7   r  r    sq    " u	w%u%a(K-k::I==## u99Y,,r9   c                     | j         dk    o?| j                            d          o%t          j        | j                            d          v S )Nr   r1   )r   r2   r3   r   r   r4   r5   s    r7   rj  rj    sI    
; 	FG^,,	F%n)E)EEr9   c                    d }|j         D ]}t          ||j        |          s|                    |          }|j        }|                    |          }|j        }	t          |          dk    rd|d         }
|
dk    rA|	|
         dk    r5||
|k    r-t          |j	        |j        |
|           }t          |          } n|S )Nr   rK   r   )r   r   r|   r   r   get_input_dims_mappingr   r`   r   process_idsr   )rank_idr6   r>   r  r   r   r   r   input_dim_mapping
mesh_shapebatch_size_axisgroup_rankss               r7   r   r     s     H(  
 $J,GG 	$AA"EEI$1L ) @ @ L L%+J$%%**/2O##
?(Ca(G(G?o&@&@"1$0$*'	# #K  1==HEOr9   c                 *   i }d}g }| D ]-}t          |          }||z  }|                    ||f           .d t          |          D             }d}d}|D ]<\  }}||dz  |dz   z  |z  k    r|dz  }||                             |           ||z  }=|S )z
    shard the continuous param into same rank and divide the forward&backward computation into segment,
    which will favor the fuse pass in later.

    we assume that the params is already sorted by utilization order.
    g        c                     i | ]}|g S rd   rd   )r   r   s     r7   
<dictcomp>z*partition_by_use_order.<locals>.<dictcomp>  s    000q"000r9   r   g      ?r   )r   r   r`  )	r   
group_sizemappingtotal_param_mem	param2memr   memcur_rankmem_accus	            r7   partition_by_use_orderr	    s     GOI ' '5!!3%&&&&00eJ//000GHH  
so+x!|<zIIIMH  '''CNr9   c                 N   i }t          |          D ]}g ||<   dg|z  }| D ]}|                    t          |                    }||                             |           t	          d |j        d          }|dk    sJ d|j         d| d            ||xx         |z  cc<   |S )zJ
    use greedy algorithm to partition parameter as even as possible.
    r   c                     | |z  S r   rd   )r   ys     r7   <lambda>z*partition_by_greedy_even.<locals>.<lambda>  s
    AE r9   r   zparam [z#] should larger than 0, but it is [r   )r`  indexminr   r   r   r   )r   r  r  rank_sizesr   rg  numels           r7   partition_by_greedy_evenr    s     Gz""  C*E  {{3u::&&U###))5;::qyyyMejMMUMMM yy 	duNr9   greedy_evenc           
      |   |dk    rt          | |          }nt          | |          }t                              d           |                                D ]e\  }}t                              d| dt          d |D                        d           t                              dd |D              d	           f|S )
Nr  zSharding Parameter Partition:zRank:z, Parameter Size:c                 ,    g | ]}t          |          S rd   r   r  s     r7   r   z(partition_parameters.<locals>.<listcomp>  s     ,L,L,L3\#->->,L,L,Lr9   z MB.zParams in this rank: c                     g | ]	}|j         
S rd   rS  r  s     r7   r   z(partition_parameters.<locals>.<listcomp>  s    -D-D-D3ch-D-D-Dr9   rT  )r  r	  r\  r]  itemsr   )r   r  algorrank_to_paramskvs         r7   partition_parametersr    s    1&*EE/
CCLL0111$$&& I I1SASS,L,L!,L,L,L(M(MSSS	
 	
 	
 	G-D-D!-D-D-DGGGHHHHr9   c                    i |D ]\  }}||f|j         <   g }| j        D ]K}|j        D ]}|v r||vr|                    |            t	          |          t	                    k    r nL| j        d         }i }	t	          | j                  }
g }t          |          r|j        t          v rxt          t          t          | j                                      D ]n\  }}|j        t          v r[t	          |                    d                    dk    sJ ||	|                    d          d         <   |                    |           ot	          |          t	          |	          k    sJ |D ]l}|                     d          }|j                            |	|         j                   |                    ||                    |	|                              m|D ]}|                     |d           |                                  t	          | j                  |
k    sJ t&                              d	| d
           fd|D             S )NrK   r   r   r   nop)r   Fr   z(Sharding the Order of param being used: rT  c                      g | ]
}|         S rd   rd   )r   rQ  pname_to_pg_pairss     r7   r   z$re_order_program.<locals>.<listcomp>=  s    444Qa 444r9   )r   r   r   r   r`   r   r   r   r   r   r   r   r   r2   	copy_fromset_op_dist_attr_for_programr   r   r   r\  r]  )r|   param_gradsr>   rQ  g	use_orderr6   r   last_oppname_to_opnum_opsremove_op_indicesr   pnamer   r!  s                  @r7   r   r     s    + +1%&F!&!!Ii  , 	- 	-J///)++  ,,,y>>S!23333E 4 imGK%)nnGg )7<3L#L#LYuy%9%9 : :;; 	. 	.GCw333288G,,--222246BHHW--a01!((---9~~[!1!11111  	 	E__%_00FK!!+e"4"9:::5599+e:LMM    % 	. 	.CSu----59~~(((( LLHIHHHIII4444)4444r9   c                    i }i }g }t          |          }| j        D ]}|                     |j                  }|                    ||          r|                    ||           n%t          |          }|                    ||           |                     |j                  |_        ||v r!||                             |j                   n|j        g||<   |||j        <   ||fS )zA
    param are group by:
    rank id
    fuse_size
    dtype
    )r  r   r   r   r  r  r  r   )r   	fuse_sizern  ro  bucketr  r   rg  s           r7   r[  r[  @  s    F##I% 3 3))%*55t,, 	+eT**** ++IeT***&3&E&EJ'
 '
	# ***y)00<<<<-2ZLy))25:&&111r9   c                   2    e Zd Zd Zd Zd Zd Zd Zd ZdS )r   c                    || _         d |D             | _        t          | j                  t          t          | j                            k    s
J d            d |D             | _        d | j        D             | _        |j        | _        || _        |j	        
                    | j                  | _        || _        t          | j        | j        | j                  | _        i | _        |                                  d S )Nc                 (    i | ]\  }}|j         ||fS rd   rS  )r   rQ  r%  s      r7   r  z)ShardingInfo.__init__.<locals>.<dictcomp>e  s$    EEE1QVaVEEEr9   z&found duplicated param in params_gradsc                     g | ]\  }}|S rd   rd   )r   rQ  r  s      r7   r   z)ShardingInfo.__init__.<locals>.<listcomp>j  s    222TQq222r9   c                     g | ]	}|j         
S rd   rS  rP  s     r7   r   z)ShardingInfo.__init__.<locals>.<listcomp>k  s    888qAF888r9   )r   rI   r`   rQ   r   r  r   r  rJ   r   r  r  rG   r  r  param_to_rank_map_param_to_rank)rX   r   rg  rI   rG   s        r7   rO   zShardingInfo.__init__c  s    
EEEEE4$%%S1B-C-C)D)DDDD4 EDD 32\22288DK888,+++D,<==.2K$*>
 
  !!!!!r9   c                 l    | j                                         D ]\  }}|D ]}|| j        |j        <   dS )z@
        mapping parameters to the rank which holds it.
        N)r  r  r4  r   )rX   rg  r   r   s       r7   r5  zShardingInfo._map_param_to_rankx  sV     !/5577 	6 	6LD& 6 615"5:..6	6 	6r9   c                 2    || j         v r| j         |         S dS )NrK   )r4  )rX   r   s     r7   r   zShardingInfo.get_var_rank  s#    d(((%g..rr9   c                 >    |                      |          | j        k    S r   )r   r  rX   r   s     r7   r  zShardingInfo.is_in_local_shard  s      ,,??r9   c                    t                      }t                      }i }t                              | j        d          }|j        D ]5}t          |          r|j        D ]}|| j        v r||xx         dz  cc<   6|j        D ]}t          ||| j                  s|j        d         }|j        d         }|	                    |           |	                    |           |||<   ||xx         dz  cc<   | j
        |         | j
        |<   |                                D ] \  }	}
|
dk    r|	                    |	           !||fS r  )rQ   dictfromkeysr  r   r   r   r  r   r   r4  r  )rX   r|   broadcast_varsfp16_paramsfp16_to_fp32r&  r6   r   r   r   usages              r7   r  z/ShardingInfo.get_broadcast_vars_and_param_usage  s   eemmD$4a88) 	1 	1Bb!!  0 1 1
!111
+++q0+++1 ) 		M 		MB)%T5EFF +A.J-a0K{+++OOK((((2L%
###q(###.2.@.LD{++'--// 	* 	*LE5qyy""5))){**r9   c                     |                      |          st          d| d          || j        vrt          d| d          | j                            |d           S )Nzparam[z] not in current rank.z] not in params_grads)r  
ValueErrorrI   getr9  s     r7   r  zShardingInfo.get_param_grad  st    %%j11 	JHjHHHIIIT...GjGGGHHH $$Z666r9   N)	r  r  r  rO   r5  r   r  r  r  rd   r9   r7   r   r   b  sq        " " "*6 6 6  @ @ @+ + +:7 7 7 7 7r9   r   c                   &    e Zd Zd Zd Zd Zd ZdS )r  c                     || _         d | _        d| _        d| _        g | _        d | _        d | _        d | _        g | _        g | _	        d| _
        d S )NrK   r   F)max_sizer   rg  r  r   rd  r  r  r  r  r  )rX   rF  s     r7   rO   zVarGroup.__init__  s[     
	
	 $(!#!#$&!!&r9   c                     | j         dk    rdS |j        | j        k    rdS || j        k    rdS | j         t          |          z   | j        k    rdS dS )Nr   TF)r  r   rg  r
   rF  rX   r   rg  s      r7   r  zVarGroup.acceptable  s^    :??4{dj((uty  uzM%0004=@@u4r9   c                     |j         | _         || _        | xj        t          |          z  c_        | j                            |           d S r   )r   rg  r  r
   r   r   rH  s      r7   r  zVarGroup.collect  sF    [
	

mE***

	r9   c                 *    t          | j                  S r   )r`   r   ra   s    r7   __len__zVarGroup.__len__  s    49~~r9   N)r  r  r  rO   r  r  rK  rd   r9   r7   r  r    sP        ' ' '
 
 
         r9   r  r   )r  )Ologging	functoolsr   r   paddle.distributedr   r  8paddle.distributed.auto_parallel.static.operators.commonr   r   r   5paddle.distributed.auto_parallel.static.process_groupr   -paddle.distributed.auto_parallel.static.utilsr   r	   r
   r   r   r   r   r   r   r   r   7paddle.distributed.fleet.meta_optimizers.sharding.utilsr   paddle.frameworkr   paddle.staticr   r   paddle.utilsr   auto_parallel_master_gradr   	pass_baser   r   
pass_utilsr   op_proto_and_checker_makerr   kOpRoleAttrNamer   r   r   INFOr\  rl   rm   FP16rz   r{   r8   r<   rl  r$  r  r  r   r   r   r   r  r  r  r  r  r  rj  r   r	  r  r  r   r[  r   r  rd   r9   r7   <module>r]     s           ! ! ! ! ! !         
                               Q P P P P P ! ! ! ! ! ! G G G G G G G G $ $ $ $ $ $ > > > > > > . . . . . . . . . . . . . .		(	/-==??  	    *W\
"
"|+0 % C C C '((r r r r r8 r r )(rj+4 4 46
 6
 6
D O& & & &R$ $ $# # #$ $ $- - -    %*%	   &  
- 
- 
-- - -   (,   >  4  (    15 15 15h2 2 2DM7 M7 M7 M7 M7 M7 M7 M7`! ! ! ! ! ! ! ! ! !r9   