
    Αi'                        S SK r S SKJr  S SKJr  S SKJr  S SKrS SKJ	r	  S SK
JrJr  S SKJr  S SKJrJrJrJr  S S	KJr  S
SKJr  \	R2                  R4                  R6                  \	R2                  R4                  R8                  \	R2                  R4                  R:                  \	R2                  R4                  R<                  \	R2                  R4                  R>                  /r \" \ RB                  5      r" " S S\5      r#S/S jr$S r%S r&S r'S r( " S S5      r)S r*S r+S r,S r-S r.S r/S r0S0S jr1S0S jr2S r3S/S jr4S  r5S! r6S" r7S1S# jr8S$ r9 S0S% jr:S& r;S/S' jr<S( r=S) r>  S2S* jr? S3S+ jr@S, rA " S- S.5      rBg)4    N)OrderedDict)Enum)reduce)core)	ParameterProgram)OperatorDistAttr)
get_loggeris_backward_opis_optimize_op6naive_set_dist_op_attr_for_program_by_mesh_and_mapping)_current_expected_place_   )OpRolec                        \ rS rSrSrSrSrSrg)AutoParallelStreamType3   defaultauto_parallel_mpauto_parallel_sharding N)__name__
__module____qualname____firstlineno__CALC_STREAM	MP_STREAMSHARDING_STREAM__static_attributes__r       d/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/passes/pass_utils.pyr   r   3   s    K"I.Or    r   c                 x    Uc  [        5       nO[        U[         5      (       d   eU  H  nX!;  d  M
  SX'   M     U$ )NT)r   
isinstance)list_objordered_dictobjs      r!   list_to_ordered_dictr'   9   sB    "},4444" $L  r    c                     [        5       n/ nU R                  5       R                   Hc  nUR                   H,  nXA;  d  M
  UR	                  U5        UR                  U5        M.     UR                   H  nUR                  U5        M     Me     U$ N)setglobal_blockopsinput_arg_namesappendaddoutput_arg_names)programvisited_vars
input_varsopin_var_nameout_var_names         r!   get_inputs_of_programr7   F   s    5LJ""$((--K.!!+.  - .
 //L\* 0 ) r    c                     [        5       nU R                  5       R                   H  n[        UR                  U5        M     [        UR                  5       5      $ r)   )r   r+   r,   r'   r0   listkeys)r1   output_varsr4   s      r!   get_outputs_of_programr<   T   sF    -K""$((R00+> )  "##r    c                    [        U R                  5       R                  5      nUS:  a  X-  nUS:  a  X:  d   eUS:  a  X#-  nUS:  a  X#::  d   U5       eX:  d   eU R                  5       n [	        US-
  US-
  S5       H!  nU R                  5       R                  USS9  M#     [	        US-
  SS5       H!  nU R                  5       R                  USS9  M#     U R                  5         [        5       nU R                  5       R                   HK  nUR                   H  nUR                  U5        M     UR                   H  nUR                  U5        M     MM     / n	U R                  5       R                   H  n
X;  d  M
  U	R                  U
5        M     U	 H!  n
U R                  5       R                  U
SS9  M#     U R                  5         U $ )Nr      Fsync)lenr+   r,   clonerange
_remove_op_sync_with_cppr*   r-   r/   r0   varsr.   _remove_var)r1   start_op_idx
end_op_idxop_numidx
valid_varsr4   r5   r6   vars_to_removevars              r!   prune_programrP   [   s   %%'++,Fa1!666A~
?z3?Z?3$$$mmoGVaZa4))#E): 5\A%r2.))#E): /J""$((--KNN;' .//LNN<( 0 ) N##%** !!#& + **3U*; Nr    c                    U(       d   S5       e[        U R                  5       R                  5      nUS:  d   S5       eU Vs/ s H  o3S:  a  UOX2-   PM     nnUS   S:w  a  S/UQnUS   U:w  a  UR                  U5        [	        [        U5      S-
  5       H  nX   XS-      :  a  M   S5       e   / n[	        [        U5      S-
  5       H(  n[        XU   XS-      5      nUR                  U5        M*     [        U5      nU Vs/ s H  n[        U5      PM     nnU Vs/ s H  n[        [        U5      5      PM     n	n[	        U5       V
s/ s H  n
[        5       PM     nn
U	S   US'   [	        SU5       H;  nX    H0  n[        [	        U5      5       H  nXU   ;   d  M  SX   U'     M.     M2     M=     U Vs/ s H  n[        UR                  5       5      PM     nnXHU4$ s  snf s  snf s  snf s  sn
f s  snf )aY  
Split the program by op_indices.

For examples, a program has 100 ops, and op_indices = [25, 60].
Then the program is split into 3 parts, containing 25, 35 and 40
ops respectively.

The return values are a tuple with 3 elements: the split program
list, the input var names of each split program, and the output
var names of each split program.
zop_indices cannot be emptyr   zprogram cannot be emptyr?   r>   z"op_indices must be strictly sortedT)rB   r+   r,   r.   rD   rP   r7   r'   r<   r   reversedr9   r:   )r1   
op_indicesrK   rL   split_programs	new_split	num_splitpr3   r;   _valid_output_varsir5   jitems                   r!   split_programr]   ~   s"    333:%%'++,FA:000:?IJz#s|3zJJ!}%*%
"~&!S_q()!G!44 	
0	
4 *
 NS_q()!'c?JQw<OP	i( * N#I4BCNq'*NJCAOAOA3A67   16i0@A0@10@A'Ob1i %=KeAh'a.08<%(5 ( ) ! 8II7Htdiik*7HI'888? K$ D B Js   G&G+#G0G5=#G:c                   @    \ rS rSrSrS r\S 5       rS rS r	S r
Srg	)
OpInOutInfo   z[
Record unused buffer input_vars of op and other var_names except unused buffer input_vars
c                 N    SU l         [        5       U l        [        5       U l        g )NF)	_is_buildr*   _no_need_buffer_slots_other_arg_names_setselfs    r!   __init__OpInOutInfo.__init__   s    %(U"$'E!r    c                     U R                   $ r)   )rb   re   s    r!   is_buildOpInOutInfo.is_build   s    ~~r    c                     0 nUR                    H  nUR                  U5      X#'   M     0 nUR                   H  nUR                  U5      XE'   M     0 nUR                   H  nUR                  U5      Xg'   M     X$U4$ r)   )input_namesinputoutput_namesoutput
attr_namesattr)rf   r4   inputs
input_nameoutputsoutput_nameattrs	attr_names           r!   _get_op_attrsOpInOutInfo._get_op_attrs   s{    ..J!#*!5F )??K#%99[#9G  +I!wwy1E ' %%r    c                    U R                  U5      u  p#n[        R                  " UR                  X#U5      U l        [        U R                  5      S:X  a  g UR                   HG  nXPR                  ;  d  M  UR                  U5       H  nU R                  R                  U5        M      MI     UR                   HG  nXPR                  ;  d  M  UR                  U5       H  nU R                  R                  U5        M      MI     SU l        g )Nr   T)ry   r   infer_no_need_buffer_slotstyperc   rB   rm   rn   rd   r/   ro   rp   rb   )rf   r4   rs   ru   rw   	slot_namein_nameout_names           r!   
build_infoOpInOutInfo.build_info   s    !%!3!3B!7%)%D%DGGVe&
" t))*a/I : ::!xx	2G--11':  3 (
 I : :: "		) 4H--11(; !5 )
 r    c                 \    [        U R                  5      S:H  =(       d    XR                  ;   $ Nr   )rB   rc   rd   )rf   arg_names     r!   	is_neededOpInOutInfo.is_needed   s+    **+q0 5444	
r    )rb   rc   rd   N)r   r   r   r   __doc__rg   propertyrj   ry   r   r   r   r   r    r!   r_   r_      s/    *
  &(
r    r_   c                 \    UR                  U 5      nUS L=(       a    UR                  (       + $ r)   )_find_var_recursivepersistable)var_nameblockrO   s      r!   var_can_be_deletedr      s(    

#
#H
-Cd?23??22r    c                 v   [        5       nU R                   H  nUR                   H  nUR                  S;   a  M  [	        5       nUR                  U5        UR                  UR                  -    H>  n[        XR5      (       d  M  UR                  U5      (       d  M-  UR                  U5        M@     M     M     U$ )zV
Get all vars in the program that are non-persistable and not in op's no_need_buffer.
)c_sync_comm_streamconditional_blockdatanopwhile)r*   blocksr,   r}   r_   r   r-   r0   r   r   r/   )r1   required_varsr   r4   op_infor   s         r!   _get_required_vars_of_programr      s     EM))Bww   !mGr"..1D1DD%h667;L;L< < "%%h/	 E   $ r    c                     [         R                  R                  R                  S5      S   (       a  [	        XX#5      $ [        XX#5      $ )a  
Set `skip_gc_vars` for every job in jobs.

A whole_program is split up into sub_programs according to the schedule mode,
thus a sub_program's vars might be used as the op's input of the later sub_program,
and these vars cannot be gc after executing current sub_program.
FLAGS_enable_pir_api)paddlebase	framework	get_flags_set_skip_gc_vars_in_pir_set_skip_gc_vars_in_old_ir)num_micro_batches	job_typessub_programsjobss       r!   set_skip_gc_varsr     sM     {{&&'=> (,
 	
 +,
 	
r    c           	      N   U S:  d   S5       e[        [        X5      5      n0 nUR                  5        H  u  pg[        U5      XV'   M     [	        U 5       Vs/ s H  n[        5       PM     n	n[        U5      n
[        [	        U
5      5       H  nX;   nUR                  5       nX]   nUR                  5       nXU   -  n[        R                  SU SU SU 35        US;   a  [        U5      S:X  d   SU S	U S
35       eUR                  U5        X==   U-  ss'   M     U$ s  snf )Nr>   "num_micro_batches needs to be >= 1Skip gc vars for -(): backward
backward_wr   BWhen enabling pipeline parallelism strategy, the skip_gc_vars for % subprogram must be empty, but it is .)dictzipitemsr   rD   r*   rB   rR   r}   micro_batch_idloggerdebugr   )r   r   r   r   type_to_programtype_to_required_varsr}   r1   rZ   suffixed_required_varsnum_jobsjob_idjobjob_typer   r   skip_gc_varss                    r!   r   r     sc    !G#GG!3y78O (..0&CG&L# 1 .33D-EF-Ece-EF4yH5?+l88:-7++-$n'MMzN+;3|nM	
 11|$) TU]T^  _D  EQ  DR  RS  T) 	\*.-?.! ,$ ) Gs   D"c           	      2   U S:  d   S5       e[        [        X5      5      n0 n[        R                  " U5      nUR	                  5        GHk  u  px[        5       n	[        5       n
UR                  5       R                  5        H  nU	R                  U5        M     UR                  5       R                   H  nUR                  5        H_  nUR                  (       d  M  U	R                  UR                  5        UR                  (       d  MD  U
R                  UR                  5        Ma     UR                  5        H_  nUR                  (       d  M  U	R                  UR                  5        UR                  (       d  MD  U
R                  UR                  5        Ma     M     Xv;   a  XU   -  n	X-  n	XU'   GMn     [        U 5       Vs/ s H  n[        5       PM     nn[!        U5      n[#        [        U5      5       H  nUU   nUR%                  5       nXW   n	UR'                  5       nXU   -  n[(        R+                  SU SU SU 35        US;   a  [!        U5      S:X  d   SU S	U S
35       eUR-                  U5        UU==   U	-  ss'   M     U$ s  snf )Nr>   r   r   r   r   )send_backwardr   r   r   r   r   )r   r   r   get_no_need_buffer_valuesr   r*   r+   kwargsr/   r,   operands_sourcehas_namenamer   resultsrD   rB   rR   r}   r   r   r   r   )r   r   r   r   r   r   no_need_buffer_varsr   r1   r   persistable_varskeyr4   rO   rZ   r   r   r   r   r   r   s                        r!   r   r   <  sq   !G#GG!3y78O 88I,2245'')002Cc" 3&&(,,B))+<<<!%%chh/(,,SXX6	 ,
 zz|<<<!%%chh/(,,SXX6	 $ - *::M)*7h'' 5, .33D-EF-Ece-EF4yH5?+6l88:-7++-$n'MMzN+;3|nM	
 66|$) TU]T^  _D  EQ  DR  RS  T) 	\*~.-?.! ,$ ) Gs   Jc                 ~   0 nUR                   US'   UR                  US'   UR                  US'   UR                  US'   UR                  US'   [        SU UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S.
UD6  g )N	trainableoptimize_attrregularizerdo_model_average	need_clip)
r   r}   r   shapedtype	lod_level
error_clipstop_gradientis_databelong_to_optimizerr   )r   r   r   r   r   r   r}   r   r   r   r   r   r   r   r   )	dst_blocksrc_varcopied_kwargss      r!   _create_paramr   p  s    M!(!2!2M+%,%:%:M/"#*#6#6M- (/(@(@M$%!(!2!2M+ \\\\mmmm##%%++#77 r    c                     U R                  UR                  UR                  UR                  UR                  UR
                  UR                  UR                  UR                  UR                  UR                  S9
  g )N)
r}   r   r   r   r   r   r   r   r   r   )
create_varr}   r   r   r   r   r   r   r   r   r   )r   r   s     r!   _create_interr     se    \\\\mmmm##''%%++#77  r    c           
         U(       d  U R                  U5      nOU R                  U5      nUR                  [        ;   a_  [	        USS5      nUR                  UR                  UR                  UUR                  UR                  UR                  UR                  S9  g [        U[        5      (       a  [        X5        g [        X5        g )Nr   F)r}   r   r   r   r   r   r   )rO   _var_recursiver}   __not_shape_var_type__getattrr   r   r   r   r   r   r#   r   r   r   )	src_blockr   src_varnameforce_creater   persists         r!   _create_varr     s    --,**;7||--'=%8))!//OO ' ; ; 	 	
 gy)))-)-r    c                    UR                   R                  5       nUR                  UR                   5        UR                   HF  nU R	                  U5      (       d!  U(       d  M"  U R                  U5      (       d  M:  [        XXS5        MH     UR                   HF  nU R	                  U5      (       d!  U(       d  M"  U R                  U5      (       d  M:  [        XXc5        MH     g r)   )desc	append_op	copy_fromr-   has_varr   r   r0   )r   r   src_opr   dst_op_descinput_varnameoutput_varnames          r!   _create_programr     s    ..**,K&++&//]++LY::=II	mJ	 0
 !11^,,LY::>JJ	nK	 2r    c                    U R                    H  nUR                   H  nUR                  5       S:X  a^  UR                  SS5        UR                  SS5        UR	                  5       S   nUR                  SU 35        UR                  S5        Mu  UR                  5       S	:X  d  M  UR                  SS5        UR                  SS5        UR                  S
5        UR                  S5        M     M     g)au  
This function is used to replace the function '_insert_sync_for_fthenb_1f1b'.
The finally target of this function is as follows:
    1. no need to insert the 'c_sync_calc' and 'c_sync_calc' operators
    2. 'send_v2' operator uses 'dist_attr.execution_stream' to set stream of its own.
    3. 'recv_v2' operator uses 'dist_attr.execution_stream' to set stream of its own.
zpd_op.send_v2dynamic_shapeFuse_calc_streamTring_idsend_stream_r   zpd_op.recv_v2recv_streamN)r   r,   r   set_bool_attrrw   set_execution_streamset_scheduling_priority)r1   r   r4   r   s       r!   _pir_overlap_send_recvr     s     ))BwwyO+  %8  !2D9((*Y/'',wi(@A**1-o-  %8  !2D9''6**1-   r    c                 6   U R                    GH  nSnSn[        [        UR                  5      5       H  u  pV[	        U5      (       d  M  Un  O   [        [        UR                  5      5       GH$  u  pVUR
                  S;   a  UR                  SS5        UR
                  S:X  d  M:  UR                  SS5        UR                  S5      nUR                  S	5      nUR                  S   n	UR                  U	5      n
UR                  XS-   S
SU
/0SU
/0SU0S9nUS-  nSnSn[        U5      [        [        R                  5      :X  a  XC-   n[        R                  nOXS-   S-   n[        R                  nUR                  USSU
/0SU
/0UUS.S9nU(       a  UR                  U5      nU(       a  UR                   R#                  U	5      n[%        5       nUR                   R&                  Ul        UR                   R(                  Ul        UR+                  U	U5        UR-                  U	U5        UR/                  UU5        UR/                  UU5        [        U5      [        [        R0                  5      :X  d  GM  UR                  SS5        US-  nGM'     UR3                  5         SnSn[        UR                  5       H+  u  pVUR
                  S:X  d  M  [5        U5      (       d  M)  Un  O   Uc  GM  [        [        UR                  5      5       Hn  u  pVUU:  a    OeUR
                  S:X  d  M  UR7                  S5      (       d  M7  UR8                  S   n	UR                  U	5      n
UR;                  XS-   SS9  US-  nMp     UR3                  5         GM     g)z
This implementation refers to lots of Paddle/python/paddle/base/optimizer.py.
The difference between this function with 'PipelineOptimizer' is that
'send_v2' op and 'recv_v2' op have been inserted in program by 'reshard'.
r   N)send_v2recv_v2r   Fr   r   op_roler   c_sync_calc_streamXOutindexr}   rs   ru   rw   r>   r   )r   r   pipeline_flag r   r@   )r   	enumerater9   r,   r   r}   	_set_attrrr   r-   rO   _insert_op_without_syncintr   BackwardOptimizeget_dist_op_for_program	dist_attrget_input_dist_attrr	   process_meshchunk_idset_input_dist_attrset_output_dist_attrset_op_dist_attr_for_programForwardrF   r   has_attrr0   rE   )r1   dist_contextr   offsetfirst_optimize_indexr  r4   r   r   r   rO   sync_calc_opinsert_indexnew_op_rolesync_comm_opdist_opout_dist_attrop_dist_attrbackward_recv_indexs                      r!   _insert_sync_for_fthenb_1f1br"    sg    #"4		?3IEb!!',$ 4 #4		?3IEww00_e4ww)#.6''),''),--a0ii)$<<.-#<"SEN$g.  =   !  $"w<3v#77#7#@L"(//K#(>A#5L"(//K$<<&-#<"SEN#.#*  = 	   *BB2FG(/(9(9(M(M$) (8'9#--:: %1 180A0A0J0J-$88$m %99$m %AA(, %AA(, w<3v~~#66 **?B?aKFM 4N 	""599-IEww)#r(:(:&+# . & #4		?3IE++ww..2;;3O3O..q1ii)  e <! 4 	K  r    c                 .    U H  n[        XU5        M     g r)   )r   )r   r   r,   r4   s       r!   _add_ops_into_blockr$  B  s    	b1 r    c                      U R                   S;   $ )N)fetchfetch_v2)r}   )r4   s    r!   _is_fetch_opr(  G  s    77+++r    c                 h   U R                  5       R                  n[        U5      n[        U5      S:X  a  g SnS nS nX2:  a  X   R                  S:w  a  X   R                  nUS-  nM,  US-   nXb:  a0  X   R                  S:X  a  US-  nXb:  a  X   R                  S:X  a  M  Xb:  a)  US:X  d   S5       e[	        X65       H  nXAU   l        M     OKX   R                  nUS:X  d  XE:X  d   SU S35       e[	        X65       H  nXQU   l        US-   nM     X2:  a  M  US:X  a  US:X  a  [        S5      eg g )Nr   r?   r>   zfirst_left_op_role can't be -1.z%The left and right operators of (idx[z]) have different op_role.z#all the ops don't have the op_role.)r+   r,   rB   r   rD   
ValueError)main_programall_opsops_leniopfirst_left_op_rolefirst_right_op_role	right_idxrL   s           r!   forward_complete_op_roler2  K  sw   '')--G'lG
7|q
C
-<2%!(!5!51HCaI%'*<*D*D*JQ	 %'*<*D*D*J#)R/ 5/ !0C+=CL( 1&-&8&@&@#&",)@ <C5@Z[	A
 !0C+>CL(#a-C 11 -6 R$72$=>?? %>r    c                   ^^ UU4S jnU" U S-
  5      nU" U S-   5      nXE:X  a  U$ U" U S-   5      nXV:X  a  U$ TU    R                  5       S;   a  TU    R                  S5      nUR                  5       nU H  n	U	R                  (       a5  U	R                  R                  S:w  a  U	R                  R                  S:g  s  $ U	R                  S5      (       d  Ma  U	R                  S:w  d  Ms  U	R                  s  $    g)Nc                    > U S:  d  U [        T5      :  a  gTU    nT(       a$  UR                  c  gUR                  R                  $ UR                  S5      (       a  UR                  $ g)Nr   r?   r  )rB   r  r  r  )op_idxr4   r,   	with_dists     r!   get_chunk_id$infer_chunk_id.<locals>.get_chunk_idt  s]    A:3s8+[||#||,,,{{:&&{{"r    r>   r   )zbuiltin.combinezbuiltin.splitr   r?   r  )r   resultall_used_opsr  r  r  )
r5  r,   r6  r7  prev_op_chunk_idnext_op_chunk_idnext_next_op_chunk_id
result_varr:  used_ops
    ``       r!   infer_chunk_idr@  s  s     $FQJ/#FQJ/+(!40
6{AA[''*
!..0#G  W%6%6%?%?2%E((11R77!!*--'2B2Bb2H'''	 $ r    c                 8   ^^ [        5       mUU4S jmT" U 5      $ )Nc                 F  > U R                  5       nU H  nUT;   a    gTR                  U5        UR                  (       a2  UR                  R                  S:w  a  UR                  R                  s  $ UR	                  5        H  nT" U5      nUS:w  d  M  Us  s  $    M     g)Nr?   )r:  r/   r  r  r   )rO   r:  r?  
output_varr  dfsvisiteds        r!   rD  &find_var_used_op_chunk_id.<locals>.dfs  s    '')#G'!KK   W%6%6%?%?2%E((111")//"3J":H2~' #4 $ r    )r*   )rO   rD  rE  s    @@r!   find_var_used_op_chunk_idrG    s    eG s8Or    c                    [        U 5        [        U 5        U R                  5       R                  nU R	                  5       nU R	                  5       nU R	                  5       nUR                  5       R                  nUR                  5       R                  nUR                  5       R                  nUR                  5       n	UR                  5       n
[        5       n[        U[        R                  R                  5      (       aE  [        R                  R                  [        R                  R                  5       R                  5      n[        R                  R                  R                  5       nUR!                  U5        Sn[#        [%        U5      S-
  SS5       GHE  nX.   R&                  S:w  a>  X.   R&                  S:X  a  SnO)X.   R&                  S:X  a  SnOX.   R&                  S:X  a  SnUS:X  a&  Xn   R)                  5         X~   R)                  5         M  US:X  Ga8  Xn   R)                  5         [#        X   R+                  5       5       H  nX   R-                  U5      nUR/                  5       SL d  M+  S	U S
X.   R1                  5        S
U 3n[        R2                  R5                  X~   5        [        R6                  R9                  X~   R-                  U5      U5        U	R;                  UUR=                  5       5      nUUl        UR@                  Ul         X   R-                  U5      RC                  U5        M     X   R)                  5         GM  [#        X   R+                  5       5       GHB  nX   R-                  U5      nX~   R-                  U5      nUR/                  5       SL d  UR/                  5       SL Ga  Xn   R1                  5       S:X  d  Xn   R1                  5       S:X  a  Xn   R-                  U5      R0                  nOX.   R-                  U5      nURE                  5       nS nU H  nUR1                  5       S:X  d  M  UnM     Ub  URG                  5       S   nOmS	U S
X.   R1                  5        S
U 3n[        R2                  R5                  Xn   5        [        R6                  R9                  Xn   R-                  U5      U5        UR/                  5       SL aZ  U	R;                  WUR=                  5       5      nUUl        UR@                  Ul         X   R-                  U5      RC                  U5        UR/                  5       SL d  GM  U
R;                  WUR=                  5       5      nUUl        UR@                  Ul         X~   R-                  U5      RC                  U5        GME     X   R)                  5         X~   R)                  5         GMH     X4U4$ )Noptr>   r?   bwdr   fwdr   Fvar_rX   z
pd_op.datazbuiltin.parameterzbuiltin.shadow_outputrv   )$r   r2  r+   r,   rC   _get_devicer#   r   r   	CUDAPlacedistributedParallelEnvdev_idr   	libpaddlePlace	set_placerD   rB   r   erasenum_resultsr9  	use_emptyr   pirset_insertion_point_after_C_opsset_persistable_value	add_kwargr}   
place_attrr   replace_all_uses_withr:  rw   )r+  enable_send_recv_overlapcomplete_opsfwd_programbwd_programopt_programfwd_opsbwd_opsopt_ops	opt_block	bwd_blockplace	cur_placeregionr5  rL   result_in_optr   new_result_var_in_optresult_in_bwdresult_valueused_opsshadow_output_op_usedr?  new_result_var_in_bwds                            r!   -_split_program_into_forward_backward_optimizers    sT    <(\*,,.22L$$&K$$&K$$&K&&(,,G&&(,,G&&(,,G((*I((*IME%))3344  ****,33
 %%++-IFL)A-r26''2-#++q0%--2%--2U?O!!#O!!#u_O!!#W_88:; !( 6 6s ; **,5!&<+?+D+D+F*GqNDJJ88IMM77..s3T -6,?,?m002-) 8A)4%11 *5 O**3/EE-) <0 O!!# W_88:; !( 6 6s ; ' 6 6s ; "++-6$..0E9  ,,.,>"?//15HH&55c:?? (4';'B'B3'G#/#<#<#>04-'/G&||~1HH8? 5 (0 1<#8#>#>#@#OD &*&<3G3L3L3N2OqQTPU#VD"JJ@@ ' #MM?? ' 6 6s ;T !**,5,5,?,?m002-) 8A)4%11 *5 O**3/EE- !**,5,5,?,?m002-) 8A)4%11 *5 O**3/EE-m <r O!!#O!!#I 7L [00r    c                   ^ ^^ T T   mU UU4S jnS nTR                  5       S:X  a  S/$ U" 5       (       a  S/S-  $ TR                  S5      (       a  S/$ TR                  5        H  nU" U5      (       a  M  S/s  $    S/$ )Nc                     > / SQn TR                  S5      (       d  gTS:  a  g[        S5       H%  nTTU-
     R                  5       U SU-
     :w  d  M%    g   g)N)pd_op.full_int_arraypd_op.reshaperv  rw  zpd_op.matmulrv  rw  z
pd_op.add_grad_merge_addF      T)r  rD   r   )ops_patternrZ   r,  cur_opr5  s     r!   is_reshape_matmul_pattern<_pir_get_backward_op_type.<locals>.is_reshape_matmul_pattern5  sb    	
 /00A:qAvz"'')[Q-??  r    c                 b    U R                  5        H  nUR                  S5      (       d  M    g   g)Nrx  TF)r:  r  )valuer4   s     r!   used_by_grad_merge_add9_pir_get_backward_op_type.<locals>.used_by_grad_merge_addJ  s-    $$&B{{+,, ' r    r   
backward_br   ry  rx  )rV  r  r   )r,  r5  r}  r  rp   r|  s   ``   @r!   _pir_get_backward_op_typer  0  s    V_F* q ~ ""~!!'((~ .."%f-- >! # >r    c                 v    Ub  U U 3nOUnU R                  5       nUR                  5       R                  nX4U4$ r)   )rC   r+   r,   )r1   r   r  program_namecloned_programr,   s         r!   _create_program_and_opsr  b  sE    "H:.]]_N

%
%
'
+
+C,,r    c                    [        5       n[        U R                  5       GHR  u  p4U" U5      nUR                  S/ 5      n/ nUS:X  aV  UR	                  5        HA  u  p[        5       X('   X(   R                  S5      n
[        XJU	5        UR                  U
5        MC     O}UR	                  5        Hi  u  p[        U	5      S:  d  M  X(   R                  UR                  S9n
U
R                  UR                  5        [        XJU	5        UR                  U
5        Mk     U HQ  nUR                  S5      S   nS nU H  n
U
R                  U5      (       d  M  U
n  O   U(       d  ME  [!        XMU5        MS     GMU     U$ )Nr&  r   )
parent_idxr  )r   r  r   popr   r   r   r$  r.   rB   _create_blockr  _set_forward_block_idxforward_block_idxrn   r   r   )r1   split_methodr   ibr   type_to_ops	fetch_ops
dst_blocksr}   r,   r   fetch_opr   fetch_blocks                 r!   _build_vpp_sub_programsr  n  sf   !mO"7>>2"9-OOGR0	
7(..0	(/	%+177:	#I#>!!),	 1 )..0	s8a< / 5 C C#,#7#7 !D !I 44!33 (	cB%%i0 1 "HnnS)!,GK'	0099"+K (
 {	A "/ 3D r    c                 (   U R                   R                  (       d  SU R                   l        UR                   R                  nU R                   R                  U;  a6  UR	                  U R                   R                  5        X!R                   l        gg)z
Add the extra event dependency of the two operators.
This function mainly aims for the cross-programs in pipeline parallelism,
especial for the 'send_v2' 'recv_v2' etc.
TN)r  force_record_eventevents_to_waitevent_to_recordr.   )recorder_op	waiter_opwaiter_wait_lists      r!   _add_event_dependencyr    sv       33370 !**99,,4DD 5 5 E EF-=* Er    c	           
         U R                  US   5      n	UR                  U	5      n
Uc2  U R                  US    S3U	R                  SS9nUR	                  Xz5        U R                  US    S3U	R                  S9nUR	                  X5        U R                  USSU0X{S	.UUUS
.S9n[        UU
R                  U
R                  UUS9  U$ )Nr   @reshape.outFr   r   r   z@reshape.xshape)r   r   reshape2r  )r  XShape)r   r   op_namescoper  )r  ref_mappingctxr  )	rO    get_tensor_dist_attr_for_programr   r    set_tensor_dist_attr_for_programr	  r   r  dims_mapping)r   r  xr   r   r  r  outr  var_xx_dist_attrx_shape
reshape_ops                r!   _insert_reshape_opr    s     IIadOE??FK
{aD6&++  

 	55cGqtfO$<EKKPG11'G..Qx/(
 / 
J ; --,, Jr    c                    U R                   nXA   nUR                  S5      nU(       a   SU S35       eUR                  S5      nU(       a   SU S35       eUR                  S5      nUR                  S5      n	UR                  S5      n
UR                  S	5      nUR                  S
5      nUR                  S5      nU R	                  US   5      nU R	                  U
S   5      nU R	                  US   5      nUR
                  nUR
                  nUR
                  n[        U5      [        U5      :X  d    S[        U5       S[        U5       S35       e[        U5      S:  a  USS USS :X  d   SU SU S35       eUS   US   -  /[        USS  5      QnUS   US   -  /USS  QnUR                  U5      R                  n[        U US-   UUUUUUS9n[        U US-   U
UUUUUS9nU R                  US    S3UR                  SS9nUR                  UUR                  U5      5        UR                  U5      nUR                  UR                   UR                  U5      5        UR                  UR                   UR                  U5      5        UR#                  UR                   UR                  U5      5        U R%                  US-   SUUS.SU0SSUUS.S9nUR'                  UU5        [        U US-   UR                   /UUUUUUS 9	  U R%                  US-   SXS.SU0SSUUS.S9nUR'                  UUR                  U5      5        U R)                  USS!9  g )"Ntrans_xmatmul_grad(id=J) with tran_x == True is not supported for splitting matmul_grad to matmultrans_yJ) with tran_y == True is not supported for splitting matmul_grad to matmulr  YzOut@GRADzX@GRADzY@GRADr   r   BThe rank of x must be equal to that of out_grad, but got x rank =  and out_grad rank = r   r   PThe first two dimensions of x must be equal to that of out_grad, but got x_dims: and out_grad_dims:r>   )r  r  r  r  Fr     	matmul_v2)r  r  r  T)r  r  r   r  r     )r  r  r  r  r@   )r,   rr   rn   rp   rO   r   rB   r9   get_op_dist_attr_for_programr  r  r   r   r  r  r  r   r  r	  r  rE   )r   matmul_grad_idr  r  r,   matmul_grad_optran_xtran_yr  yout_gradx_grady_gradr   r  var_out_grad
var_y_gradx_dimsout_grad_dimsy_grad_dims
new_x_dimsnew_out_grad_dimsr  new_xnew_out_grad
new_y_gradmatmul_grad_dist_attr	matmul_ops                               r!   split_matmul_grad_to_matmulr    s`    ))C(N  +F 
.))st:   +F 
.))st: 	S!AS!A##J/H""8,F""8,F!!),GIIadOE99Xa[)L6!9%J[[F &&M""Kv;#m,, 
LSQW[MYnor  tA  pB  oC  CD  	E, 6{Qa{mAa00 	
^_e^ffy  {H  zI  IJ  K	
0 )fQi';$vabz*:;Ja=++	qr	 88h  	!!	E &!!	L !!q	{,' " J 1155jA
 )EE --

LAA%H --55lC ..55jA
 --q .
#(	
 . I --i9NO	!!
 --q &(	
 . I --<<<^L 
^%0r    c                 b	   U R                   nX!   nUR                  S5      (       a   SU S35       eUR                  S5      (       a   SU S35       eUR                  S5      nUR                  S5      nUR                  S5      nUR                  S5      nUR                  S5      nUR                  n	UR
                  n
UR
                  nUR
                  n[        U
5      [        U5      :X  d    S	[        U
5       S
[        U5       S35       e[        U
5      S:  a  U
SS USS :X  d   SU
 SU S35       eU
S   U
S   -  /[        U
SS  5      QnUS   US   -  /USS  QnUR                  n[        R                  R                  U5        [        R                  R                  XM5      nUR                  5       nU	Ul        UR                  SU5        U	UR                  S5      R                  5       l        UR                  S5      R                  5       R                  SU5        [        R                  R                  U5        [        R                  R                  Xn5      nUR                  5       nU	Ul        UR                  SU5        U	UR                  S5      R                  5       l        UR                  S5      R                  5       R                  SU5        [        R                  R                  U5        [        R                  R!                  UUSS5      nUR                  5       nU	Ul        UR                  SU5        [        R                  R                  U5        [        R                  R                  UU5      nUR                  5       nU	Ul        UR                  SU5        U	UR                  S5      R                  5       l        UR                  S5      R                  5       R                  SU5        [        R                  R                  U5        [        R                  R!                  XeSS5      nU	UR                  5       l        UR                  5       R                  SU5        UR#                  U5        UR#                  U5        UR%                  5         g )Nr  r  r  r  r  r   r>   r   r  r  r   r  r  r  TF)r,   r  operand_sourcer9  r   r   rB   r9   r  r   rX  rY  rZ  reshapeget_defining_opset_int_attrmatmulr^  rU  )r   r  r,   r  r  r  r  r  r  r   r  r  r  r  r  r  r  x_reshape_opr  out_grad_reshape_opr  new_matmul_opnew_y_grad_reshapey_grad_reshape_op
new_x_grads                            r!    _pir_split_matmul_grad_to_matmulr  e  sv   
))C(N&&y11 
.))st1 &&y11 
.))st1 	%%a(A%%a(A,,Q/H""1%F""1%F$$GWWFNNM,,Kv;#m,, 
LSQW[MYnor  tA  pB  oC  CD  	E, 6{Qa{mAa00 	
^_e^ffy  {H  zI  IJ  K	
0 )fQi';$vabz*:;Ja=++	qr	 &&H
JJ((8MM!!!0E((*L"Lj(3?FL"224<"224AAH JJ((6==((EL&668")$$Z:FM&&q)99;C&&q)99;HHH JJ(()<=%%e\4GJ..0M#Mz84
JJ((7..z;G*::< '"":x8DK$$Q'779A$$Q'779FFH JJ((8%%h5$?J+2J ( --j(C
  ,
  !34r    c                   V    \ rS rSrS rS rS rS rS rS r	S r
S	 rS
 rS rS rSrg)PipelineMemoryEstimatori  c                 ^    0 U l         / U l        [        R                  " [        5      U l        g r)   )type_to_skip_gc_varsprogram_typeslogging	getLoggerr   r   re   s    r!   rg    PipelineMemoryEstimator.__init__  s$    $&!''1r    c           
      t   X l         0 nUR                  5        H!  u  pE[        U5      X4'   0 U R                  U'   M#     [	        5       n[        U5       Hb  nX7   nX-  n	US;   a  [        U	5      S:X  d   SU SU	 S35       e[        [        U	S/[        U	5      -  5      5      n	XR                  U'   Xh-  nMd     g)z
Get the skip_gc_vars for each type of program.

The order of program_types is the same as the order in the pipeline's micro batch.
For example, in 1F1B pipeline, the order of program_types is ['forward', 'backward'].
r   r   r   r   r   r?   N)	r  r   r   r  r*   rR   rB   r   r   )
rf   r   r  r   r}   r1   r   r   r   r   s
             r!   set_program_skip_gc_vars0PipelineMemoryEstimator.set_program_skip_gc_vars  s     + ",224MD*G*P!'.0D%%d+ 5 "% /H1;M(AL55<(A- XYaXb  cH  IU  HV  VW  X-  L2$\9J2J KLL2>%%h/"3" 0r    c                    X R                   ;  a  [        SU S35      eUR                   VVs/ s H1  oDR                    H  oUR                  R                  5       U/PM      M3     nnnUR                  S S9  U R                  Xc5      nU R                   U    H"  nX;  a  M
  UU   S   U R                   U   U'   M$     0 n	U R                   U   n
U R                  R                  U5      S:  a:  U R                  U R                  R                  U5      S-
     nU R                   U   n	U R                  XgX5      u  pX4$ s  snnf )Nz9Please set the skip_gc_vars before estimating memory for z	 program.c                     U S   $ r   r   )r  s    r!   <lambda>9PipelineMemoryEstimator.estimate_memory.<locals>.<lambda>  s    qtr    )r   sizer>   )r  r*  r   r,   r   idsort_get_program_var_infor  r  _estimate_max_memory)rf   r1   program_typer  r   r4   ordered_opsvar_infor   r2   r   prev_program_type	mem_usage
max_memorys                 r!   estimate_memory'PipelineMemoryEstimator.estimate_memory  sd   888KL>Ybc 
 -4NN
,:5YYrWWZZ\2YN 	 
 	^, --kH11,?H'@HAAD%%l3H= @ 00>##L1Q6 $ 2 2""((6:!  445FGL !% 9 9<!
	 $$9
s   8E c                 H   SnSn[        5       nU H  nUR                  U5        M     U GH  u  pU
R                  S;   a  M  / nU
R                  U
R                  -    GH  nX;  a  M  X(   S==   S-  ss'   X;  a  U R                  X5      (       d  UR                  U5        U R                  R                  SU SX(   S    SX(   S    S	U S
XRU   S   -    SU
R                   SU
R                   SU
R                   35        XRU   S   -  n[        Xe5      nU R                  X5      (       a,  U R                  X5      (       d  X;  a  UR                  U5        [        Xe5      nGM     [        U5       H  nU R                  R                  SU SX(   S    SX(   S    S	U S
XRU   S   -
   SU
R                   SU
R                   SU
R                   35        XRU   S   -  nX;   d  Mw  XH==   X(   S   -  ss'   M     GM     U H  nX;  d  M
  XTU   -  nM     XV4$ )Nr   create_py_readercreate_double_buffer_readerreadcountr>   zadd z, var size: r  z,count: z,mem_usage: z -> z
,op type: z, input_arg_names: z, output_arg_names: zremove )r*   r/   r}   r-   r0   _is_persistabler   r   max_is_last_usedr.   )rf   r  r  r   r2   r  r  has_used_varsr   rX   r4   last_use_varss               r!   r  ,PipelineMemoryEstimator._estimate_max_memory  s    	
 %Hh' % !EAww  
 M..1D1DD+"7+q0+09M9M: : "%%h/KK%%xjX5G5O4P Q""*"4W"=!> ?&&/[Y(ASTZA[5[4\ ]$$&GG9,?@R@R?SSghjh{h{g|~ (!3F!;;I!$Z!;J%%h99 00DD$8%,,X6 7
3 E8  .!!hZ|H4Fv4N3O P&09: ;""+Dh=OPV=W1W0X Y  "y(;B<N<N;OOcdfdwdwcxz h/77	+ *h.@.HH* /K !` %H+(33	 % $$r    c                     U R                   U   n[        UR                  5        VVs/ s H  u  p4UPM	     snn5      nUS:  a  [        S5      eU$ s  snnf )a&  
For a given type of program, calculate the increase memory usage.

The increase memory usage is the memory usage of the variables that are setting to skip_gc_vars.
Persistable variables are not included in the increase memory usage because they are allocated when
running the startup program.
r   zONo size info for skip_gc_vars, please run estimate_memory to get var size info.)r  sumr   r*  )rf   r  r   rX   memincrease_memorys         r!   _get_increase_memory,PipelineMemoryEstimator._get_increase_memoryG  s`     00>1C1C1EF1Evqs1EFGQa   Gs   A
c           	      Z   0 nU H  u  pEUR                   S;   a  M  [        5       nUR                  U5        UR                  UR                  -    HS  nUR                  U5      (       d  M  UR                  U5      nU(       d  M5  U R                  UUUXuR                  ;   S9  MU     M     U$ )Nr  )is_input)r}   r_   r   r-   r0   r   r  _update_var_info)	rf   r  r  r  rX   r4   r   r   r  s	            r!   r  -PipelineMemoryEstimator._get_program_var_infoW  s     EAww  
 !mGr"..1D1DD((22&>>rB7))  !)-?-?!?	 *  E !0 r    c                    U(       a  UR                  U5      OUR                  U5      nX;  aH  UR                  USSSS.5        UR                  (       a  SX1   S'   g U R	                  U5      nXcU   S'   g X1   S==   S-  ss'   g )	Nr   r>   F)r  r  r   Tr   r  r  )get_serial_inputget_serial_output
setdefaultr   _get_var_size)rf   r   r  r  r  rO   var_sizes          r!   r  (PipelineMemoryEstimator._update_var_infot  s      $$X.**84 	 #1qG 48"=1))#.H)1Xv&w'1,'r    c                     UR                    Vs/ s H  o"S:X  a  SOUPM     nnU R                  X1R                  5      $ s  snf )Nr?   r>   )r   _calculate_bytesr   )rf   rO   dim	var_shapes       r!   r  %PipelineMemoryEstimator._get_var_size  s>    8;		B	)Q,		B$$Y		:: Cs   ?c                    [         R                  S[         R                  S[         R                  S[         R                  S[         R
                  S[         R                  S[         R                  S[         R                  S[         R                  S0	nU(       a  [        S US5      OSnUR                  US5      nXE-  $ )Nry  r  r   r>   c                 
    X-  $ r)   r   )r  r  s     r!   r  :PipelineMemoryEstimator._calculate_bytes.<locals>.<lambda>  s    r    r   )r   float64int64float32int32float16bfloat16int16int8uint8r   get)rf   r  r   dtype_to_sizetotal_countdtype_factors         r!   r  (PipelineMemoryEstimator._calculate_bytes  s    NNALL!NNALL!NNAOOQLL!KKLL!

 9BF%y!4q 	 %((2))r    c                 "    X;  a  gX!   S   S:H  $ )NFr  r   r   rf   r   r  s      r!   r  %PipelineMemoryEstimator._is_last_used  s    #!'*a//r    c                     X;  a  gX!   S   $ )NFr   r   r2  s      r!   r  'PipelineMemoryEstimator._is_persistable  s    #!-00r    )r   r  r  N)r   r   r   r   rg   r  r  r  r  r  r  r  r  r  r  r   r   r    r!   r  r    s<    2
48"%H@%D :-(;*(01r    r  r)   )F)T)N/)r6  )Cr  collectionsr   enumr   	functoolsr   r   paddle.baser   paddle.base.frameworkr   r   6paddle.distributed.auto_parallel.static.dist_attributer	   -paddle.distributed.auto_parallel.static.utilsr
   r   r   r   paddle.frameworkr   rM  auto_parallel.static.utilsr   VarDescVarTypeREADERSTEP_SCOPESDENSE_TENSOR_ARRAYFEED_MINIBATCH
FETCH_LISTr   INFOr   r   r'   r7   r<   rP   r]   r_   r   r   r   r   r   r   r   r   r   r   r"  r$  r(  r2  r@  rG  rs  r  r  r  r  r  r  r  r  r   r    r!   <module>rH     s    #     4  0 	LLLL$$LL++LL''LL##  
GLL	!
/T /$ F/9d3
 3
l3
4
( F1h...L..l^2
,%@P"J, ,1@1F/d	-%P>2 	-b 7:K1\Wtk1 k1r    