
    x-j'                     n   d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	Z	 d dl
mZmZ d dlmZ d dlmZmZmZmZ d d	lmZ d
dlmZ e	j        j        j        e	j        j        j        e	j        j        j        e	j        j        j        e	j        j        j        gZ  ee j!                  Z" G d de          Z#d2dZ$d Z%d Z&d Z'd Z( G d d          Z)d Z*d Z+d Z,d Z-d Z.d Z/d Z0d3dZ1d3dZ2d Z3d2d Z4d! Z5d" Z6d# Z7d4d%Z8d& Z9	 d3d'Z:d( Z;d2d)Z<d* Z=d+ Z>	 	 d5d-Z?	 d6d.Z@d/ ZA G d0 d1          ZBdS )7    NOrderedDict)Enum)reduce)core)	ParameterProgram)OperatorDistAttr)
get_loggeris_backward_opis_optimize_op6naive_set_dist_op_attr_for_program_by_mesh_and_mapping)_current_expected_place_   )OpRolec                       e Zd ZdZdZdZdS )AutoParallelStreamTypedefaultauto_parallel_mpauto_parallel_shardingN)__name__
__module____qualname__CALC_STREAM	MP_STREAMSHARDING_STREAM     d/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/distributed/passes/pass_utils.pyr   r   3   s        K"I.OOOr   r   c                 r    |t                      }nt          |t                     sJ | D ]}||vrd||<   |S )NT)r   
isinstance)list_objordered_dictobjs      r   list_to_ordered_dictr%   9   sT    "}},44444 % %l"" $Lr   c                    t                      }g }|                                 j        D ]Y}|j        D ]0}||vr*|                    |           |                    |           1|j        D ]}|                    |           Z|S N)setglobal_blockopsinput_arg_namesappendaddoutput_arg_names)programvisited_vars
input_varsopin_var_nameout_var_names         r   get_inputs_of_programr5   F   s    55LJ""$$( + +- 	. 	.K,..!!+...  ---/ 	+ 	+L\****	+r   c                     t                      }|                                 j        D ]}t          |j        |           t          |                                          S r'   )r   r)   r*   r%   r.   listkeys)r/   output_varsr2   s      r   get_outputs_of_programr:   T   sY    --K""$$( ? ?R0+>>>>  ""###r   c                    t          |                                 j                  }|dk     r||z  }|dk    r||k     sJ |dk     r||z  }|dk    r||k    s
J |            ||k     sJ |                                 } t	          |dz
  |dz
  d          D ]+}|                                                     |d           ,t	          |dz
  dd          D ]+}|                                                     |d           ,|                                  t                      }|                                 j        D ]@}|j        D ]}|	                    |           |j
        D ]}|	                    |           Ag }	|                                 j        D ]}
|
|vr|	                    |
           |	D ]+}
|                                                     |
d           ,|                                  | S )Nr      Fsync)lenr)   r*   clonerange
_remove_op_sync_with_cppr(   r+   r-   r.   varsr,   _remove_var)r/   start_op_idx
end_op_idxop_numidx
valid_varsr2   r3   r4   vars_to_removevars              r   prune_programrN   [   sR   %%''+,,Fa1!6!6!66A~~f
??zV333Z333*$$$$mmooGVaZa44 ; ;))#E)::::\A%r2.. ; ;))#E)::::J""$$( ) )- 	( 	(KNN;''''/ 	) 	)LNN<((((	) N##%%* ' 'j  !!#&&& < <**3U*;;;;Nr   c                    |s
J d            t          |                                 j                  dk    s
J d            fd|D             }|d         dk    rdg|}|d         k    r|                               t	          t          |          dz
            D ]!}||         ||dz            k     s
J d            "g }t	          t          |          dz
            D ]7}t          | ||         ||dz                      }|                    |           8t          |          }d |D             }d	 |D             }d
 t	          |          D             }|d         |d<   t	          d|          D ]C}	||	         D ]8}
t          t	          |	                    D ]}|
||         v rd||         |
<    n9Dd |D             }|||fS )ay  
    Split the program by op_indices.

    For examples, a program has 100 ops, and op_indices = [25, 60].
    Then the program is split into 3 parts, containing 25, 35 and 40
    ops respectively.

    The return values are a tuple with 3 elements: the split program
    list, the input var names of each split program, and the output
    var names of each split program.
    zop_indices cannot be emptyr   zprogram cannot be emptyc                 *    g | ]}|d k    r|n|z   S )r   r   ).0rJ   rI   s     r   
<listcomp>z!split_program.<locals>.<listcomp>   s)    JJJ##sV|JJJr   r=   r<   z"op_indices must be strictly sortedc                 ,    g | ]}t          |          S r   )r5   rQ   ps     r   rR   z!split_program.<locals>.<listcomp>   s!    CCCq'**CCCr   c                 F    g | ]}t          t          |                    S r   )r%   r:   rT   s     r   rR   z!split_program.<locals>.<listcomp>   s7       <=3A6677  r   c                 *    g | ]}t                      S r   r   )rQ   _s     r   rR   z!split_program.<locals>.<listcomp>   s    AAA1AAAr   Tc                 P    g | ]#}t          |                                          $S r   )r7   r8   )rQ   items     r   rR   z!split_program.<locals>.<listcomp>   s(    IIItdiikk**IIIr   )r@   r)   r*   r,   rB   rN   reversed)r/   
op_indicesrJ   split_programs	new_split	num_splitr1   r9   valid_output_varsir3   jrI   s               @r   split_programrc   ~   sW    33333:%%''+,,FA:::0:::JJJJzJJJJ!}%*%
"~&!!!S__q()) 
 
#C!G!44440 5444 NS__q()) ) )!':c?JsQw<OPP	i((((N##ICCNCCCJ AO  K BAi0@0@AAA'Ob1i    %a= 	 	KeAhh''  +a.008<%a(5E 1	
 JI7HIII:'888r   c                   @    e Zd ZdZd Zed             Zd Zd Zd Z	dS )OpInOutInfozc
    Record unused buffer input_vars of op and other var_names except unused buffer input_vars
    c                 `    d| _         t                      | _        t                      | _        d S )NF)	_is_buildr(   _no_need_buffer_slots_other_arg_names_setselfs    r   __init__zOpInOutInfo.__init__   s'    %(UU"$'EE!!!r   c                     | j         S r'   )rg   rj   s    r   is_buildzOpInOutInfo.is_build   s
    ~r   c                     i }|j         D ]}|                    |          ||<   i }|j        D ]}|                    |          ||<   i }|j        D ]}|                    |          ||<   |||fS r'   )input_namesinputoutput_namesoutput
attr_namesattr)rk   r2   inputs
input_nameoutputsoutput_nameattrs	attr_names           r   _get_op_attrszOpInOutInfo._get_op_attrs   s    . 	6 	6J!#*!5!5F:? 	: 	:K#%99[#9#9GK   	2 	2I!wwy11E)w%%r   c                    |                      |          \  }}}t          j        |j        |||          | _        t          | j                  dk    rd S |j        D ]=}|| j        vr2|                    |          D ]}| j        	                    |           >|j
        D ]=}|| j        vr2|                    |          D ]}| j        	                    |           >d| _        d S )Nr   T)r|   r   infer_no_need_buffer_slotstyperh   r@   rp   rq   ri   r-   rr   rs   rg   )rk   r2   rv   rx   rz   	slot_namein_nameout_names           r   
build_infozOpInOutInfo.build_info   s   !%!3!3B!7!7%)%DGVWe&
 &
" t)**a//F 	; 	;I :::!xx	22 ; ;G-11':::: 	< 	<I ::: "		) 4 4 < <H-11(;;;;r   c                 D    t          | j                  dk    p|| j        v S Nr   )r@   rh   ri   )rk   arg_names     r   	is_neededzOpInOutInfo.is_needed   s*    *++q0 5444	
r   N)
r   r   r   __doc__rl   propertyrn   r|   r   r   r   r   r   re   re      su         * * *
   X& & &  (
 
 
 
 
r   re   c                 D    |                     |           }|d uo|j         S r'   )_find_var_recursivepersistable)var_nameblockrM   s      r   var_can_be_deletedr      s)    

#
#H
-
-Cd?23?22r   c                 <   t                      }| j        D ]}|j        D ]{}|j        dv rt	                      }|                    |           |j        |j        z   D ]<}t          ||          r*|	                    |          r|
                    |           =||S )z^
    Get all vars in the program that are non-persistable and not in op's no_need_buffer.
    )c_sync_comm_streamconditional_blockdatanopwhile)r(   blocksr*   r   re   r   r+   r.   r   r   r-   )r/   required_varsr   r2   op_infor   s         r   _get_required_vars_of_programr      s     EEM 0 0) 	0 	0Bw    !mmGr""".1DD 0 0%h66 07;L;L< < 0 "%%h///	0	0" r   c                     t           j        j                            d          d         rt	          | |||          S t          | |||          S )a#  
    Set `skip_gc_vars` for every job in jobs.

    A whole_program is split up into sub_programs according to the schedule mode,
    thus a sub_program's vars might be used as the op's input of the later sub_program,
    and these vars cannot be gc after executing current sub_program.
    FLAGS_enable_pir_api)paddlebase	framework	get_flags_set_skip_gc_vars_in_pir_set_skip_gc_vars_in_old_ir)num_micro_batches	job_typessub_programsjobss       r   set_skip_gc_varsr     sb     {&&'=>> 	
 (y,
 
 	
 +y,
 
 	
r   c           	         | dk    s
J d            t          t          ||                    }i }|                                D ]\  }}t          |          ||<   d t	          |           D             }t          |          }	t          t	          |	                    D ]}
||
         }|                                }||         }|                                }|||         z  }t          
                    d| d| d|            |dv r$t          |          dk    sJ d	| d
| d            |                    |           ||xx         |z  cc<   |S )Nr<   "num_micro_batches needs to be >= 1c                 *    g | ]}t                      S r   r(   rQ   ra   s     r   rR   z/_set_skip_gc_vars_in_old_ir.<locals>.<listcomp>%      FFFceeFFFr   Skip gc vars for -(): backward
backward_wr   BWhen enabling pipeline parallelism strategy, the skip_gc_vars for % subprogram must be empty, but it is .)dictzipitemsr   rB   r@   r[   r   micro_batch_idloggerdebugr   )r   r   r   r   type_to_programtype_to_required_varsr   r/   suffixed_required_varsnum_jobsjob_idjobjob_typer   r   skip_gc_varss                   r   r   r     s    !!!#G!!!3y,7788O (..00 M Mg&CG&L&Ld## GFU3D-E-EFFF4yyH5??++ @ @6l88::-h7++--$'=n'MMMMMNMM|MM	
 	
 	
 111|$$))) TU]  T  T  EQ  T  T  T *)) 	\***~...-?....r   c           	      :   | dk    s
J d            t          t          ||                    }i }t          j        |          }|                                D ]G\  }}t                      }	t                      }
|                                                                D ]}|	                    |           |                                j	        D ]}|
                                D ]D}|j        r;|	                    |j                   |j        r|
                    |j                   E|                                D ]D}|j        r;|	                    |j                   |j        r|
                    |j                   E||v r|	||         z  }	|	|
z  }	|	||<   Id t          |           D             }t!          |          }t#          t          |                    D ]}||         }|                                }||         }	|                                }|	||         z  }t(                              d| d| d|            |dv r$t!          |          dk    sJ d	| d
| d            |                    |           ||xx         |	z  cc<   |S )Nr<   r   c                 *    g | ]}t                      S r   r   r   s     r   rR   z,_set_skip_gc_vars_in_pir.<locals>.<listcomp>Y  r   r   r   r   r   )send_backwardr   r   r   r   r   )r   r   r   get_no_need_buffer_valuesr   r(   r)   kwargsr-   r*   operands_sourcehas_namenamer   resultsrB   r@   r[   r   r   r   r   r   )r   r   r   r   r   r   no_need_buffer_varsr   r/   r   persistable_varskeyr2   rM   r   r   r   r   r   r   s                       r   r   r   <  s   !!!#G!!!3y,7788O 8II,2244 8 8'55''))0022 	# 	#Cc""""&&((, 
	7 
	7B))++ 7 7< 7!%%ch/// 7(,,SX666zz|| 7 7< 7!%%ch/// 7(,,SX666	7
 ***0::M))*7h'' GFU3D-E-EFFF4yyH5??++ @ @6l88::-h7++--$'=n'MMMMMNMM|MM	
 	
 	
 666|$$))) TU]  T  T  EQ  T  T  T *)) 	\***~...-?....r   c                     i }|j         |d<   |j        |d<   |j        |d<   |j        |d<   |j        |d<   t          d| |j        |j        |j        |j	        |j
        |j        |j        |j        |j        d
| d S )N	trainableoptimize_attrregularizerdo_model_average	need_clip)
r   r   r   shapedtype	lod_level
error_clipstop_gradientis_databelong_to_optimizerr   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )	dst_blocksrc_varcopied_kwargss      r   _create_paramr   p  s    M!(!2M+%,%:M/"#*#6M- (/(@M$%!(!2M+ \\mm#%+#7      r   c                     |                      |j        |j        |j        |j        |j        |j        |j        |j        |j	        |j
        
  
         d S )N)
r   r   r   r   r   r   r   r   r   r   )
create_varr   r   r   r   r   r   r   r   r   r   )r   r   s     r   _create_interr     s`    \\mm#'%+#7      r   Fc           	         |s|                      |          }n|                     |          }|j        t          v rMt	          |dd          }|                    |j        |j        ||j        |j        |j	        |j
                   d S t          |t                    rt          ||           d S t          ||           d S )Nr   F)r   r   r   r   r   r   r   )rM   _var_recursiver   __not_shape_var_type__getattrr   r   r   r   r   r   r!   r   r   r   )	src_blockr   src_varnameforce_creater   persists         r   _create_varr     s     8--,,**;77|---'=%88)!/O ' ; 	 	
 	
 	
 	
 	
 gy)) 	.)W-----)W-----r   c                    |j                                         }|                    |j                    |j        D ]@}|                     |          s|r'|                     |          rt          | |||           A|j        D ]@}|                     |          s|r'|                     |          rt          | |||           Ad S r'   )desc	append_op	copy_fromr+   has_varr   r   r.   )r   r   src_opr   dst_op_descinput_varnameoutput_varnames          r   _create_programr     s    .**,,K&+&&&/ K K]++ 	K	K&::=II	K 	9m\JJJ 1 L L^,, 	L	L&::>JJ	L 	9nlKKK	L Lr   c                 &   | j         D ]}|j        D ]}|                                dk    rt|                    dd           |                    dd           |                                d         }|                    d|            |                    d           |                                d	k    rV|                    dd           |                    dd           |                    d
           |                    d           	dS )a  
    This function is used to replace the function '_insert_sync_for_fthenb_1f1b'.
    The finally target of this function is as follows:
        1. no need to insert the 'c_sync_calc' and 'c_sync_calc' operators
        2. 'send_v2' operator uses 'dist_attr.execution_stream' to set stream of its own.
        3. 'recv_v2' operator uses 'dist_attr.execution_stream' to set stream of its own.
    zpd_op.send_v2dynamic_shapeFuse_calc_streamTring_idsend_stream_r   zpd_op.recv_v2recv_streamN)r   r*   r   set_bool_attrrz   set_execution_streamset_scheduling_priority)r/   r   r2   r   s       r   _pir_overlap_send_recvr     s$     . .) 	. 	.BwwyyO++  %888  !2D999((**Y/''(@w(@(@AAA**1----o--  %888  !2D999''666**1---	.. .r   c                 8   | j         D ]}d}d}t          t          |j                            D ]\  }}t	          |          r|} nt          t          |j                            D ]4\  }}|j        dv r|                    dd           |j        dk    r|                    dd           |                    d          }|                    d	          }|j        d         }	|	                    |	          }
|
                    ||z   d
d|
gid|
gid|i          }|dz  }d}d}t          |          t          t          j                  k    r||z   }t          j        }n||z   dz   }t          j        }|
                    |dd|
gid|
gi||d          }|r|                    |          }|r|j                            |	          }t%                      }|j        j        |_        |j        j        |_        |                    |	|           |                    |	|           |                    ||           |                    ||           t          |          t          t          j                  k    r|                    dd           |dz  }6|                                 d}d}t          |j                  D ]#\  }}|j        dk    rt5          |          r|} n$|t          t          |j                            D ]n\  }}||k    r nb|j        dk    rV|                    d          rA|j        d         }	|	                    |	          }
|                    ||z   d           |dz  }o|                                 dS )z
    This implementation refers to lots of Paddle/python/paddle/base/optimizer.py.
    The difference between this function with 'PipelineOptimizer' is that
    'send_v2' op and 'recv_v2' op have been inserted in program by 'reshard'.
    r   N)send_v2recv_v2r   Fr  r   op_roler   c_sync_calc_streamXOutindexr   rv   rx   rz   r<   r   )r  r   pipeline_flag r  r>   )r   	enumerater7   r*   r   r   	_set_attrru   r+   rM   _insert_op_without_syncintr   BackwardOptimizeget_dist_op_for_program	dist_attrget_input_dist_attrr
   process_meshchunk_idset_input_dist_attrset_output_dist_attrset_op_dist_attr_for_programForwardrD   r   has_attrr.   rC   )r/   dist_contextr   offsetfirst_optimize_indexr	  r2   r  r   r   rM   sync_calc_opinsert_indexnew_op_rolesync_comm_opdist_opout_dist_attrop_dist_attrbackward_recv_indexs                      r   _insert_sync_for_fthenb_1f1br'    s4     e e#"4	??33 	 	IE2b!! ',$
 #4	??33 F	  F	 IE2w000_e444w)##.666''),,''),,-a0ii))$<<&.-#<"SEN$g.  =     !  $"w<<3v#7#777#7&#@L"(/KK#(6>A#5L"(/K$<<&-#<"SEN#.#*   = 	  	    *BB2FFG (/(9(M(M$) ) (8'9'9#-: %1 180A0J-$88$m   %99$m   %AA(,   %AA(,   w<<3v~#6#666 **?B???aKF""59-- 	 	IE2w)##r(:(:#&+#& #4	??33 	 	IE2+++w...2;;3O3O..q1ii))  e <<<!Ke er   c                 2    |D ]}t          | ||           d S r'   )r   )r   r   r*   r2   s       r   _add_ops_into_blockr)  B  s0     2 2	9b11112 2r   c                     | j         dv S )N)fetchfetch_v2)r   )r2   s    r   _is_fetch_opr-  G  s    7+++r   c                    |                                  j        }t          |          }t          |          dk    rd S d}d }d }||k     r||         j        dk    r||         j        }|dz  }*|dz   }||k     r-||         j        dk    r|dz  }||k     r||         j        dk    ||k    r1|dk    s
J d            t	          ||          D ]}|||         _        nR||         j        }|dk    s||k    sJ d| d            t	          ||          D ]}|||         _        |dz   }||k     |dk    r|dk    rt          d          d S d S )Nr   r=   r<   zfirst_left_op_role can't be -1.z%The left and right operators of (idx[z]) have different op_role.z#all the ops don't have the op_role.)r)   r*   r@   r  rB   
ValueError)main_programall_opsops_leniopfirst_left_op_rolefirst_right_op_role	right_idxrJ   s           r   forward_complete_op_roler7  K  s   ''))-G'llG
7||q
C
--3<2%%!(!51HCaIg%%')*<*D*J*JQ	 g%%')*<*D*J*JG##)R///5 0// !i00 > >C+=GCL((&-i&8&@#&",,)-@@@@[C[[[ A@A
 !i00 ( (C+>GCL(#a-CC5 --6 R$72$=$=>???  $=$=r   Tc                    fd} || dz
            } || dz             }||k    r|S  || dz             }||k    r|S |                                           dv r|                              d          }|                                }|D ]T}	|	j        r"|	j        j        dk    r|	j        j        dk    c S |	                    d          r|	j        dk    r	|	j        c S UdS )Nc                     | dk     s| t                    k    rdS |          }r|j        dS |j        j        S |                    d          r|j        S dS )Nr   r=   r  )r@   r  r  r  )op_idxr2   r*   	with_dists     r   get_chunk_idz$infer_chunk_id.<locals>.get_chunk_idt  sk    A::3s88++2[ 		|#r|,,{{:&& {"rr   r<   r   )zbuiltin.combinezbuiltin.splitr   r=   r  )r   resultall_used_opsr  r  r  )
r:  r*   r;  r<  prev_op_chunk_idnext_op_chunk_idnext_next_op_chunk_id
result_varr>  used_ops
    ``       r   infer_chunk_idrD  s  sA         $|FQJ//#|FQJ//+++(L!44000
6{AAA[''**
!..00# 	( 	(G  (W%6%?2%E%E(1R7777!!*-- ('2Bb2H2H''''2r   c                 D    t                      fd |           S )Nc                    |                                  }|D ]q}|v r dS                     |           |j        r|j        j        dk    r|j        j        c S |                                D ]} |          }|dk    r|c c S rdS )Nr=   )r>  r-   r  r  r   )rM   r>  rC  
output_varr  dfsvisiteds        r   rH  z&find_var_used_op_chunk_id.<locals>.dfs  s    ''))# 
	( 
	(G'!!rrKK     (W%6%?2%E%E(1111")//"3"3 ( (J"s:H2~~' &( rr   r   )rM   rH  rI  s    @@r   find_var_used_op_chunk_idrJ    s:    eeG      3s88Or   c                    t          |            t          |            |                                 j        }|                                 }|                                 }|                                 }|                                j        }|                                j        }|                                j        }|                                }	|                                }
t                      }t          |t          j        j	                  r@t          j        	                    t          j
                                        j                  }t          j        j                                        }|                    |           d}t#          t%          |          dz
  dd          D ]}||         j        dk    r;||         j        dk    rd}n'||         j        dk    rd}n||         j        dk    rd}|dk    r5||                                          ||                                          |dk    r}||                                          t#          ||                                                   D ]}||                             |          }|                                du rd	| d
||                                          d
| }t          j                            ||                    t          j                            ||                             |          |           |	                    ||                                          }||_        |j         |_         ||                             |          !                    |            ||                                          t#          ||                                                   D ]}||                             |          }||                             |          }|                                du s|                                du rK||                                         dk    s||                                         dk    r!||                             |          j        }n||                             |          }|"                                }d }|D ]}|                                dk    r|}||#                                d         }nd	| d
||                                          d
| }t          j                            ||                    t          j                            ||                             |          |           |                                du ri|	                    ||                                          }||_        |j         |_         ||                             |          !                    |           |                                du ri|
                    ||                                          }||_        |j         |_         ||                             |          !                    |           ||                                          ||                                          |||fS )Noptr<   r=   bwdr   fwdr   Fvar_rX   z
pd_op.datazbuiltin.parameterzbuiltin.shadow_outputry   )$r   r7  r)   r*   rA   _get_devicer!   r   r   	CUDAPlacedistributedParallelEnvdev_idr   	libpaddlePlace	set_placerB   r@   r  erasenum_resultsr=  	use_emptyr   pirset_insertion_point_after_C_opsset_persistable_value	add_kwargr   
place_attrr   replace_all_uses_withr>  rz   )r0  enable_send_recv_overlapcomplete_opsfwd_programbwd_programopt_programfwd_opsbwd_opsopt_ops	opt_block	bwd_blockplace	cur_placeregionr:  rJ   result_in_optr   new_result_var_in_optresult_in_bwdresult_valueused_opsshadow_output_op_usedrC  new_result_var_in_bwds                            r   -_split_program_into_forward_backward_optimizerv    s    <(((\***,,..2L$$&&K$$&&K$$&&K&&((,G&&((,G&&((,G((**I((**IMME%)344 
 ****,,3
 
 %++--IFL))A-r266 d$ d$'2--F#+q00f%-22f%-22U??FO!!###FO!!####u__FO!!###WV_88::;;   !( 6 6s ; ; **,,55N&NN<+?+D+D+F+FNNNNDJ88IIIM77..s33T   -6,?,?m0022- -) 8A)4%1 *5 FO**3//EE-   FO!!#### WV_88::;; 8 8 !( 6 6s ; ; ' 6 6s ; ; "++--66$..00E99  ,,..,>>"6?//115HHH&v55c::? (4F';'B'B3'G'G#/#<#<#>#>04-'/ @ @G&||~~1HHH8? 50<#8#>#>#@#@#ODD $W&#V#V<3G3L3L3N3N#V#VQT#V#VD"J@@ '   #M?? ' 6 6s ; ;T   !**,,55,5,?,?m0022- -) 8A)4%1 *5 FO**3//EE-   !**,,55,5,?,?m0022- -) 8A)4%1 *5 FO**3//EE-   FO!!###FO!!####[00r   c                                fd}d }                                 dk    rdgS  |            rdgdz  S                     d          rdgS                                 D ]} ||          sdgc S dgS )Nc                      g d}                      d          sdS dk     rdS t          d          D ]/}|z
                                           | d|z
           k    r dS 0dS )N)pd_op.full_int_arraypd_op.reshapery  rz  zpd_op.matmulry  rz  z
pd_op.add_grad_merge_addF      T)r  rB   r   )ops_patternra   r1  cur_opr:  s     r   is_reshape_matmul_patternz<_pir_get_backward_op_type.<locals>.is_reshape_matmul_pattern5  s    	
 	
 	
 /00 	5A::5q 	 	Avz"''))[Q-???uu @tr   c                 d    |                                  D ]}|                    d          r dS dS )Nr{  TF)r>  r  )valuer2   s     r   used_by_grad_merge_addz9_pir_get_backward_op_type.<locals>.used_by_grad_merge_addJ  sC    $$&& 	 	B{{+,, ttur   r   
backward_br   r|  r{  )rY  r  r   )r1  r:  r  r  rs   r  s   ``   @r   _pir_get_backward_op_typer  0  s    V_F      *   q  ~  "" "~!!'(( ~ .."" " "%%f-- 	" >!!!	" >r   c                 |    || | }n|}|                                  }|                                j        }|||fS r'   )rA   r)   r*   )r/   r   r  program_namecloned_programr*   s         r   _create_program_and_opsr  b  sM    ".H..]]__N

%
%
'
'
+C,,r   c                 H   t                      }t          | j                  D ]}\  }} ||          }|                    dg           }g }|dk    rm|                                D ]W\  }}	t                      ||<   ||                             d          }
t          ||
|	           |                    |
           Xn|                                D ]y\  }}	t          |	          dk    ra||         
                    |j                  }
|
                    |j                   t          ||
|	           |                    |
           z|D ]P}|                    d          d         }d }|D ]}
|
                    |          r|
} n|rt!          |||           Q|S )Nr+  r   )
parent_idxr  )r   r  r   popr   r	   r   r)  r,   r@   _create_blockr  _set_forward_block_idxforward_block_idxrq   r   r   )r/   split_methodr   ibr   type_to_ops	fetch_ops
dst_blocksr   r*   r   fetch_opr   fetch_blocks                 r   _build_vpp_sub_programsr  n  s   !mmO"7>22  B  BI"l9--OOGR00	
77(..00 - -	c(/		%+D177::	#Iy#>>>!!),,,,	- )..00 	1 	1	cs88a<< / 5 C C#,#7 !D ! !I 44!3   (	9cBBB%%i000! 		B 		BHnnS))!,GK'  	0099 "+KE  B	;AAA		B r   c                     | j         j        sd| j         _        |j         j        }| j         j        |vr-|                    | j         j                   ||j         _        dS dS )z
    Add the extra event dependency of the two operators.
    This function mainly aims for the cross-programs in pipeline parallelism,
    especial for the 'send_v2' 'recv_v2' etc.
    TN)r  force_record_eventevents_to_waitevent_to_recordr,   )recorder_op	waiter_opwaiter_wait_lists      r   _add_event_dependencyr    sq      3 8370 !*9,4DDD 5 EFFF-=	*** EDr   /c	           
         |                      |d                   }	|                    |	          }
|<|                     |d          d|	j        d          }|                    ||
           |                     |d          d|	j                  }|                    ||
           |                     |dd|i||d	|||d
          }t          ||
j        |
j        ||           |S )Nr   @reshape.outFr   r   r   z@reshape.xshape)r   r   reshape2r  )r  XShape)r   r  op_namescoper  )r  ref_mappingctxr  )	rM    get_tensor_dist_attr_for_programr   r    set_tensor_dist_attr_for_programr  r   r  dims_mapping)r   r	  xr   r  r  r  outr  var_xx_dist_attrx_shape
reshape_ops                r   _insert_reshape_opr    s0    IIadOOE??FFK
{aD&&&+  
 

 	55c;GGGqt$<$<$<EKPPG11';GGG..Qxw//(
 
 / 
 
J ; -,    Jr   c                    | j         }||         }|                    d          }|rJ d| d            |                    d          }|rJ d| d            |                    d          }|                    d          }	|                    d          }
|                    d	          }|                    d
          }|                    d          }|                     |d                   }|                     |
d                   }|                     |d                   }|j        }|j        }|j        }t          |          t          |          k    s+J dt          |           dt          |           d            t          |          dk    r'|dd         |dd         k    sJ d| d| d            |d         |d         z  gt          |dd                    }|d         |d         z  g|dd          }|                    |          j	        }t          | |dz   ||||||          }t          | |dz   |
|||||          }|                     |d          d|j        d          }|                    ||                    |                     |                    |          }|                    |j        |                    |                     |                    |j        |                    |                     |                    |j        |                    |                     |                     |dz   d||dd|idd||d          }|                    ||           t          | |dz   |j        g|||||| 	  	         |                     |dz   d|
|	dd|idd||d          }|                    ||                    |                     |                     |d!           d S )"Ntrans_xmatmul_grad(id=J) with tran_x == True is not supported for splitting matmul_grad to matmultrans_yJ) with tran_y == True is not supported for splitting matmul_grad to matmulr  YzOut@GRADzX@GRADzY@GRADr  r   BThe rank of x must be equal to that of out_grad, but got x rank =  and out_grad rank = r   r   PThe first two dimensions of x must be equal to that of out_grad, but got x_dims: and out_grad_dims:r<   )r  r  r  r  Fr     	matmul_v2)r  r  r  T)r  r  r  r  r     )r  r  r  r  r>   )r*   ru   rq   rs   rM   r   r@   r7   get_op_dist_attr_for_programr  r  r   r   r  r  r  r   r  r  r  rC   )r   matmul_grad_idr  r  r*   matmul_grad_optran_xtran_yr  yout_gradx_grady_gradr  r  var_out_grad
var_y_gradx_dimsout_grad_dimsy_grad_dims
new_x_dimsnew_out_grad_dimsr  new_xnew_out_grad
new_y_gradmatmul_grad_dist_attr	matmul_ops                               r   split_matmul_grad_to_matmulr    sU    )C(N  ++F  t.ttt :   ++F  t.ttt : 	S!!AS!!A##J//H""8,,F""8,,F!!),,GIIadOOE99Xa[))L6!9%%J[F &M"Kv;;#m,,,,, 	ESQW[[  	E  	Eor  tA  pB  pB  	E  	E  	E -,, 6{{Qac{mAaC0000 K_e  K  K  {H  K  K  K 100 )fQi';$vabbz*:*:;Ja=++	qrr	 88   	!!	 	 	E &!!	 	 	L !!q	''' "  J 1155jAA  
 )EE  --
LAA%HH   --55lCC   ..55jAA  
 --q ..
#(	
 
 .  I --i9NOOO	!!
 
 
 
 --q A&&(	
 
 .  I --<<<^LL   
^%00000r   c                 
   | j         }||         }|                    d          rJ d| d            |                    d          rJ d| d            |                    d          }|                    d          }|                    d          }|                    d          }|                    d          }|j        }	|j        }
|j        }|j        }t          |
          t          |          k    s+J d	t          |
           d
t          |           d            t          |
          dk    r'|
dd         |dd         k    sJ d|
 d| d            |
d         |
d         z  gt          |
dd                    }|d         |d         z  g|dd          }|j        }t          j
                            |           t          j                            ||          }|                                }|	|_        |                    d|           |	|                    d                                          _        |                    d                                                              d|           t          j
                            |           t          j                            ||          }|                                }|	|_        |                    d|           |	|                    d                                          _        |                    d                                                              d|           t          j
                            |           t          j                            ||dd          }|                                }|	|_        |                    d|           t          j
                            |           t          j                            ||          }|                                }|	|_        |                    d|           |	|                    d                                          _        |                    d                                                              d|           t          j
                            |           t          j                            ||dd          }|	|                                _        |                                                    d|           |                    |           |                    |           |                                 d S )Nr  r  r  r  r  r   r<   r   r  r  r   r  r  r  TF)r*   r  operand_sourcer=  r  r   r@   r7   r  r   r[  r\  r]  reshapeget_defining_opset_int_attrmatmulra  rX  )r   r  r*   r  r  r  r  r  r  r  r  r  r  r  r  r  r  x_reshape_opr  out_grad_reshape_opr  new_matmul_opnew_y_grad_reshapey_grad_reshape_op
new_x_grads                            r    _pir_split_matmul_grad_to_matmulr  e  sk   
)C(N&&y11  t.ttt 1 &&y11  t.ttt 1 	%%a((A%%a((A,,Q//H""1%%F""1%%F$GWFNM,Kv;;#m,,,,, 	ESQW[[  	E  	Eor  tA  pB  pB  	E  	E  	E -,, 6{{Qac{mAaC0000 K_e  K  K  {H  K  K  K 100 )fQi';$vabbz*:*:;Ja=++	qrr	 &H
J((888M!!!Z00E((**L"Lj(333?FL""2244<""2244AAH   J((666=((3DEEL&6688")$$Z:::FM&&q))99;;C&&q))99;;HHH   J(()<===%%e\4GGJ..00M#Mz8444
J((777..z;GG*::<< '"":x888DK$$Q''7799A$$Q''7799FFH   J((888%%h5$??J+2J  (  --j(CCC
  ,,,
  !3444r   c                   P    e Zd Zd Zd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd ZdS )PipelineMemoryEstimatorc                 ^    i | _         g | _        t          j        t                    | _        d S r'   )type_to_skip_gc_varsprogram_typeslogging	getLoggerr   r   rj   s    r   rl   z PipelineMemoryEstimator.__init__  s(    $&!'11r   c           
         || _         i }|                                D ]!\  }}t          |          ||<   i | j        |<   "t	                      }t          |          D ]t}||         }||z  }	|dv r$t          |	          dk    sJ d| d|	 d            t          t          |	dgt          |	          z                      }	|	| j        |<   ||z  }udS )z
        Get the skip_gc_vars for each type of program.

        The order of program_types is the same as the order in the pipeline's micro batch.
        For example, in 1F1B pipeline, the order of program_types is ['forward', 'backward'].
        r   r   r   r   r   r=   N)	r  r   r   r  r(   r[   r@   r   r   )
rk   r   r  r   r   r/   r   r   r   r   s
             r   set_program_skip_gc_varsz0PipelineMemoryEstimator.set_program_skip_gc_vars  s2    + ",2244 	1 	1MD'*G*P*P!$'.0D%d++!$ // 	4 	4H1(;M(+AAL555<((A--- XYa  X  X  IU  X  X  X .--  L2$\9J9J2J K KLLL2>D%h/"m3""	4 	4r   c                    || j         vrt          d| d          d |j        D             }|                    d            |                     ||          }| j         |         D ]#}||vr||         d         | j         |         |<   $i }| j         |         }| j                            |          dk    r5| j        | j                            |          dz
           }	| j         |	         }|                     ||||          \  }
}|
|fS )Nz9Please set the skip_gc_vars before estimating memory for z	 program.c                 X    g | ]'}|j         D ]}|j                                        |g(S r   )r*   r   id)rQ   r   r2   s      r   rR   z;PipelineMemoryEstimator.estimate_memory.<locals>.<listcomp>  sJ     
 
 
#(UY
 
?ARWZZ\\2
 
 
 
r   c                     | d         S r   r   )r  s    r   <lambda>z9PipelineMemoryEstimator.estimate_memory.<locals>.<lambda>  s
    qt r   )r   sizer<   )r  r/  r   sort_get_program_var_infor  r	  _estimate_max_memory)rk   r/   program_typer  ordered_opsvar_infor   r0   r   prev_program_type	mem_usage
max_memorys               r   estimate_memoryz'PipelineMemoryEstimator.estimate_memory  sg   t888cLccc  
 
,3N
 
 
 	^^,,, --k<HH1,? 	 	Hx''@HAAD%l3H==
 0>##L11Q66 $ 2"((66:!  45FGL !% 9 9<!
 !
	: *$$r   c                    d}d}t                      }|D ]}|                    |           |D ]\  }	}
|
j        dv rg }|
j        |
j        z   D ]+}||vr||         dxx         dz  cc<   ||vr|                     ||          s|                    |           | j                            d| d||         d          d||         d          d	| d
|||         d         z    d|
j         d|
j         d|
j                    |||         d         z  }t          ||          }| 	                    ||          r/|                     ||          s||vr|
                    |           t          ||          }-t          |          D ]}| j                            d| d||         d          d||         d          d	| d
|||         d         z
   d|
j         d|
j         d|
j                    |||         d         z  }||v r||xx         ||         d         z  cc<   |D ]}||vr|||         z  }||fS )Nr   create_py_readercreate_double_buffer_readerreadcountr<   zadd z, var size: r  z,count: z,mem_usage: z -> z
,op type: z, input_arg_names: z, output_arg_names: zremove )r(   r-   r   r+   r.   _is_persistabler   r   max_is_last_usedr,   )rk   r  r  r   r0   r  r  has_used_varsr   rX   r2   last_use_varss               r   r  z,PipelineMemoryEstimator._estimate_max_memory  s    	
 % 	( 	(Hh''''  .	I .	IEArw   
 M.1DD 8 88++"7+++q0+++=009M9Mh: :0 "%%h///K%%~x ~ ~Xh5G5O ~ ~"*8"4W"=~ ~&/~ ~5>(ASTZA[5[~ ~ %'G~ ~ AC@R~ ~ ikh{~ ~   (!3F!;;I!$Z!;!;J%%h99 7 008DD7$L88%,,X666 Y77

  .. 	I 	I!!zh z zHX4Fv4N z z&x09z z"+z z1:Xh=OPV=W1Wz z !#z z =?<Nz z egdwz z   Xh/77	|++ ***hx.@.HH***	I % 	4 	4H|++\(33	*$$r   c                     | j         |         }t          d |                                D                       }|dk     rt          d          |S )aN  
        For a given type of program, calculate the increase memory usage.

        The increase memory usage is the memory usage of the variables that are setting to skip_gc_vars.
        Persistable variables are not included in the increase memory usage because they are allocated when
        running the startup program.
        c                     g | ]\  }}|S r   r   )rQ   rX   mems      r   rR   z@PipelineMemoryEstimator._get_increase_memory.<locals>.<listcomp>P  s    FFFvq#sFFFr   r   zONo size info for skip_gc_vars, please run estimate_memory to get var size info.)r  sumr   r/  )rk   r  r   increase_memorys       r   _get_increase_memoryz,PipelineMemoryEstimator._get_increase_memoryG  sa     0>FF1C1C1E1EFFFGGQa   r   c           	      2   i }|D ]\  }}|j         dv rt                      }|                    |           |j        |j        z   D ]O}|                    |          s|                    |          }|r |                     |||||j        v            P|S )Nr  )is_input)r   re   r   r+   r.   r   r  _update_var_info)	rk   r  r  r  rX   r2   r   r   r#  s	            r   r  z-PipelineMemoryEstimator._get_program_var_infoW  s      	 	EArw   
 !mmGr""".1DD  ((22 &>>rBB ))  !)R-?!?	 *    r   c                 2   |r|                     |          n|                    |          }||vrP|                    |dddd           |j        rd||         d<   d S |                     |          }|||         d<   d S ||         dxx         dz  cc<   d S )	Nr   r<   F)r  r  r   Tr   r  r  )get_serial_inputget_serial_output
setdefaultr   _get_var_size)rk   r   r#  r  r  rM   var_sizes          r   r  z(PipelineMemoryEstimator._update_var_infot  s     5G$$X...**844 	 8##1qGG    48"=1))#..H)1HXv&&&Xw'''1,'''''r   c                 Z    d |j         D             }|                     ||j                  S )Nc                 "    g | ]}|d k    rdn|S )r=   r<   r   )rQ   dims     r   rR   z9PipelineMemoryEstimator._get_var_size.<locals>.<listcomp>  s$    BBB#))QQBBBr   )r   _calculate_bytesr   )rk   rM   	var_shapes      r   r  z%PipelineMemoryEstimator._get_var_size  s/    BB	BBB	$$Y	:::r   c                 @   t           j        dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        dt           j	        di	}|rt          d |d          nd}|                    |d          }||z  S )Nr|  r  r   r<   c                     | |z  S r'   r   )r  r  s     r   r  z:PipelineMemoryEstimator._calculate_bytes.<locals>.<lambda>  s
    A r   r   )r   float64int64float32int32float16bfloat16int16int8uint8r   get)rk   r  r   dtype_to_sizetotal_countdtype_factors         r   r  z(PipelineMemoryEstimator._calculate_bytes  s    NAL!NAL!NAOQL!KL!

 9BHF%%y!444q 	 %((22\))r   c                 2    ||vrdS ||         d         dk    S )NFr  r   r   rk   r   r  s      r   r  z%PipelineMemoryEstimator._is_last_used  s'    8##5!'*a//r   c                 *    ||vrdS ||         d         S )NFr   r   r,  s      r   r  z'PipelineMemoryEstimator._is_persistable  s"    8##5!-00r   N)r   r   r   rl   r  r  r  r  r  r  r  r  r  r  r   r   r   r  r    s        2 2 2
4 4 48"% "% "%H@% @% @%D     :- - -(; ; ;* * *(0 0 01 1 1 1 1r   r  r'   )F)T)Nr  )r  )Cr  collectionsr   enumr   	functoolsr   r   paddle.baser   paddle.base.frameworkr   r	   6paddle.distributed.auto_parallel.static.dist_attributer
   -paddle.distributed.auto_parallel.static.utilsr   r   r   r   paddle.frameworkr   rP  auto_parallel.static.utilsr   VarDescVarTypeREADERSTEP_SCOPESDENSE_TENSOR_ARRAYFEED_MINIBATCH
FETCH_LISTr   INFOr   r   r%   r5   r:   rN   rc   re   r   r   r   r   r   r   r   r   r   r   r'  r)  r-  r7  rD  rJ  rv  r  r  r  r  r  r  r  r  r   r   r   <module>r?     s    # # # # # #                    4 4 4 4 4 4 4 4                      0 / / / / / 	LL$L+L'L#  
GL	!	!
/ / / / /T / / /     $ $ $     F/9 /9 /9d3
 3
 3
 3
 3
 3
 3
 3
l3 3 3
  4
 
 
(     F1 1 1h  .  . . . ..L L L L. . ..l l l l^2 2 2
, , ,%@ %@ %@P" " " "J  , ,1@1 @1 @1 @1F/ / /d	- 	- 	- 	-% % %P> > >2 	- - - -b 7:K1 K1 K1 K1\W W Wtk1 k1 k1 k1 k1 k1 k1 k1 k1 k1r   