
    Αi                    |   S SK r S SKrS SKrS SKrS SKrS SKJr  S SKrS SK	r	S SK
Jr  S SKJr  S SKJr  S SKJr  S SKJrJr  S SKJr  S	S
KJrJr  SSKJrJrJr  \R<                  R>                  r\R<                  RA                  5       r!\RD                  RF                  RH                  \RD                  RF                  RJ                  \RD                  RF                  RL                  \RD                  RF                  RN                  \RD                  RF                  RP                  /r)S/r*/ SQr+/ SQr,SS jr-S r.S r/S r0S r1S r2S r3S r4S r5S r6S r7S r8S r9S r:SS jr;SS jr<S  r=S! r>S" r?S# r@S$ rAS% rBSS& jrCS' rDS( rES) rFS* rG   SS+ jrHS, rI SS- jrJS. rKS/ rLS0 rMS1 rNS2 rOSS3 jrPS4 rQS5 rRS6 rSS7 rTSS8 jrUS9 rVS: rWS; rXS< rYS= rZS> r[S? r\S@ r]SA r^SB r_SC r`SD raSE rbSF rcSG rdSH reSI rfSJ rgSK rhSL ri " SM SN5      rjSO rkSP rlSQ rmSR rnSS roST rpSU rqSV rrSW rsSX rtSY ruSZ rvS[ rwS\ rxS] ryS^ rzS_ r{S` r|Sa r}Sb r~Sc rSd rSe rSf r   SSg jr      SSh jrSi rSj r\" \5      rSk rSl rSm rSn rSo rSp rSq rSr rSs\St\Su\4Sv jrSSw jrSx rSy rSz r SS{ jr    SS| jrS}\S~\4S jrS rg)    N)reduce)use_pir_api)pir)wrap_decorator)core)is_belong_to_optimizeris_parameter)Variable   )ProcessMeshmerge_process_meshes   )DistTensorSpecOperatorDistAttrTensorDistAttr	expand_v2)sumsqrtfill_constantelementwise_maxelementwise_divstack
reduce_sum)zbuiltin.combinezbuiltin.splitpd_op.pylayerzcf.yieldzcf.tuple_pushzcf.tuple_popzcf.stack_createzcf.has_elementsc                 D   [         R                  " U5      nSUl        UR                  (       d`  UR	                  U 5        [         R
                  " 5       n[         R                  " S5      nUR                  U5        UR                  U5        U$ UR	                  U 5        U$ )NFz>%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s)	logging	getLogger	propagatehandlerssetLevelStreamHandler	FormattersetFormatter
addHandler)	log_levelnameloggerlog_handler
log_formats        m/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/auto_parallel/static/utils.py
get_loggerr+   G   s    t$FF??	"++-&&L

 	  ,+& M 		"M    c                 D    U[        U 5      * :  a  U[        U 5      :  a  gg)NTF)len)listindexs     r*   is_valid_list_indexr1   W   s!    T
us4y0r,   c                     U S:w  a  ggNTF mappings    r*   is_dim_shardr8   ^       "}r,   c                     U S:X  a  ggr3   r5   r6   s    r*   is_dim_replicater;   e   r9   r,   c                 4   U c  g[        S U  5       5      (       d  g[        [        U 5      5       H(  nX   S:  d  X   [        UR                  5      :  d  M(    g   [        [        UR                  5      5       H  nU R	                  U5      S:  d  M    g   g)NFc              3   B   #    U  H  n[        U[        5      v   M     g 7fN)
isinstanceint).0ds     r*   	<genexpr>&verify_dims_mapping.<locals>.<genexpr>o   s     8<az!S!!<s   r4   r   T)allranger.   shapecount)dims_mappingprocess_meshis      r*   verify_dims_mappingrL   l   s    8<8883|$%?R<?c,:L:L6M#M & 3|))*+a 1$ , r,   c                    / nU  H  nUc  UR                  S5        M  UR                  UR                  R                  U5         S:X  a  UR                  S5        MX  UR                  UR                  R                  U5      5        M     U$ )Nr4   r   )appendrG   	dim_namesr0   )
shard_specrJ   rI   shards       r*   convert_to_dims_mappingrR   z   s}    L=# 6 6 < <U CDI# 6 6 < <U CD  r,   c                     / nU  H:  nUS:X  a  UR                  S 5        M  UR                  UR                  U   5        M<     U$ Nr4   )rN   rO   )rI   rJ   rP   dim_mappings       r*   convert_to_shard_specrV      sH    J#"d#l44[AB	 $
 r,   c                 v   [        U 5      [        U5      :w  a  gU  H3  nUb  [        U[        5      (       d    gUc  M"  X2R                  ;  d  M3    g   [	        X5      n[        XB5      (       d  g[        [        U5      5       H3  nXE   S:w  d  M  X   S:  d  M  X   UR                  XE      -  S:w  d  M3    g   g)NFr4   r   T)r.   r?   strrO   rR   rL   rF   rG   )rP   tensor_shaperJ   rQ   rI   rK   s         r*   verify_shard_specrZ      s    
:#l++Zs%;%;.D.D!D	 
 +:DL|::3|$%Or!!#,"4"4\_"EEJ & r,   c                 d    U (       d  g U S   nU  H  nUS:X  a  UnM  US:X  a  M  X:X  a  M    g    U$ )Nr   r4   r5   )dim_mappingscompatible_mappingr7   s      r*   compute_compatible_dim_mappingr^      sH    %a#!(]*   r,   c                     U (       d  g [        U S   5      nU  H#  nUc   S5       e[        U5      U:X  a  M   S5       e   / n[        U 6  H-  n[        [        U5      5      nUc    g UR	                  U5        M/     U$ )Nr   z8Dims mapping must not be None for compatible computationzKThe length of dims_mapping in list must be same for compatible computation.)r.   zipr^   r/   rN   )dims_mapping_listlengthrI   compatible_resultr\   compatible_dim_mappings         r*   compute_compatible_dims_mappingre      s    "1%&F)' 	
F	
' < F* 	
Y	
*	 * ./!?"
 ")  !78 0 r,   c                 P    S nU (       d  U$ U  H  nUc  M  Ub  X:X  a  UnM    g    U$ r>   r5   )process_mesh_listcompatible_process_meshrJ   s      r*   compute_compatible_process_meshri      s>    "&&)#'/*:*6' * #"r,   c                 ^   [        U 5      [        U5      :X  d   eSn/ n[        [        U 5      5       H2  n[        X   X   5      (       d   eUR                  X   X      5        M4     [	        U5      nUc  g[        [        U 5      5       H  nXPU   X      :w  d  M  XPU   X   '   SnM     U$ NFT)r.   rF   r1   rN   r^   )ra   
index_listchangedr\   rK   rd   s         r*   )compute_compatible_and_update_dim_mappingrn      s     !S_444GL3()*"#4#7GGGG-0?@ + <LI%3()*!q%9*-%HH2Ha /G + Nr,   c                 2    U [         R                  " 5       -   $ )z=
Append auto parallel suffix for distributed attribute name.
)r   kAutoParallelSuffixr&   s    r*   append_distributed_attr_suffixrr      s     $**,,,r,   c                 J    U R                  [        R                  " 5       5      $ )z>
Remove auto parallel suffix from distributed attribute name.
)stripr   rp   rq   s    r*   remove_distributed_attr_suffixru      s     ::d..011r,   c                    SSK Jn  Uc  U" 5       nUR                  5       (       d   S5       eU R                   H  nUR                  R                  5        HB  nUR                  U5      nUR                  U5      nUc  M*  UR                  5       (       a  MA      g   UR                   HB  nUR                  W5      nUR                  U5      n	U	c  M*  UR                  5       (       a  MA      g   M     g)Nr   get_default_distributed_contextz8Distributed attributes must be initialized before check.FT)dist_contextrx   is_initialized_for_programblocksvarsvaluesget_dist_tensor_for_graph get_tensor_dist_attr_for_programis_validopsget_dist_op_for_graphget_op_dist_attr_for_program)
programry   rx   blocktensordist_tensortensor_dist_attropdist_opop_dist_attrs
             r*   "check_distributed_attr_for_programr      s    =682244 B4 jj'')F&@@HK+LL  !,{7K7K7M7M * ))B"88@G'DDRHL(73C3C3E3E	    r,   c                     [         R                  " 5       nUR                  5         SSKJnJn  Uc  U" 5       n[        U SS9  O!U" 5       nU" U5        [        U SS9  U" U5        UR                  5         g)z
This function reuses the original program output ability with a distributed context.
Using lock can avoid multiple threads change the default distributed context simultaneously.
r   )rx   set_default_distributed_contextNT)flush)	threadingLockacquirery   rx   r   printrelease)r   ry   lockrx   r   original_default_contexts         r*   print_program_with_dist_attrr     se    
 >>DLLN
 68gT"#B#D '5gT"'(@ALLNr,   c                 d   X0;   d   SU SU  35       eU R                  U5      n[        X5      n[        X   5       Vs/ s H  oeSS PM	     nn[        X   5       H
  nXgU   U'   M     U Vs/ s H  n[        X5      PM     nnU V	s/ s H  oU	   PM	     n
n	[	        U
5      $ s  snf s  snf s  sn	f )af  
Given a rank and the processes mesh the rank belongs to,
compute the communication peers of the rank based on the give axis in the mesh.

Example: 16 processes managed in a 4-Dimensional mesh with shape of [2, 2, 2, 2].
the rank communication peers of rank 0 (included) are following:
in axis 0: [0, 1]
in axis 1: [0, 2]
in axis 2: [0, 4]
in axis 3: [0, 8]
zrank [z] is NOT in processes group N)r0   _linear_idx2coordinaterF   _coordinate2linear_idxsorted)	processesrG   axisrankrank_relative
coordinaterK   coordinates_in_groupranks_in_group_relativeidxranks_in_groups              r*   _get_comm_groupr   )  s      
29+> OOD)M'=J383EF3EaqM3EF 5;()Q%  
 /.J 	u1.   1HH0Gn0GNH.!! G Is   B#-B(B-c                 B    U R                  U5      n[        X5      nXR   $ )z
Given a rank and the processes mesh the rank belongs to,
compute the index of the rank in given axis.

Example: 27 processes managed in a 3-Dimensional mesh with shape of [3, 3, 3].
the index of rank 22 are:
in axis 0: 1
in axis 1: 1
in axis 2: 2
)r0   r   )r   rG   r   r   r   r   s         r*   _get_idx_in_axisr   L  s%     OOD)M'=Jr,   c                 X   [        U 5      [        U5      :X  d   SU  SU 35       e[        [        U 5      5       H2  nX   S:  d   SU SU 35       eX   X   :  a  M$   SU SU  SU 35       e   U S   nUS   n[        [        U 5      S	-
  SS5       H  nXCX   -  -  nX0U   -  nM     U$ )
a[  
convert a coordinate in multidimensional mesh space into a scala idx in linear space.

it use Row-major order for dimension conversion.
so it has:  [most_significant_dim, ..., least_significant_dim]
assume:

    the size of i-th dimension to be:  S[i]
    the index of j-th dimension is: I[j]

linear_idx of a n dimensional coordinate is:

    I[n-1] * (S[n-2] * S[n-3] * S[n-4] *     ....    S[0]) +
    I[n-2] * (         S[n-3] * S[n-4] *     ....    S[0]) +
    I[n-3] * (                  S[n-4] *     ....    S[0]) +
    ...
    I[1]   * (                                       S[0]) +
    I[0]

zCcoordinate should have the same size as mesh shape, but got shape: z, coordinate: r   zindex in dimension [z"] is least than zero. coordinate: z"index beyond extent in dimension [z
]. shape: r4   r   )r.   rF   )
mesh_shaper   rK   base
linear_idxs        r*   r   r   _  s    : z?c*o- 
Mj\Yghrgst- 3z?#}! 	
"1#%G
|T	
! }z}, 	
0:j\XbWcd	
,	 $ b>DBJ 3z?Q&B/Z]**
1 0 r,   c                    US:  d   SU S35       eU[         R                  " U 5      :  d   SU  SU 35       eSnS/[        U 5      -  n[        [	        [        U 5      5      5       H   nX-  n[        XPU   -  5      X4'   X U   -  nM"     U$ )a  
mapping a linear scala into multidimensional mesh space, return it coordinate in that space.

it is the inverse function of _coordinate2linear_idx.
assume:

    the size of i-th dimension to be:  S[i]
    the index of j-th dimension is: I[j]

the coordinate given linear_idx is:

    I[0] = linear_idx                                  % S[0]
    I[0] = (linear_idx / S[0])                         % S[1]
    I[0] = (linear_idx / (S[0] * S[1]))                % S[2]
    ....

r   zlinear index [z] is least than zeroz5linear index beyond the extent of mesh shape. shape: z, linear index: r   r4   )npprodr.   reversedrF   r@   )r   r   r   r   rK   offsets         r*   r   r     s    & ?MnZL8LMM?
++ 
?
|K[\f[gh+ DJ'JeC
O,-"F]23
1 . r,   c                 N   S nU R                    H_  nX$R                  ;   d  M  UR                  UR                  :X  d  M0  [        UR                  UR                  R	                  U5      5      n  O   Ub#  UR                  [        WR                  U5         $ UR                  S   $ Nr   )process_meshesprocess_idsrG   r   r0   r   )ry   target_meshr   r   meshs        r*   _get_corresponding_rankr     s    
 J++###

k6G6G(G/

D,,2248J  , &&"4::z:
 	
 &&q))r,   c                 h   U R                   nUR                  nUR                  R                   n[        U5      [        U5      :X  d   SU SU S35       e/ n[	        [        U5      5       HC  nX&   S:X  d  X6   S:X  a  UR                  X&   5        M(  UR                  X&   XCU      -  5        ME     U$ )Nzvariable shape [z] and dim_mapping [z] is NOT match !r4   )rG   rI   rJ   r.   rF   rN   )var	dist_attr	var_shaper7   r   	new_shaper   s          r*   _get_unshard_dist_shaper     s    		I$$G!!''Dy>S\) 
9+%8	AQR) IS^$>R7<2#5Y^,Y^d3<.@@A	 % r,   c                 J   SSK Jn  Uc  U" 5       nU R                  5        H  nUR                  (       d  M  UR	                  U5      n[        XE5      nUR                  R                  U5        UR                  nS/[        U5      -  nXul        UR                  XE5        M     g )Nr   rw   r4   )ry   rx   	list_varsis_datar   r   desc	set_shaperI   r.    set_tensor_dist_attr_for_program)dist_main_progdist_startup_progry   rx   r   r   inverse_shaperU   s           r*   make_data_unshardr     s    =68'');;;+LL  4CJMHH}-*77K$[!11K,7)99#P *r,   c                 <   SSSS.nU (       d  U$ [        U [        5      (       d  [        S[        U 5       S35      eU R	                  5        HK  u  p#US;  a  [        SU S35      e[        U[        5      (       d  [        S[        U5       S35      eX1U'   MM     U$ )z(Update default addition_info with inputsr   )epochbatch
batch_sizez7The type of 'addition_info' should be 'dict', but got ''.z[The key of 'addition_info' should be one of the ['epoch', 'batch', 'batch_size'], but got 'z7The value of 'addition_info' should be 'int', but got ')r?   dict	TypeErrortypeitems
ValueErrorr@   )addition_infoadd_infoitemvalues       r*   _update_addition_infor     s    Qa8Ht,,]+,B0
 	

 )..0KD;; BBFrK  eS))   $U}B0  #TN 1 r,   c                 D   U (       d  U $ [        U [        5      (       ak  U  Hc  n[        U[        5      (       d  [        S[	        U5       S35      e[
        R                  R                  U5      (       a  MV  [        SU S35      e   U $ [        S[	        U 5       S35      e)z!Validity check of input file pathz0The type of file path should be 'str', but got 'r   zThe file path 'z' does not exist.z1The type of file path should be 'list', but got ')	r?   r/   rX   r   r   ospathexistsr   )	file_pathfiles     r*   _check_valid_pathr     s    	It	$	$DdC((  $T
|2/  77>>$'' ?4&8I!JKK  Y(,
 	
r,   c                    U (       d  [        S5      e[        U [        5      (       d  [        S[	        U 5       S35      eU R                  5        Hs  u  p[        U[        5      (       d  [        S[	        U5       S35      e[        U[        R                  R                  5      (       a  M]  [        S[	        U5       S35      e   U $ )Nz'param_dict' cannot be None.z4The type of 'param_dict' should be 'dict', but got 'r   z:The type of key of 'param_dict' should be 'str', but got 'zDThe type of value of 'param_dict' should be 'DenseTensor', but got ')
r   r?   r   r   r   r   rX   paddler   DenseTensor)
param_dictr&   r   s      r*   _check_param_dictr      s    788
D))Z()-
 	

 &++-KDdC((  $T
|2/  eV[[%<%<==  $U}B0  . r,   c                    U (       d  U $ [        U [        5      (       d  [        S[        U 5       S35      eU R	                  5        H  u  p[        U[
        5      (       d  [        S[        U5       S35      e[        U[        5      (       d  [        S[        U5       S35      e/ SQn[        UR                  5       5      U:w  d  M  [        SUR                  5        S35      e   U $ )	Nz3The type of 'dist_attr' should be 'dict', but got 'r   z@The type of param name of 'dist_attr' should be 'str', but got 'z=The type of distributed attribute should be 'dict', but got ''process_shapeprocess_grouprI   rO   ziThe key of distributed attribute should be '['process_shape', 'process_group', 'dims_mapping']', but got .)	r?   r   r   r   r   rX   r/   keysr   )r   r&   r   attrs       r*   _check_dist_attrr   7  s    	4((Y(,
 	

 %??,KDdC((  $T
|2/  eT**  $U}A/ D EJJL!T) $zz|nA/ % -. r,   c                    SSK Jn  [        U [        R                  R
                  5      (       d   e[        U[        5      (       d   eUc  U" 5       n[        U5      nU(       d  [        XU5        [        XU5        g[        S5      e)a  
Save model parameter state, optimizer state, distributed attribute and
additional information of each rank.

Args:
    program(Program): The program to be saved.
    checkpoint_path(str): The path of the checkpoint file to be saved.
    dist_attr_path(str): The path of distributed attribute file to be saved.
    addition_info(dict, optional): Additional information, key should be selected in ['epoch', 'batch', 'batch_size'].
        Default values are 0, when 'addition_info' is None. Default: None.
    is_integrated(bool, optional): Whether to integrate param before save. Default: False.
    dist_context(DistributedContext ,optional): collect related distributed information for program

Returns:
    None

Examples:
    .. code-block:: python

        >>> import os
        >>> from paddle.distributed.auto_parallel.static.utils import save_distributed_checkpoint

        >>> step = 16000
        >>> global_batch_size = 32
        >>> path = os.path.join("./output", "step_%d" % step)
        >>> os.makedirs(path, exist_ok=True)
        >>> program = paddle.static.Program()

        >>> add_info = {'batch': step, "batch_size": global_batch_size}
        >>> save_distributed_checkpoint(program, path, path, add_info)

r   rw   Nz/Integrating parameter has not been implemented.)ry   rx   r?   r   staticProgramboolr   _save_distributed_state_dict_save_distributed_attributeNotImplementedError)r   checkpoint_pathdist_attr_pathr   is_integratedry   rx   s          r*   save_distributed_checkpointr   Z  sy    P >gv}}445555mT****68)-8M$W_M#G\J "=
 	
r,   c                     [        U 5      (       d   S5       e[        U5      (       d   S5       e[        U 5      n[        U5      nUS   nUS   nXCU4$ )ab  
Load parameter, optimizer, distributed attribute and addition_info.

Args:
    checkpoint_path(list[str]): model parameter file path, must be in order of rank id.
    dist_attr_path(list[str]): distributed attribute file path, must be in order of rank id.

Returns:
    param_dict(dict): parameters' value of all ranks.
    dist_attr(dict): parameters' distributed attribute.
    addition_info(dict): additional information user saved in last training.

Notes:
    The return, 'addition_info', is belonging to the first file of checkpoint_path by default.

Examples:
    .. code-block:: python

        >>> # doctest: +SKIP('Depends on external files.')
        >>> from paddle.distributed.auto_parallel.static.utils import load_distributed_checkpoint

        >>> ckpt_path = [
        ...     './model_state_rank0.pdmodel',
        ...     './model_state_rank1.pdmodel',
        ... ]
        >>> dist_attr_path = [
        ...     './dist_attr_rank0.pdattr',
        ...     './dist_attr_rank1.pdattr',
        ... ]
        >>> param_dict, dist_attr, add_info = load_distributed_checkpoint(ckpt_path, dist_attr_path)
!'checkpoint_path' cannot be None. 'dist_attr_path' cannot be None.modelr   )r   _load_distributed_state_dict_load_distributed_attribute)r   r   state_dict_infor   r   r   s         r*   load_distributed_checkpointr     sl    @ _-- +- ^,,P.PP,2?CO+N;I )J#O4M-//r,   c                 \   SSK Jn  [        U[        R                  R
                  5      (       d   e[        U 5      (       d   S5       e[        U5      (       d   S5       eUc  U" 5       n[        U 5      n[        U5      n[        X#5      nUS   nUS   n	[        XU5      n
[        X5        U	$ )a  
Load parameter, optimizer, distributed attribute and addition_info into model.

Args:
    checkpoint_path(list[str]): model parameter file path, must be in order of rank id.
    dist_attr_path(list[str]): distributed attribute file path, must be in order of rank id.
    program(Program): the program to be updated with checkpoint_path.
    dist_context(DistributedContext ,optional): collect related distributed information for program

Returns:
    addition_info(dict): user saved in last train.

Notes:
    The return, 'addition_info', is belonging to the first file of checkpoint_path by default.

Examples:
    .. code-block:: python

        >>> # doctest: +SKIP('Depends on external files.')
        >>> from paddle.distributed.auto_parallel.static.utils import load_checkpoint_into_program

        >>> exe.run(startup_program)
        >>> ckpt_path = [
        ...     './model_state_rank0.pdmodel',
        ...     './model_state_rank1.pdmodel',
        ... ]
        >>> dist_attr_path = [
        ...     './dist_attr_rank0.pdattr',
        ...     './dist_attr_rank1.pdattr',
        ... ]
        >>> load_checkpoint_into_program(ckpt_path, dist_attr_path, main_program)
r   rw   r   r   r   r   )ry   rx   r?   r   r   r   r   r   r   get_dist_attrmerge_and_slice_parameterload_parameter_into_program)r   r   r   ry   rx   all_state_dict_infoall_pre_dist_attrall_cur_dist_attrall_param_dictr   sliced_param_dicts              r*   load_checkpoint_into_programr    s    F >gv}}445555_-- +- ^,,P.PP,686G3NC%g<(1N'8M1+<   1;r,   c                     [        U [        5      (       d   eU(       a)  [        U[        R                  R                  5      (       d   eU (       d  gUR                  U 5        g)z
Load parameters into program.

Args:
    param_dict(dict): parameters' name and value.
    program(Program): the program to be updated
N)r?   r   r   r   r   set_state_dict)r   r   s     r*   r  r    sJ     j$''''z'6==+@+@AAAA:&r,   c                 <   [         R                  R                  5       n[        R                  R                  USU S35      n[        X5      [         R                  R                  5       S.n[         R                  " XT5        [        R                  " SU S35        g)z,Save distributed attribute of all parametersdist_attr_rankz.pdattr)r   
world_sizez(Already saved distributed attribute to 'r   N)r   distributedget_rankr   r   joinr   get_world_sizesaver   info)r   r   ry   rank_iddist_attr_namedist_attr_dicts         r*   r   r     s       ))+GWW\\.	9N w5((779N KK/LL;N;K2NOr,   c                     0 nU  H[  n[         R                  " U5      nUS   nU[        U 5      :X  d   S5       eUS   R                  5        H  u  pVXQ;  d  M  XaU'   M     M]     U$ )z:Load parameters' distributed attribute from dist_attr_pathr  zMThe number of 'dist_attr_path' must be equal to the last training world size.r   )r   loadr.   r   )r   total_dist_attrdist_attr_filer   pre_world_sizer&   r   s          r*   r   r     sx    O(KK/	"<0^!44 	
[	
4 $G,224JD*(,% 5 ) r,   c                 H   [         R                  R                  5       n[        R                  R                  USU S35      nU R                  5       [         R                  R                  5       US.n[         R                  " XT5        [        R                  " SU S35        g)zSave parameters' state_dictmodel_state_rankz.pdmodel)r   r  r   zAlready saved model to 'r   N)r   r  r  r   r   r  
state_dictr  r  r   r  )r   r   r   r   ckpt_file_namer  s         r*   r   r   '  s    &&(DWW\\+D6:N ##%((779&J
 KK
+LL+O+<B?@r,   c                 x   0 n[        U 5       H  u  p#[        R                  " USS9nUS   nU[        U 5      :X  d   S5       eUS:X  a  US   nUS   R	                  5        HL  u  pxXq;   a)  X   R                  [        R                  " U5      5        M3  [        R                  " U5      /X'   MN     M     UWS.n	U	$ )	z0Load parameters' state_dict from checkpoint_pathT)return_numpyr  zNThe number of 'checkpoint_path' must be equal to the last training world size.r   r   r   )r   r   )	enumerater   r  r.   r   rN   r   array)
r   all_state_dictr   	ckpt_filer   r  r   r&   r   r  s
             r*   r   r   6  s    N#O4 ++idC(6_!55 	
\	
5 !8+O<M*7399;KD%$++BHHUO<(*'8$	 < 5  & r,   c                    0 n[        5       (       Ga  U R                  5       R                  nU H  nUR                  5       S:X  dH  UR                  5       S:X  d  M-  UR	                  S5      (       d  ME  UR                  5       S   (       d  M_  UR                  nUR                  S5      R                  5       nUR                  5       S:X  a  UR                  S5      OUR                  S5      nUR                  nUR                  UR                  UR                  UR                  S.X''   M     U$ SS	KJn	  [#        U [$        R&                  R(                  5      (       d   eUc  U	" 5       nU R+                  5        H  n
[-        U
5      (       d  [/        U
5      (       d  M%  UR1                  U
5      nUR                  nUR                  nUR                  R                  nUR                  UR                  UUS.X*R                  '   M     U$ )
zc
Get distributed attribute of current rank.

Args:
    program(Program): main program for training
zbuiltin.parameterz
pd_op.datapersistabler   parameter_namer&   r   r   rw   )r   global_blockr   r&   has_attrattrsr   resultas_tensor_dist_attrstr_attrrJ   rG   r   rI   rO   ry   rx   r?   r   r   r   r   r	   r   r   )r   ry   r   r   r   r   var_dist_attrvar_namerJ   rx   r   r   rI   rO   s                 r*   r   r   N  s    I}}""$((Bwwy//	\)KK..HHJ}--!|| , 3 3A 6 J J L wwy$77 KK 01V, 
  -99%1%7%7%1%=%=$1$>$>!-!7!7	'	# P ' 	B'6==#8#89999:<L$$&CC  $:3$?$? AA#F !  0<</<<,99CC	%1%7%7%1%=%=$0!*	'	((# ' r,   c                    [        U5      (       d   S5       e[        U [        5      (       d   S[        U 5       S35       eU R	                  5        Hi  u  p4[        U[
        5      (       d  [        S[        U5       S35      e[        U[        5      (       a  [        S U 5       5      (       a  M`  [        S5      e   Uc  0 $ / n/ n[        R                  " S5        UR                  5        H  nXq;  a  UR                  U5        M  X   nX'   n	X:X  a?  [        R                  R                  5       n
U	S   R!                  U
5      nX   U   nXU'   Mg  X   nUS	   nU	S	   n[#        [%        U5      5      S
:  d  SU;  a  ['        X5      nUX'   O	US   nUX'   [#        [%        U5      5      S
:  d  SU;  d  M  [)        UU	5      nUX'   M     U H,  nXr;  d  M
  UR                  U5        U R+                  U5        M.     U(       a  [,        R.                  " SU S35        U(       a  [,        R.                  " SU S35        U $ )au  
Merge parameters with previous dist_attr and slice parameters with current dist_attr

Args:
    dist_param_dict(dict): parameters' value of all ranks.
    pre_dist_attr(dict): parameters' dist_attr of last training process.
    cur_dist_attr(dict): parameters' dist_attr of current training process.

Returns:
    dist_param_dict(dict): parameters' value of current rank.
z'pre_dist_attr' cannot be None.z8The type of 'dist_param_dict' should be 'dict', but got r   zXThe key of 'dist_param_dict' is parameter's name, and its type should be 'str', but got c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7fr>   )r?   r   ndarray)rA   vs     r*   rC   ,merge_and_slice_parameter.<locals>.<genexpr>  s!      2
/4!Jq"**%%us   ')zoThe value of 'dist_param_dict' is parameter's value of all ranks, and its type should be 'list(numpy.ndarray)'.z$Start to merge and slice parameters.r   rI   r   r4   r   zParameters 'z)' are not found in last training process.z,' are not found in current training process.)r   r?   r   r   r   rX   r   r/   rE   r   r  r   rN   r   r  r  r0   r.   set_merge_parameter_with_dist_attr_slice_parameter_with_dist_attrpopwarningswarn)dist_param_dictpre_dist_attrcur_dist_attrr&   r   param_not_in_preparam_not_in_curr0  pre_attrcur_attrr  r0   param	pre_parampre_dims_mappingcur_dims_mappingcomplete_paramsliced_params                     r*   r  r    s    M**M,MM*ot,, 
B4CXBYYZ[, ',,.$$$99=dAG  %&&c 2
/42
 /
 /
 @  / 	LL78!&&((##H- * *((113G_-33G<E#-e4E(-H%#-	#N3#N3s#$%)R7G-G<N )7O%&q\N(6O%s#$%)R7G-G:L )5O%? )B "(##H-) "
 +,,UV	
 +,,XY	
 r,   c                 j   SSK Jn  US   nUS   nUS   nUR                  U S   R                  XC5      n/ n/ nU HO  n	UR	                  XX4U5      n
UR                  U	5      nX;  d  M.  UR                  U
5        [        UX   U
U5        MQ     [        U5      S:X  d  U(       a   S5       eUS   S   nU$ )z*Merge parameter with distributed attributer   	ResharderrI   r   r   r   zFail to merge parameter)	reshardrK  compute_complete_shaperG   compute_partition_indexr0   rN   _merge_parameterr.   )
param_listr   rK  rI   r   r   complete_shapepartition_param_listmerged_partitionprocesspartition_indexr0   rG  s                r*   r7  r7    s    "^,Lo.Mo.M551]N  #;;\-
 ##G,2##O4$!	 ! #$)1E !E *!,Q/Nr,   c                 p   [        U [        R                  R                  5      (       a  [        R
                  " U 5      OU n US   nUS   nUS   n[        U R                  X#U5      n[        X[        U5      5      n[        R                  R                  5       n[        XpR                  X#U5      nXh   n	U	$ )z*Slice parameter with distributed attributerI   r   r   )r?   r   r   r   r   r#  _get_split_indicesrG   _slice_parameterr.   r  r  _get_sliced_param_index)
rC  r   rI   r   r   partition_index_listsliced_param_listr  sliced_param_indexrH  s
             r*   r8  r8    s     &eV[[-D-DEE5 
 ^,Lo.Mo.M-\- )S)=%>   ))+G0l= %8Lr,   c                 *   SSK Jn  [        U 5      S:X  a<  Sn[        U S   S   5       H  u  pgUS   S:w  d  US   X6   :w  d  M  Sn  O   U(       a  gU (       d  U R	                  X45        gSnU[        U 5      :  a  UR                  X   S   U5      u  n	n
nU	S:w  a_  U
S:X  a  [        R                  " X   S   U4U	S9nO[        R                  " XU   S   4U	S9nU R                  U5        [        U UUU5        gUS-  nU[        U 5      :  a  M  gg)	a  
Merge partial parameters to a complete one.

Returns:
    None

Examples:
    .. code-block:: python

        >>> import numpy as np
        >>> from paddle.distributed.auto_parallel.static.utils import _merge_parameter

        >>> partition_param_list = [(np.array([[[1.11, 1.12]]]), [[0, 1],[0, 1],[0, 2]])]
        >>> param = np.array([[[1.13, 1.14]]])
        >>> partition_index = [[0, 1],[0, 1],[2, 4]]
        >>> complete_shape = [2, 2, 4]

        >>> _merge_parameter(partition_param_list, param, partition_index, complete_shape)
        >>> print(partition_param_list)
        [(array([[[1.11, 1.12, 1.13, 1.14]]]), [[0, 1],[0, 1],[0, 4]])]

r   rJ  Tr   FNr4   r   )
rL  rK  r.   r"  rN   compute_concat_infor   concatenater9  rO  )rR  rC  rU  rQ  rK  is_complete_datar   r   rK   concat_axisfirst_ordernew_partition	new_params                r*   rO  rO    sI   2 #
 A%"#7#:1#=>ICAw!|tAw.*==#(  ? ##U$<=#*++
 --$'*O	 b !# "-03U;+!I !#Q 7 :;+!I %((+ (!"	 FA5 #*++r,   c                    Uc  U R                  5       nUR                   GH  nUR                  5        H  n[        XS9  M     UR	                  5       [
        ;   a  M>  UR                  b  MM  / n/ n/ nUR                  5        H  nUR                  5       nUcP  UR                  [        R                  " 5       5        S n	UR                  5       R                  n
U
b  U
R                  n	OUR                  U5        UR                  n	U	c  M  X;  d  M  UR                  U	5        M     UR                  5        Hz  nUR                  5       nUc&  UR                  [        R                  " 5       5        M<  UR                  U5        UR                  U;  d  M_  UR                  UR                  5        M|     [        U5      S:  d  GM  [        U5      S:X  a  US   nO[        U5      n[        R                   " UUU5      Ul        GM     g )N)r   r   r   )r)  r   r{   _complete_op_dist_attrr&   partition_skip_op_listr   operands_sourcerN   r   	Attributeget_defining_oprJ   resultsr.   r   create_op_dist_attribute)r   r   r   	sub_blockmeshesoperand_attrsresult_attrsoperandtmp_attr
value_meshtmp_op_dist_attrr,  r   s                r*   rg  rg  Y  s   }$$&iiI"7< %779..<<FML--/",,.#!((9!%J'.'>'>'@'J'J$'3%5%B%B
!((2!)!6!6J)j.FMM*- 0 **,!++-# ''8 ''1,,F:h&;&;< ' 6{Qv;!#!!9D/7D";;!  K r,   c           	          / n[        U R                  5      U-
  n[        R                  " XU   US9nUS:X  a  U$ U H!  nUR	                  [        XaUS-
  5      5        M#     U$ )a  
Slice a complete parameter.

Returns:
    sliced_param_list(list): sliced parameters with 'partition_index_list'

Examples:
    .. code-block:: python

        >>> import numpy as np
        >>> from paddle.distributed.auto_parallel.static.utils import _slice_parameter

        >>> complete_param = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]])
        >>> rank = 2
        >>> complete_shape = [1, 1, 6]
        >>> dims_mapping = [-1, -1, 0]
        >>> process_shape = [3]
        >>> process_group = [0, 1, 2]

        >>> sliced_param_list = _slice_parameter(complete_param, [[], [], [2, 4]], 3)
        >>> print(sliced_param_list)
        [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])]

r^  r   )r.   rG   r   splitextendrX  )rG  rZ  rb   r[  r   rH  rC  s          r*   rX  rX    st    2 ~##$v-D88T2L {  U&1*E	
  r,   c                     SSK Jn  UR                  XX#U5      nSn[        U5       H>  u  pX(   S:X  a  U	n
O	XX(      -  n
U
S:X  a  Xh   S   nOXh   S   S-   U
-  nXyU
-  -  U-   nM@     U$ )a  
Get sliced_param's index of current rank in all sliced parameters list.

Returns:
    sliced_param_index(int): the index of sliced param in sliced_param_list

Examples:
    .. code-block:: python

        >>> import numpy as np
        >>> from paddle.distributed.auto_parallel.static.utils import _get_sliced_param_index

        >>> complete_param = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]])
        >>> rank = 2
        >>> complete_shape = [1, 1, 6]
        >>> dims_mapping = [-1, -1, 0]
        >>> process_shape = [3]
        >>> process_group = [0, 1, 2]

        >>> slice_param = _slice_parameter(complete_param, [[], [], [2, 4]], 3)
        >>> print(slice_param)
        [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])]

        >>> index = _get_sliced_param_index(rank, complete_shape, dims_mapping,
        ...                                 process_shape, process_group)
        >>> print(index)
        2
r   rJ  r   r4   )rL  rK  rN  r"  )r   rQ  rI   r   r   rK  rU  r\  rK   rG   slice_shaper0   s               r*   rY  rY    s    > #77l=O n-?b K#AAK!#&q)E$'*Q.;>E/K3GH5P . r,   c                 2   SSK Jn  / nU HQ  nUR                  X`XU5      nU(       a2  [        [	        U5      5       H  nXX   R                  Xx   5        M     MO  UnMS     [        [        S UU 5      5      nU V	s/ s H  n	[        U	5      PM     nn	U$ s  sn	f )a  
Get split indices of every dimension.

Returns:
    split_indices_list(list): the split indices of every dimension of the parameter

Examples:
    .. code-block:: python

        >>> import numpy as np
        >>> from paddle.distributed.auto_parallel.static.utils import _get_split_indices

        >>> complete_param = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]])
        >>> complete_shape = [1, 1, 6]
        >>> dims_mapping = [-1, -1, 0]
        >>> process_shape = [3]
        >>> process_group = [0, 1, 2]

        >>> index = _get_split_indices(complete_shape, dims_mapping, process_shape, process_group)
        >>> print(index)
        [[], [], [2, 4]]
r   rJ  c                 :    [        [        U 5      U1-
  S1-
  5      $ r   )r/   r6  xys     r*   <lambda>$_get_split_indices.<locals>.<lambda>  s    c!fslaS01r,   )	rL  rK  rN  rF   r.   rx  r/   mapr   )
rQ  rI   r   r   rK  split_indices_listrT  rU  dimr~  s
             r*   rW  rW    s    2 # #;;\-
 S12"'../CD 3 "1 ! 1	
 .@@-?&)-?@ As   ;Bc                     [        U R                  S5      5      n[        U R                  ;   =(       a?    U[        [        R
                  5      :H  =(       d    U[        [        R                  5      :H  $ )Nop_role)r@   r   OP_ROLE_KEY
attr_namesOpRoleForwardLossr   r  s     r*   is_forward_opr    sO    "'')$%G"--' 3v~~&&E'S5E*Er,   c                     [         U R                  ;   =(       a:    [        U R                  5       [            5      [        [        R
                  5      -  $ r>   )r  r  r@   	all_attrsr  Backwardr   s    r*   is_backward_opr    >    "--' C
{#-FOO- r,   c                     [         U R                  ;   =(       a:    [        U R                  5       [            5      [        [        R
                  5      -  $ r>   )r  r  r@   r  r  Optimizer  s    r*   is_optimize_opr    r  r,   c                     [         U R                  ;   =(       aD    [        U R                  5       [            5      [        [        R
                  R                  5      -  $ r>   )r  r  r@   r  r  r  LRSchedr  s    r*   is_lr_sched_opr  $  sD    "--' %C
{#-FOO##$-% %r,   c                     [         U R                  ;   =(       aT    [        U R                  5       [            5      [        [        R
                  5      [        [        R                  5      -  :H  $ r>   )r  r  r@   r  r  r  r  r  s    r*   
is_loss_opr  *  sL    "--' 2C
{#-
fnn
FKK 0
0-2 2r,   c                     [         U R                  ;  a  g[        U R                  5       [            5      nU[        [        R
                  5      -  =(       a    U[        [        R                  5      -  $ )NF)r  r  r@   r  r  r  r  r  s     r*   is_loss_grad_opr  0  sM    "--'",,.-.GS))HgFKK8H.HHr,   c                     U R                   R                  S5      =(       a*    U R                   R                  S5      R                  S5      $ )Nop_namescopez/gradient_clip)r   r*  r   
startswithr  s    r*   is_gradient_clip_opr  7  s;    77N+ #1j!"#r,   c                     U R                   R                  S5      =(       a    SU R                   R                  S5      ;   $ )Nr  /auto_parallel/reshard)r   r*  r   r  s    r*   is_reshard_opr  =  s8    77 C
"bggll>&B
BCr,   c                 8    U R                   R                  S5      $ )N_p)r   endswithr  s    r*   
is_prim_opr  C  s    77D!!r,   c                 $    U R                  S5      $ )Nring_id)r*  r  s    r*   
is_comm_opr  G  s    ;;y!!r,   c                    / nU R                    HT  n[        U5      (       d  M  [        UR                  R	                  5       5      S:X  d   S5       eUR                  U5        MV     [        U5      S:X  d   S5       eUS   $ )Nr   z#loss op should only output loss varz"num of loss op is not equal to oner   )r   r  r.   r   output_arg_namesrN   )r   loss_opsr   s      r*   get_loss_opr  K  sz    Hiib>>rww//12a7 57 OOB  x=ACCCA;r,   c                    [        5       nX%l        [        U[        [        R
                  45      (       a  [        U5      Ul        O?[        U[        R                  5      (       a  X5l        O[        U S[        U5       35      eUR                  S5      (       a"  UR                  S5        UR                  S5        UR                  S5      (       a
  US   Ul        U R                  X5        U$ )Nz8 must be a instance of ProcessMesh or list, but receive mark_annotatedrI   rJ   chunk_id)r   rI   r?   r/   r   r3  r   rJ   r   r   r   getr  r  r   )ry   r   rI   rJ   kwargsr   s         r*   set_var_dist_attrr  X  s    %'$0!,rzz 233(3L(A%	L$"2"2	3	3(4%nTUYZfUgThi
 	
 zz"##''7''7zz*$*:$6!11#Hr,   c                 b   Uc   eUc   e[        5       nU R                  R                  5        H  nUR                  Xb5        M     U R                  R	                  5        H  nUR                  Xr5        M     Xl        UR                  S5      (       a
  US   Ul        UR                  X5        g )Nr  )
r   r   input_arg_namesset_input_dims_mappingr  set_output_dims_mappingrJ   r  r  set_op_dist_attr_for_program)new_oprJ   ref_mappingctxr  new_op_dist_attrinput_varnameoutput_varnames           r*   6naive_set_dist_op_attr_for_program_by_mesh_and_mappingr  m  s     ###"""')446//K 7 ++66800M 9 %1!zz*$*:$6!$$V>r,   c                 0   Uc   e[        5       nU R                  R                  5        HJ  nU R                  R	                  U5      nUR                  U5      R                  nUR                  XW5        ML     U R                  R                  5        HJ  nU R                  R	                  U5      nUR                  U5      R                  nUR                  X5        ML     Xl
        SU;   a
  US   Ul        SU;   a
  US   Ul        UR                  X5        g )Nis_recomputer  )r   r   r  r   r   r   rI   r  r  r  rJ   r  r  r  )	r  rJ   r  r  r  r  r   r7   r  s	            r*   *naive_set_dist_op_attr_for_program_by_meshr    s     ###')446ll}-66s;HH//G 7 !++668ll~.66s;HH00I 9
 %1!(.~(>%V$*:$6!$$V>r,   c           	         SnU R                   nU R                  R                  nUR                  5       S:X  d  UR                  5       S:X  a  gUR	                  5       n/ nSU;   a  UR                  S5      n/ nUR                  5        H  nU R                  U5      nUR                  (       a  M'  UR                  U5      n	[        U	5      S:  a;  [        U	SS  5       H)  u  pUS:X  a  M   UR                  5        SU
 SU S	35       e   [        U	5      S:  d  M  UR                  U	S
   5        M     UR                  5        GH1  nU R                  U5      nUR                  (       a  M(  UR                  U5      n	Xu;  aq  [        U	5      S:  a;  [        U	SS  5       H)  u  pUS:X  a  M   UR                  5        SU
 SU S	35       e   [        U	5      S:  a  UR                  U	S
   5        M  M  U	S
   S:X  d   UR                  5        SW S	35       e[        U	5      S:  a;  [        U	SS  5       H)  u  pUS:X  a  M   UR                  5        SU
 SU S	35       e   UR                  U	S   5        GM4     [!        U5      nUc   S5       eUR                  5        HY  nU R                  U5      nUR                  (       a  M'  UR                  U5      n	[        U	5      S:  d  MI  XS
   :w  d  MS  XS
'   SnM[     UR                  5        Hp  nU R                  U5      nUR                  (       a  M'  UR                  U5      n	Xu;  a#  [        U	5      S:  a  XS
   :w  a  XS
'   SnM\  M^  M`  XS   :w  d  Mj  XS'   SnMr     U$ )NFrG   sliceXShaper   r4   zD only the batch dimension (0-dim) can be sharded, but the dimension z is sharded by z part.r   z^ only the batch dimension (1-dim) of XShape can be sharded, but the dimension 0 is sharded by r   zN only the batch dimension (1-dim) of XShape can be sharded, but the dimension #There is no compatible dim mapping.T)r   	serial_opr   r   output_namesoutputr  get_serial_inputr	   get_input_dims_mappingr.   r"  rN   r  get_serial_outputget_output_dims_mappingr^   )r   rm   r   op_descr  xshape_arg_namesbatch_dim_mappingsarg_nameserial_tensorrI   r   r7   rd   s                r*   +update_op_dims_mapping_by_default_dist_implr    s   G$$L$$G||~ GLLNg$='')L<">>(3++-00:%%#::8D|q  ),qr*: ;"} ||~&&jknjoo~  @G  H  HN  O} !< |!%%l1o6 . ,,.11(;%%#;;HE+< 1$$-l12.>$?LC"b= "<<>**norns  tC  DK  CL  LR  S= %@ < A%")),q/: &  ?b( <<>"  #A  BI  AJ  JP  Q( < 1$$-l12.>$?LC"b= "<<>**xy|x}  ~M  NU  MV  V\  ]= %@ %%l1o6- /0 <<NO!- -- ++-00:%%#::8D|!&<Q&O4OG . ,,.11(;%%#;;HE+L!Q&*1o="8Q > ' &a8"8Q /" Nr,   c                 \   SnU R                   nU R                  R                  nUR                  5       n0 n0 nSnU H?  nUR	                  U5      n	U[        U	5      :  a  [        U	5      nXU'   [        U	5      Xh'   MA     / n
U Hs  nXh   U:  aU  [        U5       Vs/ s H  nSPM     nn[        Xh   5       H  nXvU   -
  U-   nXX   U   X'   M     U
R                  U5        M`  U
R                  XX   5        Mu     UR                  5       nU H6  nUR                  U5      n	[        U	5      U:X  d   eU
R                  U	5        M8     [        U
5      nUc   S5       eU H  nXh   U:  aa  [        Xh   5       Vs/ s H  nSPM     nn[        Xh   5       H  nXvU   -
  U-   nUU   X'   M     XU   :w  a  UR                  X5        SnMj  Ml  UXX   :w  d  Mv  UR                  UU5        SnM     U H0  nUR                  U5      n	UU	:w  d  M  UR                  UU5        SnM2     U$ s  snf s  snf )NFr4   r  T)r   r  r   r  r  r.   rF   rN   r  r  re   r  r  )r   rm   r   r  r  input_dims_mapping_dictinput_dims_mapping_lensmax_dims_mapping_lenr  rI   ra   _new_dims_mappingrK   new_idxr  compatible_dims_mappings                    r*   4update_op_dims_mapping_by_elementwise_like_dist_implr    s   G$$L$$G--/O  ##::8D#l"33#&|#4 ,8),/,=) $ #",/CC,12F,GH,Gq,GH2<=(8+LL -D,Ma,P )	 >
 $$%56$$%<%FG $ //1$#;;HE< $8888  . %
 >>OP". -. $",/CC!"9"CD DqD    2<=(8+LL '>g&F #	 >
  8#DD33HO E '*A*KK335 % $( %#;;HE"l2001 G % Na  I, s   H$8H)c                    SSK Jn  U R                  n[        R                  " U5      n/ nUc  [
        R                  R                  5       O[        UR                  S5      5      n[        U5       HM  n[        R                  " U5      n	U" 5       U	l        UR                  X5      u  n
n
nnn
UR                  U5        MO     U$ )z2Get all distributed main programs by dist_context.r   )DistributedOperatorContextGPU)ry   r  clustercopydeepcopyr   r  r  r.   get_all_devicesrF   _dist_op_context_get_dist_programrN   )serial_program_infory   parallelizerr  r  copied_parallelizerall_dist_main_programranksr  used_dist_contextr  dist_startup_programdist_main_programs                r*    get_all_distributed_main_programr  -  s     9!))G--5 ? 	))+((/0 

 < MM,7-G-I*  11'M	
 $$%67   ! r,   c                   p    \ rS rSr S
S jr\S 5       r\S 5       r\S 5       r\S 5       r	\S 5       r
S	rg)SerialProgramInfoiJ  Nc                 @    Xl         X l        X0l        X@l        XPl        g r>   )_train_program_startup_program_loss
_optimizer_cluster)selftrain_programstartup_programloss	optimizerr  s         r*   __init__SerialProgramInfo.__init__K  s     , /
#r,   c                     U R                   $ r>   )r  r  s    r*   r  SerialProgramInfo.train_programT  s    """r,   c                     U R                   $ r>   )r  r  s    r*   r  !SerialProgramInfo.startup_programX  s    $$$r,   c                     U R                   $ r>   )r  r  s    r*   r  SerialProgramInfo.loss\  s    zzr,   c                     U R                   $ r>   )r  r  s    r*   r  SerialProgramInfo.optimizer`  s    r,   c                     U R                   $ r>   )r  r  s    r*   r  SerialProgramInfo.clusterd  s    }}r,   )r  r  r  r  r  r>   )__name__
__module____qualname____firstlineno__r  propertyr  r  r  r  r  __static_attributes__r5   r,   r*   r  r  J  sn    GK  # # % %      r,   r  c                    S nSS K Jn  UR                  5       nUR                  5         SnSSSSSS	S
S.n/ n/ SQnU  GH  n0 n	UR	                  5       R
                  n
UR	                  5       R                   GH  nSnUR                  U;   a  XUR                  R                  5       '   M4  UR                  (       a$  [        XR                  S      R                  5      OSn[        UR                  S5      5      [        [        R                   5      :X  a{  SUR                  ;   aj  UR                  S S nXR#                  5       ;   a  X^   nUR%                  USUS9nU(       a
  U" XU
5      nOUR%                  XS9nU(       a  SU" XU
5      -  nO[        UR                  S5      5      [        [        R&                  5      :X  aY  UR                  UR#                  5       ;   a  X[R                     OUR                  nUR%                  U5      nU(       a	  U" XU
5      nXUR                  R                  5       '   GM     UR)                  U	5        GM     U$ )Nc                 (   Sn [        U S   5      nU S   nSnSnUR                  S5      nSnU GH5  n	SU	;   a  SOSnX;   d  M  U	S U	R                  U5      S-
   n
U	R                  S5      nU	R                  S	5      nUS:  a  US:  a  X:  d   S
5       eU	US-   U R                  S5      nU Vs/ s H  n[        UR	                  5       5      PM     nnSnU[        S US5      -  nUR                  S:X  a
  U
S:X  a  SOSn
UR                   HR  nUR                  5       U
:X  d  M  UR                  U5       H"  nUU   nU[        S UR                  5      -  nM$       GM3     GM8     US:  a  US:  d   S5       eXe-  U-  nU$ !   Us $ = fs  snf )Nr   op_timeconfig
z
(Variable)z(list<Variable>r   []zGet shape failed.,c                 
    X-  $ r>   r5   r}  s     r*   r  Dget_standalone_cost_data.<locals>._compute_runtime.<locals>.<lambda>  s    qur,   c_embeddingweightwidsc                 
    X-  $ r>   r5   r}  s     r*   r  r    s    QUr,   zGet input size failed.)floatrw  findr@   rt   r   r   input_nameslowerinputrG   )op_costr   r|   runtime	op_configtotal_static_input_sizetotal_actual_input_sizeparsed_infovariabler  arg_name_lowershape_left_boundaryshape_right_boundaryrG   r~  dtype_factorr  r0  r   actual_runtimes                       r*   _compute_runtime2get_standalone_cost_data.<locals>._compute_runtimej  s   	GI./G H%	"#"#ood+D , 4:K  !%&?		((;a(?!@&*iin#'+yy~$'!+,q0,B' '	'C '!+.B%*  277AQWWY7 '62DeQ+OO'77m+-9u # !#H~~'>9(*(:H"&x.C3v 2CII8 3 );
  !//  > '*/F/J 	
$	
J
 $=G 	 [	N, 8s   F 1#FFr   r   	embeddingmatmul	transposereshape	unsqueezer   divide)r  	matmul_v2
transpose2reshape2
unsqueeze2r   r   )create_py_readercreate_double_buffer_readerreadassignfloat32r  _gradF)forwarddtype)rE  )paddle.cost_model
cost_model	CostModelstatic_cost_datar)  r|   r   r   r   idr  rX   rE  r@   r   r  r  r   get_static_op_timer  rN   )distributed_programsr1  cmrG  DEFAULT_MULTIPLEOP_NAME_MAPPINGstandalone_cost_datanot_enum_opsdistributed_program	cost_datar|   r   r&  rE  forward_op_namer%  op_names                    r*   get_standalone_cost_datarV  i  s9   1f #J!"!!#O L  4	"//166%22488BGww,&*1"''**,' %% D++A./556 
 2779%&#foo*>>bgg%&(ggcrlO&*>*>*@@*9*J(;;'e < G "27"E","?"?+ #@ # #&'*:7*M&MGRWWY'(C,?? ww/"6"6"88 $GG, 
 %77@.wDAG&-bggjjl#I 9L 	##I.S  4V  r,   c                     UR                  5       nUR                  5       nX2R                  ;   a  U R                  U5        g XBR                  ;   a  U R                  U5        g [	        S5      e)Nz6Cannot find the original id in the distributed context)rJ  original_id_dist_ops_for_programset_original_idAssertionError)dist_op_descr  ry   op_idop_original_ids        r*   set_dist_op_desc_original_idr_    sg    JJLE((*N222$$U+	==	=$$^4 D
 	
r,   c                 ^    U c  U $ [        U [        [        45      (       a  [        U 5      $ U /$ r>   )r?   r/   tuple)r   s    r*   to_listrb    s.    }%$''E{7Nr,   c                    [         R                  R                  X S[        R                  R                  5        35      n[        US5       nUR                  [        U 5      5        S S S 5        g ! , (       d  f       g = f)Nz	_program.r  )	r   r   r  r   r  r  openwriterX   )r   r   r&   filenamefs        r*   debug_programrh    s\    ww||i 2 2 ; ; =>?H 
h		G 
		s   A11
A?c                 T    SSK Jn  U" 5        H  nUR                  U :X  d  M  Us  $    g )Nr   )get_all_process_groups)r   rj  rJ  )r  rj  gs      r*   ring_id_to_process_grouprl    s'    5#%447?H & r,   c                     SS/nU R                    H2  nUR                   H  nU H  nXCR                  ;   d  M        g   M!     M4     g)N
_grad_gradtriple_gradTF)r{   r   r   )r   higher_order_op_suffixr   r   suffixs        r*   find_higher_order_backward_oprr    sE    *M:))B0WW$ 1    r,   c                     [        U [        5      (       d   eSU R                  ;  d   e[        S U R                  S5      $ )zA
input:
    - var: variable
return:
    number of element in var
r4   c                 
    X-  $ r>   r5   r}  s     r*   r  get_var_numel.<locals>.<lambda>!  s    qur,   r   )r?   r
   rG   r   )r   s    r*   get_var_numelrv    s=     c8$$$$SYY$cii33r,   c                 l   [        U [        R                  R                  5      (       a  U R	                  5       $ [        U [        R
                  R                  5      (       a;  [        U R                  [        5      (       a  U R                  $ U R                  5       $ [        S[        U 5       S35      e)Nzg'optimizer' must be object of class `paddle.optimizer.Optimizer` or `paddle.static.Optimizer`, but got r   )
r?   r   r  	Optimizerget_lrr   _learning_rater   r   r   r  s    r*   ry  ry  $  s    )V--7788!!	Iv}}66	7	7i..66+++++--66:9o5FaI
 	
r,   c                    SS K nSSKJn  / nSnU" 5       nUR                  R	                  S5      u  px[        U5      U-   n	S n
SnUR                  UR                  UR                  5      n
U
R                  Xy45        U
R                  S5        0 nU  GH<  nXR                  ;  a  M  [        UR                  5      S:X  Ga  UR                  R                  U5      nUS:X  a  S	OS
nU(       Ga  UR                  S   nUR                  U   R	                  S5      u  nn[        U5      U-   nUR                  UR                  UR                  5      nUR                  UU45        UR                  [!        U5      R#                  S5      5        UR%                  U5      R'                  S5      n[        U5      nUU:w  a  [)        SU SU S35      e[+        SUR                   S35        UR-                  5         OUR                  S   n UU;  aR  U
R/                  5       u  nn[        UR%                  U5      R'                  5       5      nUUU'   UR1                  U5        OYUU   R                  [!        U5      R#                  S5      5        UU   R-                  5         [+        SUR                   S35        OM  UR3                  5         GM?     U
R-                  5         g )Nr      )_get_global_envi  :i   
   r   TFr   zutf-8z0Please check comm pair, the recv rank should be z	 but got r   zIt is able to instantiate z as sender now.z as receiver now.)socket
collectiver~  current_endpointrw  r@   AF_INETSOCK_STREAMbindlistenr  r.   r0   trainer_endpointsconnectsendrX   encoderecvdecoder   r   closeacceptrN   instantiate)all_process_groupscur_rankr  r~  has_recv_by_socket	magic_numgenvcur_rank_ipcur_rank_portcur_rank_recv_portserver_socket	buff_sizeclient_socketsr   r0   is_send	recv_rankrecv_rank_iprecv_rank_portconnect_portclient_socketr   	send_rank	recv_addrs                           r*   initialize_pg_in_full_moder  3  s   -ID!%!6!6!<!<S!AK]+i7MIMM&..&2D2DEM89N+...}""#q(!''--h7E#qjdeG)//2	/3/E/E0%* -n  #>2Y> &NNF$6$6! %%|\&BC""3x=#7#7#@A$)))4;;GD4y9$$J9+U^_c^ddef  4]5H5H4IY ##%)//2	 (::3@3G3G3I0y"=#5#5i#@#G#G#IJ/<t,*11$7&y166M009 'y177989L9L8MM^_   	!!#[ ,\ r,   c                     U R                  S5      =(       a/    SU R                  S5      ;   =(       a    SU R                  S5      ;  $ )Nr  z/auto_parallel/rc
exclude_rcr*  r   r  s    r*   is_recompute_opr  w  s?    
N# 	8277>#::	8 77r,   c                 Z    U R                  S5      =(       a    SU R                  S5      ;   $ )Nr  r  r  r  s    r*   is_recompute_exclude_opr    s-    ;;~& <277< , r,   c           	      :   SSK Jn  U(       d  g UR                  nUR                  (       d  g / n[	        U [
        R                  R                  5      (       a  [        U S5      (       ak  U R                  R                  S;   aQ  [        U R                  S5      (       a6  U R                  R                  n[        U5      S:  a  UR                  5         OUR                  nOUR                  nU(       d  g UR                  5       nU" XwR                   5      nUR#                  5         UR%                  U5      n	/ n
SnSnUS-   [        U	5      :  a  US:X  ae  XS-      nXR&                  ;  a  US-  nM5  UR&                  U   S	   nU(       a.  [)        U5      S
:  a  U
R+                  S
[)        U5      S-   /5        OoUR-                  X   /XS-      /5      u  nnnU(       a)  UR/                  UU5      nU
R+                  UUS-   /5        O [0        R2                  " SU SUS-    S35        US-  nUS-   [        U	5      :  a  M  [5        U
5       HJ  u  nn[7        US
   US   5       H.  nUR                   U   R9                  SS[;        U5      -   5        M0     ML     g )Nr}  )RecomputeStategpt)GPTForPretrainingGPTForPretrainingAutocheckpointsr   r4   r   var_as_output_opsr   zCould not recompute op range [z] - [z] r  z/auto_parallel/rc_)passes.auto_parallel_recomputer  	recomputeenabler?   r   nnLayerhasattr	__class__r  r  r  r.   r9  r)  r   build_statssort_checkpointsvar_op_depsmaxrN   is_subgraph_update_segment_startr   debugr"  rF   	_set_attrrX   )r   lossesstrategyr   r  r  ckptsr   rc_stater  segments	start_idxpre_segment_end_idx	ckpt_nameop_idx_listflagmin_idxmax_idxrK   segmentjs                        r*   set_recompute_segmentsr    sk   @""I
 E%))E5!!((
 		=11II))E5zA~		))E%%  "EeYY/H++E2KHI
a-#k*
*?#M2I 4 44Q	"..y9:MNKs;/!3C$4q$8 9:%-%9%9'(;1}+E*F&"D'7 "880 'A+ 674WIU7Q;-rR 	Q	- a-#k*
*0  )
7wqz71:.AIIaL"" 4s1v = / *r,   c                 J   UR                  U5      nUR                  nUR                  nXR                  ;  a  [	        X$U 5      nOU nUS   nUS:  aQ  UR
                  U   S:  a>  [        UR                  UR
                  UU5      n[        U5      UR                  U5      4$ g)Nr   r4   r   )r   r   )	r   rJ   rI   r   r   rG   r   r.   r0   )	r  r   ry   r   rJ   rI   r  batch_size_axisgroup_rankss	            r*   get_input_split_infor    s    #DDSI#00L#00L///),hO"1oO 2 2? Ca G%$$	
 ;!2!27!;;;r,   c                     U bc  S U l         S U l        U R                  (       aD  [        U R                  [        R
                  R                  5      (       a  SU R                  l        U $ )NT)_parameter_list_param_groups
_grad_clipr?   r   r  ClipGradByGlobalNorm_async_add_nr{  s    r*   validate_optr    sX    $(	!"&	J  &))"@"@%
 %
 15I  -r,   c                     SSK JnJn  SSKJn  U" 5       R
                  nU" US/5      n[        U5      S:  a  SOS /[        [        U R                  5      S-
  5       Vs/ s H  nS PM     sn-   nU" XU5      $ s  snf )Nr   )r   shard_tensorr   )get_world_process_groupdp)		interfacer   r  r   r  r  r.   rF   rG   )r~  r   r  r  world_ranksrJ   r  rP   s           r*   set_data_parallelr    s}    56)+11K{TF3Lk*Q.$D9CL1,-=--= J 44	=s    A:c                    U R                   (       d  gU R                  R                  5       R                   Vs/ s H  nUR                  PM     nn[        U5      [        [        5      -  (       d  U R                   (       a  ggs  snf rk   )data_parallel_original_serial_main_programr)  r   r   r6  __not_naive_data_parallel_op__)ry   r   ops_types      r*   is_naive_data_parallelr     sq    %% <<IIKOOOB 	O  
 MC >??

$
$s   Bc                 D   UR                   nUbj  [        R                  " UR                  UR                  [        [        UR                  5      5       Vs/ s H  nS[        U5      -   PM     sn5      U l         UR                  U l        UR                  U l	        g s  snf NrB   )
rJ   r   r   rG   r   rF   r.   rX   rI   	annotated)cpp_dist_attrpy_dist_attrpy_process_meshrK   s       r*   _copy_tensor_dist_attr_to_cppr    s    "//O"%)%5%5!!''#(_-B-B)C#DE#DaS3q6\#DE&
"
 ".!:!:M*44M Fs   Bc                     SSK Jn  U R                   nUb   U" UR                  UR                  S9Ul         U R                  Ul        U R
                  Ul        g Nr   )r   )rG   r   )rJ   r   rG   r   rI   r  )r  r  r   cpp_process_meshs       r*   _copy_tensor_dist_attr_from_cppr    sT    *$11#$/"(((44%
! !. : :L*44Lr,   c                    UR                   nUbj  [        R                  " UR                  UR                  [        [        UR                  5      5       Vs/ s H  nS[        U5      -   PM     sn5      U l         UR                  U l        UR                  U l	        UR                  U l
        UR                  U l        UR                  R                  5        H!  u  pEU R                  U5      n[        Xe5        M#     UR                   R                  5        H!  u  pEU R#                  U5      n[        Xe5        M#     g s  snf r  )rJ   r   r   rG   r   rF   r.   rX   	impl_typeimpl_idxr  r  inputs_dist_attrsr   get_input_dist_attrr  outputs_dist_attrsget_output_dist_attr)r  r  r  rK   r&   py_tensor_dist_attrcpp_tensor_dist_attrs          r*   _copy_op_dist_attr_to_cppr  )  s   "//O"%)%5%5!!''#(_-B-B)C#DE#DaS3q6\#DE&
"
 +44M)22M!-!:!:M*44M%1%C%C%I%I%K!,@@F%&:P &L &2%D%D%J%J%L!,AA$G%&:P &M Fs   D=c                    SSK Jn  U R                   nUb   U" UR                  UR                  S9Ul         U R                  Ul        U R
                  Ul        U R                  Ul        U R                  Ul        U R                  R                  5        H!  u  pEUR                  U5      n[        XV5        M#     U R                  R                  5        H!  u  pEUR                  U5      n[        XV5        M#     g r  )rJ   r   rG   r   r  r  r  r  r  r   r  r  r  r  )r  r  r   r  r&   r  r  s          r*   _copy_op_dist_attr_from_cppr  =  s    *$11#$/"(((44%
! +44L)22L - : :L*44L&3&E&E&K&K&M"*>>tD' 	
 'N
 '4&F&F&L&L&N"*??E' 	
 'Or,   c                 0   U R                   R                  5        H-  n[        UR                  R                  UR                  5        M/     U R
                  R                  5        H-  n[        UR                  R                  UR                  5        M/     g r>   )_dist_tensors_for_programr}   r  r  r   rY  r  r  ry   r   r   s      r*   _copy_dist_attr_to_cppr  V  su    #==DDF%%%//1F1F	
 G
  55<<>!''):):	
 ?r,   c                 0   U R                   R                  5        H-  n[        UR                  R                  UR                  5        M/     U R
                  R                  5        H-  n[        UR                  R                  UR                  5        M/     g r>   )r   r}   r  r  r   rY  r  r  r  s      r*   _copy_dist_attr_from_cppr  b  su    #==DDF'%%//1F1F	
 G
  55<<>#''):):	
 ?r,   c                    U R                    H  nUR                  5       (       aG  UR                  5       b6  U R                  U5      nUR                  5       R                  n[        X25        UR                  5       (       d  Mv  UR                  5       c  M  U R                  U5      nUR                  5       R                  n[        X25        M     g r>   )
serial_ordered_nodesis_varr   get_tensor_dist_attr_for_graphr   r  is_opr   get_op_dist_attr_for_graphr  ry   noder  r  s       r*    _copy_dist_attr_to_cpp_for_graphr  n  s    11;;==TXXZ3'FFtLL HHJ00M)-F::<<DGGI1'BB4HL GGI//M%mB 2r,   c                    U R                    H  nUR                  5       (       aG  UR                  5       b6  U R                  U5      nUR                  5       R                  n[        X25        UR                  5       (       d  Mv  UR                  5       c  M  U R                  U5      nUR                  5       R                  n[        X25        M     g r>   )
r  r  r   r  r   r  r	  r   r
  r  r  s       r*   "_copy_dist_attr_from_cpp_for_graphr  z  s    11;;==TXXZ3'FFtLL HHJ00M+MH::<<DGGI1'BB4HL GGI//M'D 2r,   c                 d   [        5       (       a  g[        UR                  5      S:  d   SU S35       e[        UR                  5      S:  d   SU S35       eUR	                  U5      R
                  nUR	                  U5      R
                  n	X:X  d   SU SU	 S35       eS n
U
" UR                   Vs/ s H  oR                  U5      PM     sn5      nU
" UR                   Vs/ s H  oR                  U5      PM     sn5      n[        U UUUU[        R                  UUUUS	S
9$ s  snf s  snf )z8
dependency: prior_op should be run before posterior_op
Nr   z9first op of dependency should at least have one output. [r  z9second op of dependency should at least have one input. [z5two ops of dependency should have same mesh but got [z] and [c                     U  Vs/ s H  oR                   (       a  M  UPM     n n[        U 5      S:  d   eU  Vs/ s H  o[        U5      4PM     nnUR                  S S9  US   S   $ s  snf s  snf )Nr   c                     U S   $ )Nr   r5   r~  s    r*   r  Rinsert_dependencies_for_two_ops.<locals>._select_best_depend_var.<locals>.<lambda>  s    AaDr,   keyr4   )r	   r.   rv  sort)r|   r   vars_with_numelss      r*   _select_best_depend_var@insert_dependencies_for_two_ops.<locals>._select_best_depend_var  sr    #<t+;+;t<4y1}}AEF#-"45F.1#A&& =Fs   A*A*A/F)rJ   r  syncr  use_nop)
is_sequential_runr.   r  r  r   rJ   r   insert_dependencies_for_varsr  r  )r   r   prior_opposterior_opry   r  r  r  prior_op_meshposterior_meshr  r&   	first_var
second_vars                 r*   insert_dependencies_for_two_opsr%    sl    x(()Q. 
CH:QO. |++,1 
CL>QRS1 !==l  ">>l  * 
?gVdUeefg*' (%-%>%>?%>T4%>?I )%1%A%AB%AT4%ABJ ("!!  	@ 	Cs   5D(&D-c                    U(       a  [        5       (       a  g[        U[        5      (       a  U/n[        U[        5      (       a  U/nU H%  nU R                  UR                  5      (       a  M%   e   U H%  nU R                  UR                  5      (       a  M%   e   UR                  US   5      nUc  UR                  nUc   eSn
U
(       a  U R                  USSU0SU0S9nOU R                  USUUS	.SU0S9nUR                  [        U5        U(       d  US
/:w  Ga  [        5       nSUl        SUl        UUl        UUl        UR                  Ul        UR                  R!                  5        HA  nU R#                  U5      nUR                  U5      R$                  nUR'                  UU5        MC     UR                  R)                  5        HA  nU R#                  U5      nUR                  U5      R$                  nUR+                  UU5        MC     UR-                  UU5        U	b  UR                  SSU	 35        U(       a  U R/                  5         U$ )z[
dependency: op that generates prior_vars should be run before op that generates post_vars
Nr   TnopXOut)r   inputsoutputsdepend)r(  Depr4   defaultr  /)r  r?   r
   has_varr&   r   rJ   _insert_op_without_syncr  r  r   r  r  r  r  r   r  r   rI   r  r  r  r  _sync_with_cpp)r   r   
prior_vars	post_varsry   oprolerJ   r  r  r  r  skip_insert_when_sequential_run	prior_varpost_varpost_dist_attr	depend_opdepend_op_dist_attrr  r   r7   r  s                        r*   r  r    sj   $ '+<+>+>*h'' \
)X&&K		}}Y^^,,,,  }}X]]++++  "BB9Q<PN%22###G11Z I& 2 
	 11! I& 2 
	 V, |t+.0'($(1%+7(+7('5'>'>$&^^;;=M))M*C"CCl   66}gN > (nn==?N))N+C"CCl   77P @ 	11*	
 Na~,>?r,   c                 &    SU R                   ;   a  gg)Nc_TF)r   r  s    r*   is_dep_skip_opr>   	  s    rwwr,   c                    ^  U 4S jnU$ )Nc                     > [         R                  R                  5       (       a  T" U 0 UD6$ [         R                  R                  R                  5          T" U 0 UD6sS S S 5        $ ! , (       d  f       g = fr>   )r   	frameworkin_dynamic_moder   dygraphguard)argsr  funcs     r*   __impl__!_dygraph_guard_.<locals>.__impl__(	  sW    ++--((($$**,T,V, -,,s   A''
A5r5   )rF  rG  s   ` r*   _dygraph_guard_rI  '	  s    - Or,   c                  F    [        [        R                  " S5      S   5      $ )N!FLAGS_new_executor_sequential_run)r   r   	get_flagsr5   r,   r*   r  r  5	  s&    <=/	
 r,   c                     [        U R                  5      S:  a  S/ 4$ [        U R                  5      n[        U5      U4$ )Nr   r   )r.   r   get_sub_process_mesh)ry   sub_process_meshess     r*   get_pp_degreerP  =	  sA    
<&&'!+"u-l.I.IJ!"$666r,   c                 @   U R                  5       R                  n/ n[        U5       H\  u  p4SUR                  5       ;   d  M  UR                  (       d  M.  UR                  R
                  nXR;  d  MK  UR                  U5        M^     [        U5      n[        US S9nU$ )Npd_opc                      U R                   S   $ r   )r   r  s    r*   r  1get_sub_process_mesh_by_program.<locals>.<lambda>Q	  s    !--*:r,   r  )	r)  r   r"  r&   r   rJ   rN   rN  r   )dist_programall_opsr   r   r   rJ   rO  s          r*   get_sub_process_mesh_by_programrW  E	  s    '')--GNW%bggiBLLL<<44L1%%l3	 & .n= : r,   c                    [        5       n[        R                  " U 5      nU H  nU[        UR                  5      -  nM     / nSn[	        U5       H`  u  pc[        [        UR                  5      5      [        U5      :X  a  UR                  U5        MC  [        UR                  5      U:  d  M^  SnMb     U(       a#  [        U5       H  nUR                  U5        M     U$ rk   )	r6  r  r  r   r"  r.   rN   r   r9  )r   r   rO  pmglobal_pm_idx
has_sub_pmr   s          r*   rN  rN  W	  s    %K~6 s2>>** ! MJ/0s2>>"#s;'77  % ;.J	 1 M*C""3' + r,   c                 n    S n[        U R                  5       H  u  p4XR                  ;   d  M  Un  U$    U$ r>   )r"  r   r   )ry   r   pp_idxr   rJ   s        r*   get_pp_stager^  m	  s@    F&|'B'BC+++FM	 D Mr,   c                 T    [         R                  R                  5       n[        X5      $ r>   )r   r  r  get_pp_stage_by_rank)	pp_degreer  s     r*   get_pp_stage_by_pp_degreerb  v	  s!    !!**,H44r,   c                     S nU R                    H,  n[        X15      nUb  XB:w  a    g XB:X  d   SU SU 35       eUnM.     U$ )Nz;Can't get pp_stage by process_mesh with different pp_stage z and )r   r`  )rJ   ra  pp_stage_for_process_meshr   pp_stages        r*   get_pp_stage_by_process_meshrf  {	  sg     $(('8$048 MhZW\]v\wx8 %-! ) %$r,   c                 R    [         R                  R                  5       nX!-  nX-  nU$ r>   )r   r  r  )r   ra  	word_sizepp_group_sizere  s        r*   r`  r`  	  s+    ""113I*M$HOr,   r"  r  r  c                    / n/ n0 nU R                   nU Ha  nU R                  R                  U5      n	UR                  R	                  U5      n
U
R
                  n[        X5      nUR                  U5        Mc     U Ha  nU R                  R                  U5      n	UR                  R	                  U5      n
U
R
                  n[        X5      nUR                  U5        Mc     U H   nUR                  R                  U5      Xm'   M"     XEU4$ )a  
Get data used in inferring distributed attributes, including:
  1. DistTensorSpec for each input and output tensor of this dist_op.
  2. Operator attributes of this dist_op, e.g. transpose_x in matmul op.

Args:
  dist_op: the DistributedOperator
  input_names: list, name of the dist_op's input tensors
  output_names: list, name of the dist_op's output tensors
  attr_names: list, attribute name of the dist_op's corresponding serial op

Returns:
  input_specs: list, DistTensorSpec for each input tensor of the dist_op
  output_specs: list, DistTensorSpec for each output tensor of the dist_op
  attrs: dict, attribute map of the dist op

Examples:
    .. code-block:: python

        >>> # doctest: +SKIP('Depends on other ops.')
        >>> from paddle.distributed.auto_parallel.static.utils import wrap_data_for_completion

        >>> op_desc = dist_op.serial_op.desc
        >>> input_name_list = []
        >>> output_name_list = []

        >>> input_name_list.append(op_desc.input('X')[0]) # 'X' is the arg name for op
        >>> input_name_list.append(op_desc.input('Y')[0])
        >>> output_name_list.append(op_desc.output('Out')[0])

        >>> attr_name_list = ['trans_x', 'trans_y']
        >>> input_specs, output_specs, attrs = wrap_data_for_completion(
        ...        dist_op,
        ...        input_name_list,
        ...        output_name_list,
        ...        attr_name_list)

)r  r   r  r   _var_recursiverG   r   rN   r  r   r   )r   r"  r  r  input_specsoutput_specsr+  r  r&   r   r   rY   	dist_spec	attr_names                 r*   wrap_data_for_completionrp  	  s    T KLE!!I ",,@@Foo,,T2yy"<B	9%  ",,AA$Goo,,T2yy"<B	I&   	$>>..y9   e++r,   c                     U R                   R                  R                  U5      R                  nU(       a  U R                  R                  U5      nOU R                  R                  U5      n[        X45      $ r>   )r  r   rk  rG   r   r  r  r   )r   r&   is_inputrY   r   s        r*   get_dist_tensor_specrs  	  s`    $$**99$?EEL",,@@F",,AA$G,99r,   c                     U R                   R                  n[        UR                  5       5      S:X  d   S5       eUS   nU$ )Nr   zinvalid grad_var_to_var)r  grad_var_to_varr.   r   )ry   grad_var_to_var_mapru  s      r*   get_grad_var_to_varrw  	  sG    &77GG"'')*a/J1JJ/)!,Or,   c                    SSK JnJn  Sn[        U R	                  5       R
                  5       GH  u  pgUR                  S5      (       GaM  UR                  S5      S:X  Ga7  / SQnUR                  R                  5       U;   Ga  UR                  R                  5       n	SU	;   d  SU	;   d  S	U	;   a]  SU	;   a  UR                  R                  S5      O;SU	;   a  UR                  R                  S5      OUR                  R                  S	5      n
UR                  R                  5       nS
U;   d  SU;   a<  S
U;   a  UR                  R                  S
5      OUR                  R                  S5      nW
S   UR                  5       ;   a  W H  nX*S      X-'   M     UR                  R                   (       d  GM  UR                  U5      (       d  GM  UR                  U5      [#        UR$                  5      -  (       d  GM  UR                  U5      [#        UR&                  5      -  (       d  GM  UnGM     US:w  a  U R	                  5       R
                  US-
     nUR                  R                  S
5      S   nU R	                  5       R
                  U   nUR                  R                  S
5      S   nUUR                  5       ;  a  XU'   g g g )Nr   )r  r  r4   r  r  )rw  r@  castc_concatconcatr  
all_gatherr(  Inputr~  r)  outr   )/paddle.distributed.fleet.meta_optimizers.commonr  r  r"  r)  r   r*  r   r   r   r"  r$  r  r  r   ampr  r@   r  r  )r   r  ru  r  r  first_backward_op_idxr   r   reshard_op_typesr"  r*  r  r+  r  scale_loss_opscale_loss_var_namefirst_backward_opscale_loss_grad_var_names                     r*   update_grad_var_to_varr  	  sn    W113778 KK'''+CC  ww||~!11 gg113;&+-k) +- c*  '+5 GGMM'2!#s!3   "ww335L(E\,A !L0 u-WW^^E2 
 !9 4 4 66")2A)2L/ #*
 LLK((%FOO(<<<%FKK(888$'!c 9h ",,.223H13LM+0077>qA#002667LM#4#9#9#@#@#G#J #?+?+?+AA8K45 B #r,   c                     U R                   nU H;  nUR                  S:X  a  Xl        UR                  5        H  n[        XA5        M     M=     g rT   )r   r  r{   set_all_ops_op_role)r   r  rV  r   rn  s        r*   r  r  0
  s>    iiG:: JI	3 % r,   c                    [         R                  n[         R                  nU(       a  U(       d
   SU 35       eU(       d
   SU 35       e[        U 5      S:X  d   S[        U 5       35       eX#-  nU" U S   USS9nU" U S   USS9nU" U S	   USS9n	/ n
[	        U5       H8  nXX-  US-   U-   -  n
U
R                  X   5        U
R                  X   5        M:     U" U
SS9$ U" U SS9$ )
a  fuse function for fusing weights

(1) fuse_attention_qkv
    q => [q1,q2,q3,q4]
    k => [k1,k2,k3,k4] or [k1,k2] for GQA
    v => [v1,v2,v3,v4] or [v1,v2] for GQA
    fused weight => [q1,k1,v1,q2,k2,v2,q3,k3,v3,q4,k4,v4]
            or for GQA [q1,q2,k1,v1,q3,q4,k2,v2]
(2) fuse_attention_ffn
    directly fuse weights to 1 parts
    [gate_weight], [up_weight] => [gate_weight, up_weight]

Args:
    fuse_params (_type_): to be fused weights
    is_qkv (bool, optional): for attention qkv weights. Defaults to False.
    num_heads (_type_, optional): query heads. Defaults to None.
    num_key_value_heads (_type_, optional): key and value heads. Defaults to None.

Returns:
    _type_: fused weights
3num_heads should be number of heads for Q, but got Mnum_key_value_heads should be number of key_value_heads for K and V, but got r}  zKfuse_params length is not equal 3, it should be Q K V list. but got length r   r4   r^  r   r   )r   r{  rw  r.   rF   rN   )fuse_paramsis_qkv	num_headsnum_key_value_heads	concat_fnsplit_fnnum_query_groupsq_listk_listv_list	qkv_pairsrK   s               r*   fuse_param_funcr  9
  s7   0 I||H 	
A)M	
y # 	
[\o[pq	
" ;1$ 	
YZ]^iZjYkl	
$ %;+a.)"=+a.*=BG+a.*=BG	*+A$A1A'A I VY'VY' , ,, 2..r,   c                    [         R                  n[         R                  nU(       a  U(       d
   SU 35       eU(       d
   SU 35       eX4-  n/ / / pnU" XSU-  -   SS9n[        U5       HY  nXXS-   -  US-   US-   -  S-
   -  nU	R	                  XS-   US-   -  S-
     5        U
R	                  XS-   US-   -  S-
     5        M[     U" USS9U" U	SS9U" U
SS94$ U" XSS9$ )a)  split function for splitting weights

(1) fuse_attention_qkv
    fused weight => [q1,k1,v1,q2,k2,v2,q3,k3,v3,q4,k4,v4]
            or for GQA [q1,q2,k1,v1,q3,q4,k2,v2]
    after split
    q => [q1,q2,q3,q4]
    k => [k1,k2,k3,k4] or [k1,k2] for GQA
    v => [v1,v2,v3,v4] or [v1,v2] for GQA
(2) fuse_attention_ffn
    directly split weight to 2 parts
    [gate_weight, up_weight] => [gate_weight], [up_weight]

Args:
    fused_param (_type_): len(fused_param)=1, only one weight to be split
    split_nums (int, optional): split_nums. Defaults to 2.
    is_qkv (bool, optional): for attention qkv weights. Defaults to False.
    num_heads (_type_, optional): query heads. Defaults to None.
    num_key_value_heads (_type_, optional): key and value heads. Defaults to None.

Returns:
    _type_: split weights
r  r  r   r4   r^  r   )r   r{  rw  rF   rN   )fused_param
split_numsr  r  r  r  r  r  r  r  r  split_headsrK   s                r*   split_param_funcr  q
  sF   < I||H 	
A)M	
y # 	
[\o[pq	
" %;!#RQ)<%<<2
 *+A)*a!e8H18L-M. F MM+1u1AA1E&F&JKLMM+1u1AA1E&F&JKL , f2&f2&f2&
 	
 b99r,   global_meshsub_mesh_dimc                 |   U R                   n[        U5      nX:  d  US:  a  U* U:  a  [        SU SU S35      eUS:  a  X-  n[        R                  " U R
                  5      R                  U5      n[        R                  " XBU   US9n/ nU H'  nUR                  [        XpR                  5      5        M)     U$ )Nr   z"The sub_mesh_dim should between (-z, r  r^  )rG   r.   r   r   r#  r   r6  rw  rN   r   rO   )r  r  r   	mesh_ndimr   split_process_idssub_mesh_listsub_process_idss           r*   
split_meshr  
  s    ""JJI ql]Y602i[J
 	
 a!((;223;;JGK-L M,)>)>?	
 -
 r,   c                    U R                  5       nUR                  5       R                  R                  5       S:w  a  g[        R
                  R                  U5        [        R                  R                  SU R                  U R                  S9nUR                  U R                  5       5        U R                  U5        g)al  
Update the subblock within a pylayer operation by modifying its output argument.

This function optimizes a pylayer operation by removing unnecessary outputs from the 'cf.yield' step.

Args:
    trivale_value (pir::Value): The output argument of the pylayer operation to be modified.

Example:
    (1) Original pylayer operation:
        (%1, %2) = "pd_op.pylayer" (%0) {
            () = "cf.tuple_pop" [id:1]
            (%3, %4) = "dist_op.xxx" [id:2]
            () = "cf.yield" [id:3] (%3, %4)
        }
    (2) After calling `update_pylayer_output(%4)`, the updated pylayer operation removes the unused output:
        (%1) = "pd_op.pylayer" (%0) {
            () = "cf.tuple_pop" [id:1]
            (%3) = "dist_op.xxx" [id:2]
            () = "cf.yield" [id:3] (%3)
        }

Args:
    trivale_value(pir::Value): The output argument of the pylayer op to be updated.
r   N_fake_pylayer_out)r&   rG   rE  )rk  get_parent_block	parent_opr&   r   r   set_insertion_pointr   datarG   rE  set_typer   replace_all_uses_with)trivial_value	define_op
fake_values      r*   update_pylayer_outputr  
  s    4 --/I!!#--224G
JJ""9-## !!!! $ J
 **,-''
3r,   )auto_parallelr>   )NFN)FFN)NFFNFT)T)FNN)r   FNN)r  r   r   r   r:  	functoolsr   numpyr   r   paddle.base.frameworkr   paddle.base.libpaddler   paddle.base.wrapped_decoratorr   paddle.frameworkr   paddle.framework.io_utilsr   r	   paddle.staticr
   rJ   r   r   dist_attributer   r   r   op_proto_and_checker_makerr  kOpRoleAttrNamer  VarDescVarTypeREADERSTEP_SCOPESDENSE_TENSOR_ARRAYFEED_MINIBATCH
FETCH_LIST__no_shape_var_type__r  _g_gradient_clip_opsrh  r+   r1   r8   r;   rL   rR   rV   rZ   r^   re   ri   rn   rr   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r   r   r   r   r  r7  r8  rO  rg  rX  rY  rW  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rV  r_  rb  rh  rl  rr  rv  ry  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r%  r  r>  rI  dygraph_guardr  rP  rW  rN  r^  rb  rf  r`  r/   rp  rs  rw  r  r  r  r  r@   r  r  r5   r,   r*   <module>r     s     	      - % " J " < L L		(	(	/	/--==? 	LLLL$$LL++LL''LL##  #.  	  	* ,# "-20. "F&0f!H*."Q&4
*. N 7
t)0Z <@6r'P A02jSl F0BJ,^$N/d-`%2I#C""
*?&?0M`BJ!: >v r
$	4
AHHV0
5 	5
5Q(
2	
	
	C	E$ 	=N 	$(Wt /7$,5
%C,C,.2C,@DC,L:BLJ4 DH5/t <:~K s 4$4r,   