
    x-j                       d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlZd dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d	d
lmZmZ ddlmZmZmZ ej        j        Zej                                         Z!ej"        j#        j$        ej"        j#        j%        ej"        j#        j&        ej"        j#        j'        ej"        j#        j(        gZ)dgZ*g dZ+g dZ,ddZ-d Z.d Z/d Z0d Z1d Z2d Z3d Z4d Z5d Z6d Z7d Z8d Z9d Z:ddZ;dd Z<d! Z=d" Z>d# Z?d$ Z@d% ZAd& ZBdd'ZCd( ZDd) ZEd* ZFd+ ZG	 	 	 dd-ZHd. ZI	 dd/ZJd0 ZKd1 ZLd2 ZMd3 ZNd4 ZOdd5ZPd6 ZQd7 ZRd8 ZSd9 ZTdd:ZUd; ZVd< ZWd= ZXd> ZYd? ZZd@ Z[dA Z\dB Z]dC Z^dD Z_dE Z`dF ZadG ZbdH ZcdI ZddJ ZedK ZfdL ZgdM ZhdN Zi G dO dP          ZjdQ ZkdR ZldS ZmdT ZndU ZodV ZpdW ZqdX ZrdY ZsdZ Ztd[ Zud\ Zvd] Zwd^ Zxd_ Zyd` Zzda Z{db Z|dc Z}dd Z~de Zdf Zdg Zdh Z	 	 	 ddiZ	 	 	 	 	 	 ddkZdl Zdm Z ee          Zdn Zdo Zdp Zdq Zdr Zds Zdt Zdu ZdvedwedxefdyZddzZd{ Zd| Zd} Z	 dd~Z	 	 	 	 ddZdedefdZd ZdS )    N)reduce)use_pir_api)pir)wrap_decorator)core)is_belong_to_optimizeris_parameter)Variable   )ProcessMeshmerge_process_meshes   )DistTensorSpecOperatorDistAttrTensorDistAttr	expand_v2)sumsqrtfill_constantelementwise_maxelementwise_divstack
reduce_sum)zbuiltin.combinezbuiltin.splitpd_op.pylayerzcf.yieldzcf.tuple_pushzcf.tuple_popzcf.stack_createzcf.has_elementsauto_parallelc                 B   t          j        |          }d|_        |j        sg|                    |            t          j                    }t          j        d          }|                    |           |                    |           n|                    |            |S )NFz>%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s)	logging	getLogger	propagatehandlerssetLevelStreamHandler	FormattersetFormatter
addHandler)	log_levelnameloggerlog_handler
log_formats        m/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/distributed/auto_parallel/static/utils.py
get_loggerr,   G   s    t$$FF? 	#	"""+--&L
 

 	  ,,,+&&&&	"""M    c                 X    |t          |            k    r|t          |           k     rdS dS )NTF)len)listindexs     r+   is_valid_list_indexr2   W   s/    T
us4yy00tur-   c                     | dk    rdS dS NTF mappings    r+   is_dim_shardr9   ^       "}}tur-   c                     | dk    rdS dS r4   r6   r7   s    r+   is_dim_replicater<   e   r:   r-   c                 \   | dS t          d | D                       sdS t          t          |                     D ]/}| |         dk     s| |         t          |j                  k    r dS 0t          t          |j                            D ]}|                     |          dk    r dS dS )NFc              3   @   K   | ]}t          |t                    V  d S N)
isinstanceint).0ds     r+   	<genexpr>z&verify_dims_mapping.<locals>.<genexpr>o   s,      88az!S!!888888r-   r5   r   T)allranger/   shapecount)dims_mappingprocess_meshis      r+   verify_dims_mappingrL   l   s    u88<88888 u3|$$%%  ?R<?c,:L6M6M#M#M55 $N3|)**++  a  1$$55 %4r-   c                    g }| D ]}||                     d           |j        |j                            |                   dk    r|                     d           Y|                     |j                            |                     |S )Nr5   r   )appendrG   	dim_namesr1   )
shard_specrJ   rI   shards       r+   convert_to_dims_mappingrR   z   s    L E E=#### 6 < <U C CDII#### 6 < <U C CDDDDr-   c                     g }| D ]>}|dk    r|                     d            |                     |j        |                    ?|S Nr5   )rN   rO   )rI   rJ   rP   dim_mappings       r+   convert_to_shard_specrV      s_    J# C C"d####l4[ABBBBr-   c                    t          |           t          |          k    rdS | D ]*}|t          |t                    s dS |||j        vr dS +t	          | |          }t          ||          sdS t          t          |                    D ]=}||         dk    r/||         dk    r#||         |j        ||                  z  dk    r dS >dS )NFr5   r   T)r/   r@   strrO   rR   rL   rF   rG   )rP   tensor_shaperJ   rQ   rI   rK   s         r+   verify_shard_specrZ      s    
:#l++++u  Zs%;%;55l.D!D!D55*:|DDL|\:: u3|$$%%  Or!!Q!##Q,"4\!_"EEJJ554r-   c                 Z    | sd S | d         }| D ]}|dk    r|}|dk    r||k    r d S |S )Nr   r5   r6   )dim_mappingscompatible_mappingr8   s      r+   compute_compatible_dim_mappingr^      sc     t%a  ##!(]]7**44r-   c                    | sd S t          | d                   }| D ]+}|
J d            t          |          |k    s
J d            ,g }t          |  D ]8}t          t          |                    }| d S |                    |           9|S )Nr   z8Dims mapping must not be None for compatible computationzKThe length of dims_mapping in list must be same for compatible computation.)r/   zipr^   r0   rN   )dims_mapping_listlengthrI   compatible_resultr\   compatible_dim_mappings         r+   compute_compatible_dims_mappingre      s     t"1%&&F) 
 
''F ('' <  F***Y +*** ./ 9 9!?"
 "
 ")44  !78888r-   c                 <    d }| s|S | D ]}||||k    r|} d S |S r?   r6   )process_mesh_listcompatible_process_meshrJ   s      r+   compute_compatible_process_meshri      sY    " '&&)  #'/*l::*6''tt $ #"r-   c                    t          |           t          |          k    sJ d}g }t          t          |                     D ]G}t          | |         ||                   sJ |                    | |         ||                             Ht	          |          }|dS t          t          |                     D ]-}|| |         ||                  k    r|| |         ||         <   d}.|S NFT)r/   rF   r2   rN   r^   )ra   
index_listchangedr\   rK   rd   s         r+   )compute_compatible_and_update_dim_mappingrn      s    !!S__4444GL3())** A A"#4Q#7AGGGGG-a0A?@@@@;LII%u3())**  !%6q%9*Q-%HHH2Ha A/GNr-   c                 .    | t          j                    z   S )zE
    Append auto parallel suffix for distributed attribute name.
    )r   kAutoParallelSuffixr'   s    r+   append_distributed_attr_suffixrr      s     $*,,,,r-   c                 N    |                      t          j                              S )zF
    Remove auto parallel suffix from distributed attribute name.
    )stripr   rp   rq   s    r+   remove_distributed_attr_suffixru      s     ::d.00111r-   c                    ddl m} |
 |            }|                                s
J d            | j        D ]}|j                                        D ]F}|                    |          }|                    |          }||                                s  dS G|j	        D ]F}|
                    |          }|                    |          }	|	|                                s  dS GdS )Nr   get_default_distributed_contextz8Distributed attributes must be initialized before check.FT)dist_contextrx   is_initialized_for_programblocksvarsvaluesget_dist_tensor_for_graph get_tensor_dist_attr_for_programis_validopsget_dist_op_for_graphget_op_dist_attr_for_program)
programry   rx   blocktensordist_tensortensor_dist_attropdist_opop_dist_attrs
             r+   "check_distributed_attr_for_programr      s4   ======66882244  B 4   j'')) 	 	F&@@HHK+LL    !,{7K7K7M7M,uuu) 	 	B"88@@G'DDRHHL(73C3C3E3E(uuu		
 4r-   c                 *   t          j                    }|                                 ddlm}m} | |            }t          | d           n1 |            } ||           t          | d            ||           |                                 dS )z
    This function reuses the original program output ability with a distributed context.
    Using lock can avoid multiple threads change the default distributed context simultaneously.
    r   )rx   set_default_distributed_contextNT)flush)	threadingLockacquirery   rx   r   printrelease)r   ry   lockrx   r   original_default_contexts         r+   print_program_with_dist_attrr     s    
 >DLLNNN       
 6688gT"""""#B#B#D#D ''555gT""""''(@AAALLNNNNNr-   c                 X   	 | v sJ d| d                                    |          }t          |          		fdt          |                   D             }t          |                   D ]}|||         |<   fd|D             } fd|D             }t          |          S )a  
    Given a rank and the processes mesh the rank belongs to,
    compute the communication peers of the rank based on the give axis in the mesh.

    Example: 16 processes managed in a 4-Dimensional mesh with shape of [2, 2, 2, 2].
    the rank communication peers of rank 0 (included) are following:
    in axis 0: [0, 1]
    in axis 1: [0, 2]
    in axis 2: [0, 4]
    in axis 3: [0, 8]
    zrank [z] is NOT in processes group c                 $    g | ]}d d          S r?   r6   )rB   rK   
coordinates     r+   
<listcomp>z#_get_comm_group.<locals>.<listcomp>=  s!    FFFaJqqqMFFFr-   c                 0    g | ]}t          |          S r6   )_coordinate2linear_idx)rB   r   rG   s     r+   r   z#_get_comm_group.<locals>.<listcomp>C  s3        	uj11  r-   c                      g | ]
}|         S r6   r6   )rB   idx	processess     r+   r   z#_get_comm_group.<locals>.<listcomp>G  s    HHHinHHHr-   )r1   _linear_idx2coordinaterF   sorted)
r   rG   axisrankrank_relativecoordinates_in_grouprK   ranks_in_group_relativeranks_in_groupr   s
   ``       @r+   _get_comm_groupr   )  s     9>>>9>>  OOD))M'}==JFFFF5t3E3EFFF 5; * *()Q%%   .   IHHH0GHHHN.!!!r-   c                 \    |                      |          }t          ||          }||         S )a  
    Given a rank and the processes mesh the rank belongs to,
    compute the index of the rank in given axis.

    Example: 27 processes managed in a 3-Dimensional mesh with shape of [3, 3, 3].
    the index of rank 22 are:
    in axis 0: 1
    in axis 1: 1
    in axis 2: 2
    )r1   r   )r   rG   r   r   r   r   s         r+   _get_idx_in_axisr   L  s/     OOD))M'}==Jdr-   c                    t          |           t          |          k    sJ d|  d|             t          t          |                     D ]C}||         dk    sJ d| d|             ||         | |         k     sJ d| d|  d|             D| d         }|d         }t          t          |           d	z
  dd          D ]}||||         z  z  }|| |         z  }|S )
a  
    convert a coordinate in multidimensional mesh space into a scala idx in linear space.

    it use Row-major order for dimension conversion.
    so it has:  [most_significant_dim, ..., least_significant_dim]
    assume:

        the size of i-th dimension to be:  S[i]
        the index of j-th dimension is: I[j]

    linear_idx of a n dimensional coordinate is:

        I[n-1] * (S[n-2] * S[n-3] * S[n-4] *     ....    S[0]) +
        I[n-2] * (         S[n-3] * S[n-4] *     ....    S[0]) +
        I[n-3] * (                  S[n-4] *     ....    S[0]) +
        ...
        I[1]   * (                                       S[0]) +
        I[0]

    zCcoordinate should have the same size as mesh shape, but got shape: z, coordinate: r   zindex in dimension [z"] is least than zero. coordinate: z"index beyond extent in dimension [z
]. shape: r5   r   )r/   rF   )
mesh_shaper   rK   base
linear_idxs        r+   r   r   _  s3   : z??c*oo---tjtthrtt .-- 3z??## 
 
!}!!!T1TT
TT "!! !}z!},,,dddjddXbdd -,,, b>DBJ 3z??Q&B//  dZ]**

1r-   c                 V   |dk    sJ d| d            |t          j        |           k     sJ d|  d|             d}dgt          |           z  }t          t	          t          |                               D ]-}||z  }t          || |         z            ||<   || |         z  }.|S )a	  
    mapping a linear scala into multidimensional mesh space, return it coordinate in that space.

    it is the inverse function of _coordinate2linear_idx.
    assume:

        the size of i-th dimension to be:  S[i]
        the index of j-th dimension is: I[j]

    the coordinate given linear_idx is:

        I[0] = linear_idx                                  % S[0]
        I[0] = (linear_idx / S[0])                         % S[1]
        I[0] = (linear_idx / (S[0] * S[1]))                % S[2]
        ....

    r   zlinear index [z] is least than zeroz5linear index beyond the extent of mesh shape. shape: z, linear index: r   r5   )npprodr/   reversedrF   rA   )r   r   r   r   rK   offsets         r+   r   r     s    & ???MZMMM???
+++++h
hh\fhh ,++ DJ'JeC
OO,,--  d"FZ]233
1
1 r-   c                    d }| j         D ]J}||j        v r?|j        |j        k    r/t          |j        |j                            |                    } nK| |j        t          |j        |                   S |j        d         S Nr   )process_meshesprocess_idsrG   r   r1   r   )ry   target_meshr   r   meshs        r+   _get_corresponding_rankr     s    
 J+  4###
k6G(G(G/
D,22488 J E &"4:z::
 	
 &q))r-   c                    | j         }|j        }|j        j         }t          |          t          |          k    sJ d| d| d            g }t	          t          |                    D ]`}||         dk    s||         dk    r|                    ||                    6|                    ||         |||                  z             a|S )Nzvariable shape [z] and dim_mapping [z] is NOT match !r5   )rG   rI   rJ   r/   rF   rN   )var	dist_attr	var_shaper8   r   	new_shaper   s          r+   _get_unshard_dist_shaper     s    	I$G!'Dy>>S\\)))R9RRRRR *)) IS^^$$ B BS>R73<2#5#5Ys^,,,,Ys^d73<.@@AAAAr-   c                 R   ddl m} |
 |            }|                                 D ]}|j        rv|                    |          }t          ||          }|j                            |           |j        }dgt          |          z  }||_        |
                    ||           d S )Nr   rw   r5   )ry   rx   	list_varsis_datar   r   desc	set_shaperI   r/    set_tensor_dist_attr_for_program)dist_main_progdist_startup_progry   rx   r   r   inverse_shaperU   s           r+   make_data_unshardr     s    ======6688'')) 
Q 
Q; 		Q+LL    4C9IJJMH}---*7K$[!1!11K,7)99#?OPPP
Q 
Qr-   c                 Z   dddd}| s|S t          | t                    s t          dt          |            d          |                                 D ]V\  }}|dvrt          d| d          t          |t                    s t          dt          |           d          |||<   W|S )z(Update default addition_info with inputsr   )epochbatch
batch_sizez7The type of 'addition_info' should be 'dict', but got ''.z[The key of 'addition_info' should be one of the ['epoch', 'batch', 'batch_size'], but got 'z7The value of 'addition_info' should be 'int', but got ')r@   dict	TypeErrortypeitems
ValueErrorrA   )addition_infoadd_infoitemvalues       r+   _update_addition_infor     s   Qa88H t,, 0]++0 0 0
 
 	

 )..00 	# 	#KD%;;; KBFK K K   eS))  0 $U0 0 0   #HTNNr-   c                 P   | s| S t          | t                    rn| D ]i}t          |t                    s t          dt	          |           d          t
          j                            |          st          d| d          j| S t          dt	          |            d          )z!Validity check of input file pathz0The type of file path should be 'str', but got 'r   zThe file path 'z' does not exist.z1The type of file path should be 'list', but got ')	r@   r0   rX   r   r   ospathexistsr   )	file_pathfiles     r+   _check_valid_pathr     s     
	It	$	$ 
 	L 	LDdC(( / $T

/ / /   7>>$'' L !J4!J!J!JKKKL,Y, , ,
 
 	
r-   c                    | st          d          t          | t                    s t          dt	          |            d          |                                 D ]y\  }}t          |t                    s t          dt	          |           d          t          |t          j        j	                  s t          dt	          |           d          z| S )Nz'param_dict' cannot be None.z4The type of 'param_dict' should be 'dict', but got 'r   z:The type of key of 'param_dict' should be 'str', but got 'zDThe type of value of 'param_dict' should be 'DenseTensor', but got ')
r   r@   r   r   r   r   rX   paddler   DenseTensor)
param_dictr'   r   s      r+   _check_param_dictr      s    7888
D)) -Z((- - -
 
 	

 &++-- 
	 
	KD%dC(( / $T

/ / /   eV[%<== 0 $U0 0 0  
 r-   c                    | s| S t          | t                    s t          dt          |            d          |                                 D ]\  }}t          |t
                    s t          dt          |           d          t          |t                    s t          dt          |           d          g d}t          |                                          |k    r%t          d|                                 d          | S )	Nz3The type of 'dist_attr' should be 'dict', but got 'r   z@The type of param name of 'dist_attr' should be 'str', but got 'z=The type of distributed attribute should be 'dict', but got ''process_shapeprocess_grouprI   rO   ziThe key of distributed attribute should be '['process_shape', 'process_group', 'dims_mapping']', but got .)	r@   r   r   r   r   rX   r0   keysr   )r   r'   r   attrs       r+   _check_dist_attrr   7  s`    	4(( ,Y, , ,
 
 	

 %??,, 	 	KD%dC(( / $T

/ / /   eT** / $U/ / /    D EJJLL!!T)) /$zz||/ / /   * r-   Fc                    ddl m} t          | t          j        j                  sJ t          |t                    sJ |
 |            }t          |          }|s$t          | ||           t          | ||           dS t          d          )a1  
    Save model parameter state, optimizer state, distributed attribute and
    additional information of each rank.

    Args:
        program(Program): The program to be saved.
        checkpoint_path(str): The path of the checkpoint file to be saved.
        dist_attr_path(str): The path of distributed attribute file to be saved.
        addition_info(dict, optional): Additional information, key should be selected in ['epoch', 'batch', 'batch_size'].
            Default values are 0, when 'addition_info' is None. Default: None.
        is_integrated(bool, optional): Whether to integrate param before save. Default: False.
        dist_context(DistributedContext ,optional): collect related distributed information for program

    Returns:
        None

    Examples:
        .. code-block:: python

            >>> import os
            >>> from paddle.distributed.auto_parallel.static.utils import save_distributed_checkpoint

            >>> step = 16000
            >>> global_batch_size = 32
            >>> path = os.path.join("./output", "step_%d" % step)
            >>> os.makedirs(path, exist_ok=True)
            >>> program = paddle.static.Program()

            >>> add_info = {'batch': step, "batch_size": global_batch_size}
            >>> save_distributed_checkpoint(program, path, path, add_info)

    r   rw   Nz/Integrating parameter has not been implemented.)ry   rx   r@   r   staticProgramboolr   _save_distributed_state_dict_save_distributed_attributeNotImplementedError)r   checkpoint_pathdist_attr_pathr   is_integratedry   rx   s          r+   save_distributed_checkpointr   Z  s    P >=====gv}455555mT*****6688)-88M 
$Wm_MMM#G^\JJJJJ "=
 
 	
r-   c                     t          |           s
J d            t          |          s
J d            t          |           }t          |          }|d         }|d         }|||fS )a  
    Load parameter, optimizer, distributed attribute and addition_info.

    Args:
        checkpoint_path(list[str]): model parameter file path, must be in order of rank id.
        dist_attr_path(list[str]): distributed attribute file path, must be in order of rank id.

    Returns:
        param_dict(dict): parameters' value of all ranks.
        dist_attr(dict): parameters' distributed attribute.
        addition_info(dict): additional information user saved in last training.

    Notes:
        The return, 'addition_info', is belonging to the first file of checkpoint_path by default.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('Depends on external files.')
            >>> from paddle.distributed.auto_parallel.static.utils import load_distributed_checkpoint

            >>> ckpt_path = [
            ...     './model_state_rank0.pdmodel',
            ...     './model_state_rank1.pdmodel',
            ... ]
            >>> dist_attr_path = [
            ...     './dist_attr_rank0.pdattr',
            ...     './dist_attr_rank1.pdattr',
            ... ]
            >>> param_dict, dist_attr, add_info = load_distributed_checkpoint(ckpt_path, dist_attr_path)
    !'checkpoint_path' cannot be None. 'dist_attr_path' cannot be None.modelr   )r   _load_distributed_state_dict_load_distributed_attribute)r   r   state_dict_infor   r   r   s         r+   load_distributed_checkpointr    s    @ _--  + - ^,,PP.PPP,2?CCO+N;;I )J#O4My-//r-   c                    ddl m} t          |t          j        j                  sJ t          |           s
J d            t          |          s
J d            |
 |            }t          |           }t          |          }t          ||          }|d         }|d         }	t          |||          }
t          |
|           |	S )a  
    Load parameter, optimizer, distributed attribute and addition_info into model.

    Args:
        checkpoint_path(list[str]): model parameter file path, must be in order of rank id.
        dist_attr_path(list[str]): distributed attribute file path, must be in order of rank id.
        program(Program): the program to be updated with checkpoint_path.
        dist_context(DistributedContext ,optional): collect related distributed information for program

    Returns:
        addition_info(dict): user saved in last train.

    Notes:
        The return, 'addition_info', is belonging to the first file of checkpoint_path by default.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('Depends on external files.')
            >>> from paddle.distributed.auto_parallel.static.utils import load_checkpoint_into_program

            >>> exe.run(startup_program)
            >>> ckpt_path = [
            ...     './model_state_rank0.pdmodel',
            ...     './model_state_rank1.pdmodel',
            ... ]
            >>> dist_attr_path = [
            ...     './dist_attr_rank0.pdattr',
            ...     './dist_attr_rank1.pdattr',
            ... ]
            >>> load_checkpoint_into_program(ckpt_path, dist_attr_path, main_program)
    r   rw   r   r   Nr   r   )ry   rx   r@   r   r   r   r   r   r   get_dist_attrmerge_and_slice_parameterload_parameter_into_program)r   r   r   ry   rx   all_state_dict_infoall_pre_dist_attrall_cur_dist_attrall_param_dictr   sliced_param_dicts              r+   load_checkpoint_into_programr    s    F >=====gv}455555_--  + - ^,,PP.PPP,66886GG3NCC%g|<<(1N'8M1)+<    17;;;r-   c                     t          | t                    sJ |rt          |t          j        j                  sJ | sdS |                    |            dS )z
    Load parameters into program.

    Args:
        param_dict(dict): parameters' name and value.
        program(Program): the program to be updated
    N)r@   r   r   r   r   set_state_dict)r   r   s     r+   r  r    sa     j$'''''Az'6=+@AAAAA :&&&&&r-   c                 B   t           j                                        }t          j                            |d| d          }t          | |          t           j                                        d}t          j        ||           t          j
        d| d           dS )z,Save distributed attribute of all parametersdist_attr_rankz.pdattr)r   
world_sizez(Already saved distributed attribute to 'r   N)r   distributedget_rankr   r   joinr  get_world_sizesaver   info)r   r   ry   rank_iddist_attr_namedist_attr_dicts         r+   r   r     s      ))++GW\\9999 N w55(7799 N K///LNNNNNOOOOOr-   c                     i }| D ]d}t          j        |          }|d         }|t          |           k    s
J d            |d                                         D ]\  }}||vr|||<   e|S )z:Load parameters' distributed attribute from dist_attr_pathr  zMThe number of 'dist_attr_path' must be equal to the last training world size.r   )r   loadr/   r   )r   total_dist_attrdist_attr_filer   pre_world_sizer'   r   s          r+   r   r     s    O( - -K//	"<0^!4!4444[ 544 $G,2244 	- 	-JD$?**(,%	- r-   c                 L   t           j                                        }t          j                            |d| d          }|                                 t           j                                        |d}t          j        ||           t          j
        d| d           dS )zSave parameters' state_dictmodel_state_rankz.pdmodel)r   r  r   zAlready saved model to 'r   N)r   r  r  r   r   r  
state_dictr  r  r   r  )r   r   r   r   ckpt_file_namer"  s         r+   r   r   '  s    &&((DW\\:D::: N ##%%(7799& J
 K
N+++L?O???@@@@@r-   c                    i }t          |           D ]\  }}t          j        |d          }|d         }|t          |           k    s
J d            |dk    r|d         }|d                                         D ]O\  }}||v r.||                             t          j        |                     7t          j        |          g||<   P||d}	|	S )	z0Load parameters' state_dict from checkpoint_pathT)return_numpyr  zNThe number of 'checkpoint_path' must be equal to the last training world size.r   r   r   )r   r   )	enumerater   r  r/   r   rN   r   array)
r   all_state_dictr   	ckpt_filer  r  r   r'   r   r  s
             r+   r   r   6  s   N#O44 9 9Y +idCCC(6_!5!5555\ 655 !88+O<M*7399;; 	9 	9KD%~%%t$++BHUOO<<<<(*'8t$$		9  &  r-   c                    i }t                      r|                                 j        }|D ]}|                                dk    sG|                                dk    r|                    d          r|                                d         r|j        }|                    d                                          }|                                dk    r|	                    d          n|	                    d          }|j
        }|j        |j        |j        |j        d||<   ndd	lm}	 t#          | t$          j        j                  sJ |
 |	            }|                                 D ]h}
t-          |
          st/          |
          rH|                    |
          }|j
        }|j        }|j
        j        }|j        |j        ||d||
j        <   i|S )
zs
    Get distributed attribute of current rank.

    Args:
        program(Program): main program for training
    zbuiltin.parameterz
pd_op.datapersistabler   parameter_namer'   r   r   rw   )r   global_blockr   r'   has_attrattrsr   resultas_tensor_dist_attrstr_attrrJ   rG   r   rI   rO   ry   rx   r@   r   r   r   r   r	   r   r   )r   ry   r   r   r   r   var_dist_attrvar_namerJ   rx   r   r   rI   rO   s                 r+   r  r  N  s    I}} )""$$( 	 	Bwwyy///		\))KK.. *HHJJ}- *  "| , 3 3A 6 6 J J L L wwyy$777 KK 0111V,, 
  -9%1%7%1%=$1$>!-!7	' '	(#	* 	BAAAAA'6=#899999::<<L$$&& 	 	CC   $:3$?$?  AA#FF !  0</<,9C	%1%7%1%=$0!*	' '	#(# r-   c                    t          |          s
J d            t          | t                    sJ dt          |            d            |                                 D ]w\  }}t          |t
                    s t          dt          |           d          t          |t                    rt          d |D                       st          d          x|i S g }g }t          j
        d           |                                D ]}||vr|                    |           ||         }||         }	||	k    rMt          j                                        }
|	d	                             |
          }| |         |         }|| |<   | |         }|d
         }|	d
         }t#          t%          |                    dk    sd|vrt'          ||          }|| |<   n|d         }|| |<   t#          t%          |                    dk    sd|vrt)          ||	          }|| |<   |D ]0}||vr*|                    |           |                     |           1|rt-          j        d| d           |rt-          j        d| d           | S )a  
    Merge parameters with previous dist_attr and slice parameters with current dist_attr

    Args:
        dist_param_dict(dict): parameters' value of all ranks.
        pre_dist_attr(dict): parameters' dist_attr of last training process.
        cur_dist_attr(dict): parameters' dist_attr of current training process.

    Returns:
        dist_param_dict(dict): parameters' value of current rank.
    z'pre_dist_attr' cannot be None.z8The type of 'dist_param_dict' should be 'dict', but got r   zXThe key of 'dist_param_dict' is parameter's name, and its type should be 'str', but got c              3   J   K   | ]}t          |t          j                  V  d S r?   )r@   r   ndarray)rB   vs     r+   rD   z,merge_and_slice_parameter.<locals>.<genexpr>  s?       2
 2
*+Jq"*%%2
 2
 2
 2
 2
 2
r-   zoThe value of 'dist_param_dict' is parameter's value of all ranks, and its type should be 'list(numpy.ndarray)'.Nz$Start to merge and slice parameters.r   rI   r   r5   r   zParameters 'z)' are not found in last training process.z,' are not found in current training process.)r   r@   r   r   r   rX   r   r0   rE   r   r  r   rN   r   r  r  r1   r/   set_merge_parameter_with_dist_attr_slice_parameter_with_dist_attrpopwarningswarn)dist_param_dictpre_dist_attrcur_dist_attrr'   r   param_not_in_preparam_not_in_curr4  pre_attrcur_attrr  r1   param	pre_parampre_dims_mappingcur_dims_mappingcomplete_paramsliced_params                     r+   r  r    sW    M**MM,MMM*ot,,  [4CXCX[[[ , ',,..  e$$$ 	G9=dG G G   %&& 	c 2
 2
/42
 2
 2
 /
 /
 	 @  	 	L7888!&&(( 5 5=((##H--- * *x(1133G_-33G<<E#H-e4E(-OH%#H-	#N3#N3s#$$%%))R7G-G-G<8 N )7OH%%&q\N(6OH%s#$$%%))R7G-G-G: L )5OH%! * *=((##H---))) 
V+VVV	
 	
 	
  
Y+YYY	
 	
 	
 r-   c                    ddl m} |d         }|d         }|d         }|                    | d         j        ||          }g }g }|D ]a}	|                    |	||||          }
|                    |	          }|
|vr-|                    |
           t          || |         |
|           bt          |          dk    s|r
J d            |d         d         }|S )z*Merge parameter with distributed attributer   	ResharderrI   r   r   r   zFail to merge parameter)	reshardrN  compute_complete_shaperG   compute_partition_indexr1   rN   _merge_parameterr/   )
param_listr   rN  rI   r   r   complete_shapepartition_param_listmerged_partitionprocesspartition_indexr1   rJ  s                r+   r:  r:    s/   """"""^,Lo.Mo.M551]L N    #;;^\=-
 
 ##G,,"222##O444$5!	   #$$))1E))! *)E *!,Q/Nr-   c                    t          | t          j        j                  rt	          j        |           n| } |d         }|d         }|d         }t          | j        |||          }t          | |t          |                    }t          j
                                        }t          || j        |||          }||         }	|	S )z*Slice parameter with distributed attributerI   r   r   )r@   r   r   r   r   r'  _get_split_indicesrG   _slice_parameterr/   r  r  _get_sliced_param_index)
rF  r   rI   r   r   partition_index_listsliced_param_listr  sliced_param_indexrK  s
             r+   r;  r;    s     &eV[-DEEP5 
 ^,Lo.Mo.M-\=-  )#S)=%>%>   ))++G0lM=  %%78Lr-   c                    ddl m} t          |           dk    rId}t          | d         d                   D ]'\  }}|d         dk    s|d         ||         k    rd} n(|rdS | s|                     ||f           dS d}|t          |           k     r|                    | |         d         |          \  }	}
}|	dk    rx|
dk    r%t          j        | |         d         |f|	          }n$t          j        || |         d         f|	          }|                     |           t          | |||           dS |dz  }|t          |           k     dS dS )	a  
    Merge partial parameters to a complete one.

    Returns:
        None

    Examples:
        .. code-block:: python

            >>> import numpy as np
            >>> from paddle.distributed.auto_parallel.static.utils import _merge_parameter

            >>> partition_param_list = [(np.array([[[1.11, 1.12]]]), [[0, 1],[0, 1],[0, 2]])]
            >>> param = np.array([[[1.13, 1.14]]])
            >>> partition_index = [[0, 1],[0, 1],[2, 4]]
            >>> complete_shape = [2, 2, 4]

            >>> _merge_parameter(partition_param_list, param, partition_index, complete_shape)
            >>> print(partition_param_list)
            [(array([[[1.11, 1.12, 1.13, 1.14]]]), [[0, 1],[0, 1],[0, 4]])]

    r   rM  Tr   FNr5   r   )
rO  rN  r/   r&  rN   compute_concat_infor   concatenater<  rR  )rU  rF  rX  rT  rN  is_complete_datar   r   rK   concat_axisfirst_ordernew_partition	new_params                r+   rR  rR    s   2 #"""""
  A%%"#7#:1#=>> 	 	ICAw!||tAw.*===#(   >  	F ##UO$<=====#*++++
 --$Q'*O 	 b  !## "-a03U;+! ! !II !# 4Q 7 :;+! ! !I %((+++ (!"	   FA5 #*++++++++r-   c                    ||                                  }|j        D ]}|                                D ]}t          | |           |                                t
          v rG|j        g }g }g }|                                D ]}|                                }|K|                    t          j
                               d }	|                                j        }
|
|
j        }	n|                    |           |j        }	|	|	|vr|                    |	           |                                D ]w}|                                }|'|                    t          j
                               ?|                    |           |j        |vr|                    |j                   xt          |          dk    rFt          |          dk    r	|d         }nt          |          }t          j        |||          |_        d S )N)r   r   r   )r-  r   r{   _complete_op_dist_attrr'   partition_skip_op_listr   operands_sourcerN   r   	Attributeget_defining_oprJ   resultsr/   r   create_op_dist_attribute)r   r   r   	sub_blockmeshesoperand_attrsresult_attrsoperandtmp_attr
value_meshtmp_op_dist_attrr0  r   s                r+   rj  rj  Y  s   }$$&&i ) ) 	= 	=I"7)<<<<<7799...<FML--// . .",,..#!((999!%J'.'>'>'@'@'J$'3%5%B
!((222!)!6J)j.F.FMM*---**,, = =!++--# ''8888 ''111,F::h&;<<<6{{Qv;;!##!!9DD/77D";!    K) )r-   c           	          g }t          | j                  |z
  }t          j        | ||         |          }|dk    r|S |D ])}|                    t          |||dz
                       *|S )a  
    Slice a complete parameter.

    Returns:
        sliced_param_list(list): sliced parameters with 'partition_index_list'

    Examples:
        .. code-block:: python

            >>> import numpy as np
            >>> from paddle.distributed.auto_parallel.static.utils import _slice_parameter

            >>> complete_param = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]])
            >>> rank = 2
            >>> complete_shape = [1, 1, 6]
            >>> dims_mapping = [-1, -1, 0]
            >>> process_shape = [3]
            >>> process_group = [0, 1, 2]

            >>> sliced_param_list = _slice_parameter(complete_param, [[], [], [2, 4]], 3)
            >>> print(sliced_param_list)
            [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])]

    ra  r   )r/   rG   r   splitextendr[  )rJ  r]  rb   r^  r   rK  rF  s          r+   r[  r[    s    2 ~#$$v-D8,T2  L {{ 
 
  U$8&1*EE	
 	
 	
 	
 r-   c                    ddl m} |                    | ||||          }d}t          |          D ]Y\  }}	||         dk    r|	}
n|	|||                  z  }
|
dk    r||         d         }n||         d         dz   |
z  }||	|
z  z  |z   }Z|S )a  
    Get sliced_param's index of current rank in all sliced parameters list.

    Returns:
        sliced_param_index(int): the index of sliced param in sliced_param_list

    Examples:
        .. code-block:: python

            >>> import numpy as np
            >>> from paddle.distributed.auto_parallel.static.utils import _get_sliced_param_index

            >>> complete_param = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]])
            >>> rank = 2
            >>> complete_shape = [1, 1, 6]
            >>> dims_mapping = [-1, -1, 0]
            >>> process_shape = [3]
            >>> process_group = [0, 1, 2]

            >>> slice_param = _slice_parameter(complete_param, [[], [], [2, 4]], 3)
            >>> print(slice_param)
            [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])]

            >>> index = _get_sliced_param_index(rank, complete_shape, dims_mapping,
            ...                                 process_shape, process_group)
            >>> print(index)
            2
    r   rM  r   r5   )rO  rN  rQ  r&  )r   rT  rI   r   r   rN  rX  r_  rK   rG   slice_shaper1   s               r+   r\  r\    s    > #"""""77nlM= O n-- 	Q 	Q5?b  KK=a#AAK!#A&q)EE$Q'*Q.;>E/5K3GH5Pr-   c                 2   ddl m} g }|D ]`}|                    || |||          }|rAt          t	          |                    D ]#}||                             ||                    $^|}at          t          d ||                     }d |D             }|S )a  
    Get split indices of every dimension.

    Returns:
        split_indices_list(list): the split indices of every dimension of the parameter

    Examples:
        .. code-block:: python

            >>> import numpy as np
            >>> from paddle.distributed.auto_parallel.static.utils import _get_split_indices

            >>> complete_param = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]])
            >>> complete_shape = [1, 1, 6]
            >>> dims_mapping = [-1, -1, 0]
            >>> process_shape = [3]
            >>> process_group = [0, 1, 2]

            >>> index = _get_split_indices(complete_shape, dims_mapping, process_shape, process_group)
            >>> print(index)
            [[], [], [2, 4]]
    r   rM  c                 J    t          t          |           |hz
  dhz
            S r   )r0   r9  xys     r+   <lambda>z$_get_split_indices.<locals>.<lambda>  s!    c!ffslaS011 r-   c                 ,    g | ]}t          |          S r6   )r   rB   r  s     r+   r   z&_get_split_indices.<locals>.<listcomp>  s    @@@&))@@@r-   )rO  rN  rQ  rF   r/   r{  r0   map)	rT  rI   r   r   rN  split_indices_listrW  rX  dims	            r+   rZ  rZ    s    2 #"""""  1 1#;;^\=-
 
  	1S1122 E E"3'..s/CDDDDE "111	
 	
  A@-?@@@r-   c                     t          |                     d                    }t          | j        v o9|t          t          j                  k    p|t          t          j                  k    S )Nop_role)rA   r   OP_ROLE_KEY
attr_namesOpRoleForwardLossr   r  s     r+   is_forward_opr    sU    "'')$$%%G"-' 3v~&&&E'S5E5E*Er-   c                     t           | j        v oEt          |                                 t                              t          t          j                  z  S r?   )r  r  rA   	all_attrsr  Backwardr   s    r+   is_backward_opr    D    "-' C
{#- -FO- r-   c                     t           | j        v oEt          |                                 t                              t          t          j                  z  S r?   )r  r  rA   r  r  Optimizer  s    r+   is_optimize_opr    r  r-   c                     t           | j        v oJt          |                                 t                              t          t          j        j                  z  S r?   )r  r  rA   r  r  r  LRSchedr  s    r+   is_lr_sched_opr  $  sG    "-' %C
{#- -FO#$$-% %r-   c                     t           | j        v o`t          |                                 t                              t          t          j                  t          t          j                  z  k    S r?   )r  r  rA   r  r  r  r  r  s    r+   
is_loss_opr  *  sS    "-' 2C
{#- -
fn

FK 0 0
0-2 2r-   c                     t           | j        vrdS t          |                                 t                              }|t          t          j                  z  o|t          t          j                  z  S )NF)r  r  rA   r  r  r  r  r  s     r+   is_loss_grad_opr  0  sW    "-''u",,..-..GS)))HgFK8H8H.HHr-   c                     | j                             d          o,| j                             d                              d          S )Nop_namescopez/gradient_clip)r   r.  r   
startswithr  s    r+   is_gradient_clip_opr  7  sB    7N++ #1 1j!""#r-   c                 n    | j                             d          od| j                             d          v S )Nr  /auto_parallel/reshard)r   r.  r   r  s    r+   is_reshard_opr  =  s<    7  C
"bgll>&B&B
BCr-   c                 6    | j                             d          S )N_p)r   endswithr  s    r+   
is_prim_opr  C  s    7D!!!r-   c                 ,    |                      d          S )Nring_id)r.  r  s    r+   
is_comm_opr  G  s    ;;y!!!r-   c                    g }| j         D ]Z}t          |          rIt          |j                                                  dk    s
J d            |                    |           [t          |          dk    s
J d            |d         S )Nr   z#loss op should only output loss varz"num of loss op is not equal to oner   )r   r  r/   r   output_arg_namesrN   )r   loss_opsr   s      r+   get_loss_opr  K  s    Hi    b>> 	 rw//1122a7775 877 OOBx==ACA;r-   c                    t                      }||_        t          |t          t          j        f          rt          |          |_        nCt          |t          j                  r||_        n!t          | dt          |                     |                    d          r*|                    d           |                    d           |                    d          r|d         |_        |                     ||           |S )Nz8 must be a instance of ProcessMesh or list, but receive mark_annotatedrI   rJ   chunk_id)r   rI   r@   r0   r   r7  r   rJ   r   r   r   getr  r  r   )ry   r   rI   rJ   kwargsr   s         r+   set_var_dist_attrr  X  s   %''$0!,rz 233 
(3L(A(A%%	L$"2	3	3 
(4%%iiUYZfUgUgii
 
 	
 zz"## 8''777''777zz* 7$*:$6!11#7GHHHr-   c                 x   |J |J t                      }| j                                        D ]}|                    ||           | j                                        D ]}|                    ||           ||_        |                    d          r|d         |_        |	                    | |           d S )Nr  )
r   r   input_arg_namesset_input_dims_mappingr  set_output_dims_mappingrJ   r  r  set_op_dist_attr_for_program)new_oprJ   ref_mappingctxr  new_op_dist_attrinput_varnameoutput_varnames           r+   6naive_set_dist_op_attr_for_program_by_mesh_and_mappingr  m  s     ###"""'))4466 L L//{KKKK +6688 N N00MMMM$0!zz* 7$*:$6!$$V-=>>>>>r-   c                 @   |J t                      }| j                                        D ]L}| j                            |          }|                    |          j        }|                    ||           M| j                                        D ]L}| j                            |          }|                    |          j        }|	                    ||           M||_
        d|v r|d         |_        d|v r|d         |_        |                    | |           d S )Nis_recomputer  )r   r   r  r   r   r   rI   r  r  r  rJ   r  r  r  )	r  rJ   r  r  r  r  r   r8   r  s	            r+   *naive_set_dist_op_attr_for_program_by_meshr    s:    ###'))4466 H Hl}--66s;;H//wGGGG +6688 J Jl~..66s;;H00IIII$0!(.~(>%V$*:$6!$$V-=>>>>>r-   c           	         d}| j         }| j        j        }|                                dk    s|                                dk    rdS |                                }g }d|v r|                    d          }g }|                                D ]}|                     |          }|j        r|	                    |          }	t          |	          dk    rHt          |	dd                    D ]0\  }
}|dk    s%J |                                 d|
 d| d	            1t          |	          dk    r|                    |	d
                    |                                D ]g}|                     |          }|j        r |                    |          }	||vrt          |	          dk    rHt          |	dd                    D ]0\  }
}|dk    s%J |                                 d|
 d| d	            1t          |	          dk    r|                    |	d
                    |	d
         dk    s"J |                                 d| d	            t          |	          dk    rHt          |	dd                    D ]0\  }
}|dk    s%J |                                 d|
 d| d	            1|                    |	d                    it!          |          }|
J d            |                                D ]Z}|                     |          }|j        r|	                    |          }	t          |	          dk    r||	d
         k    r||	d
<   d}[|                                D ]r}|                     |          }|j        r|                    |          }	||vr't          |	          dk    r||	d
         k    r||	d
<   d}_||	d         k    r||	d<   d}s|S )NFrG   sliceXShaper   r5   zD only the batch dimension (0-dim) can be sharded, but the dimension z is sharded by z part.r   z^ only the batch dimension (1-dim) of XShape can be sharded, but the dimension 0 is sharded by r   zN only the batch dimension (1-dim) of XShape can be sharded, but the dimension #There is no compatible dim mapping.T)r   	serial_opr   r   output_namesoutputr  get_serial_inputr	   get_input_dims_mappingr/   r&  rN   r  get_serial_outputget_output_dims_mappingr^   )r   rm   r   op_descr  xshape_arg_namesbatch_dim_mappingsarg_nameserial_tensorrI   r   r8   rd   s                r+   +update_op_dims_mapping_by_default_dist_implr    s   G$L$G||~~  GLLNNg$=$=u''))L<">>(33++-- 7 700::% 	#::8DD|q   ),qrr*: ; ;  W"}}}||~~  O  Okn  O  O  @G  O  O  O %}}} |!!%%l1o666,,.. 7 711(;;% 	#;;HEE+++<  1$$$-l122.>$?$?  LC"b==="<<>>  S  Sor  S  S  DK  S  S  S )=== <  A%%")),q/:::?b(((<<>>  Q  Q  BI  Q  Q  Q )(( <  1$$$-l122.>$?$?  LC"b==="<<>>  ]  ]y|  ]  ]  NU  ]  ]  ] )=== %%l1o6666;<NOO!--- .-- ++--  00::% 	#::8DD|!!&<Q&O&O4LOG,,..  11(;;% 	#;;HEE+++L!!Q&&*l1o=="8Q%a88"8QNr-   c                    d}| j         }| j        j        }|                                }i }i }d}|D ]P}|                    |          }	|t          |	          k     rt          |	          }|	||<   t          |	          ||<   Qg }
|D ]}||         |k     rfd t          |          D             }t          ||                   D ]!}|||         z
  |z   }||         |         ||<   "|
                    |           t|
                    ||                    |                                }|D ]A}|	                    |          }	t          |	          |k    sJ |
                    |	           Bt          |
          }|
J d            |D ]}||         |k     rud t          ||                   D             }t          ||                   D ]}|||         z
  |z   }||         ||<   |||         k    r|                    ||           d}|||         k    r|                    ||           d}|D ]5}|	                    |          }	||	k    r|                    ||           d}6|S )NFr5   c                     g | ]}d S r5   r6   rB   _s     r+   r   zHupdate_op_dims_mapping_by_elementwise_like_dist_impl.<locals>.<listcomp>  s    HHHqHHHr-   r  c                     g | ]}d S r  r6   r  s     r+   r   zHupdate_op_dims_mapping_by_elementwise_like_dist_impl.<locals>.<listcomp>  s%               r-   T)r   r  r   r  r  r/   rF   rN   r  r  re   r  r  )r   rm   r   r  r  input_dims_mapping_dictinput_dims_mapping_lensmax_dims_mapping_lenr  rI   ra   new_dims_mappingrK   new_idxr  compatible_dims_mappings                   r+   4update_op_dims_mapping_by_elementwise_like_dist_implr    sR   G$L$G--//O  # > >#::8DD#l"3"333#&|#4#4 ,8),/,=,=))# 
H 
H"8,/CCCHHE2F,G,GHHH28<== Q Q(+B8+LL -DH,Ma,P ))$$%56666$$%<X%FGGGG//11$ / /#;;HEE<  $88888  ....=>OPP"..- /.. $  "8,/CCC   !"9("CDD      28<== G G(+B8+LL '>g&F ###:8#DDD33H>NOOO&*A(*KKK335   $  #;;HEE"l22001   GNr-   c                    ddl m} | j        }t          j        |          }g }|t
          j                                        n!t          |	                    d                    }t          |          D ]V}t          j        |          }	 |            |	_        |                    ||	          \  }
}
}}}
|                    |           W|S )z2Get all distributed main programs by dist_context.r   )DistributedOperatorContextNGPU)ry   r  clustercopydeepcopyr   r  r  r/   get_all_devicesrF   _dist_op_context_get_dist_programrN   )serial_program_infory   parallelizerr  r  copied_parallelizerall_dist_main_programranksr  used_dist_contextr  dist_startup_programdist_main_programs                r+    get_all_distributed_main_programr  -  s     988888!)G-55 ? 	))+++((//00 

 << 
8 
8 M,77-G-G-I-I*  11';LMM	
 $$%67777  r-   c                       e Zd Z	 ddZed             Zed             Zed             Zed             Zed             Z	dS )	SerialProgramInfoNc                 L    || _         || _        || _        || _        || _        d S r?   )_train_program_startup_program_loss
_optimizer_cluster)selftrain_programstartup_programloss	optimizerr  s         r+   __init__zSerialProgramInfo.__init__K  s-     , /
#r-   c                     | j         S r?   )r  r  s    r+   r  zSerialProgramInfo.train_programT  s    ""r-   c                     | j         S r?   )r  r  s    r+   r  z!SerialProgramInfo.startup_programX  s    $$r-   c                     | j         S r?   )r  r  s    r+   r  zSerialProgramInfo.loss\  s
    zr-   c                     | j         S r?   )r  r  s    r+   r  zSerialProgramInfo.optimizer`  s
    r-   c                     | j         S r?   )r   r  s    r+   r  zSerialProgramInfo.clusterd  s
    }r-   r?   )
__name__
__module____qualname__r  propertyr  r  r  r  r  r6   r-   r+   r  r  J  s        GK        # # X# % % X%   X   X   X  r-   r  c                    d }dd l m} |                                }|                                 d}dddddd	d
d}g }g d}| D ]}i }	|                                j        }
|                                j        D ]}d}|j        |v r||	|j        	                                <   +|j
        r%t          |
|j
        d                  j                  nd}t          |                    d                    t          t          j                  k    rd|j        v r~|j        d d         }||                                v r||         }|                    |d|          }|r ||||
          }n|                    ||          }|rd ||||
          z  }nt          |                    d                    t          t          j                  k    rS|j        |                                v r||j                 n|j        }|                    |          }|r ||||
          }||	|j        	                                <   |                    |	           |S )Nc                 *   d}	 t          | d                   }n	#  |cY S xY w| d         }d}d}|                    d          }d}|D ]+}	d|	v rdnd}||	v r|	d |	                    |          dz
           }
|	                    d          }|	                    d	          }|dk    r|dk    r||k    s
J d
            |	|dz   |                             d          }d |D             }d}|t          d |d          z  }|j        dk    r
|
dk    rdnd}
|j        D ]U}|                                |
k    r;|                    |          D ]#}||         }|t          d |j                  z  }$ nV-|dk    r|dk    s
J d            ||z  |z  }|S )Nr   op_timeconfig
z
(Variable)z(list<Variable>r   []zGet shape failed.,c                 P    g | ]#}t          |                                          $S r6   )rA   rt   r  s     r+   r   zFget_standalone_cost_data.<locals>._compute_runtime.<locals>.<listcomp>  s&    777AQWWYY777r-   c                     | |z  S r?   r6   r  s     r+   r  zDget_standalone_cost_data.<locals>._compute_runtime.<locals>.<lambda>  s
    q1u r-   c_embeddingweightwidsc                     | |z  S r?   r6   r  s     r+   r  zDget_standalone_cost_data.<locals>._compute_runtime.<locals>.<lambda>  s
    QU r-   zGet input size failed.)	floatrz  findr   r   input_nameslowerinputrG   )op_costr   r|   runtime	op_configtotal_static_input_sizetotal_actual_input_sizeparsed_infovariabler  arg_name_lowershape_left_boundaryshape_right_boundaryrG   dtype_factorr  r4  r   actual_runtimes                      r+   _compute_runtimez2get_standalone_cost_data.<locals>._compute_runtimej  sM   	GI.//GG	NNNH%	"#"#ood++ 	 	D , 4 4:K  4!%&?		((;(;a(?&?!@&*iinn#'+yy~~$'!++,q00,/BBBB& CBC '!+.BB%**  87777 '62D2DeQ+O+OO'7m++-99u # !#  H~~''>99(*(:(:  H"&x.C3v 2 2CI8 8 33  : '**/F/J/J/J$ 0K/JJ
 $&==G 	 s     r   r   	embeddingmatmul	transposereshape	unsqueezer   divide)r  	matmul_v2
transpose2reshape2
unsqueeze2r   r   )create_py_readercreate_double_buffer_readerreadassignfloat32r  _gradF)forwarddtype)rD  )paddle.cost_model
cost_model	CostModelstatic_cost_datar-  r|   r   r   r   idr  rX   rD  rA   r   r  r  r   get_static_op_timer  rN   )distributed_programsr1  cmrF  DEFAULT_MULTIPLEOP_NAME_MAPPINGstandalone_cost_datanot_enum_opsdistributed_program	cost_datar|   r   r&  rD  forward_op_namer%  op_names                    r+   get_standalone_cost_datarU  i  s   1 1 1f #"""""J!!!"!!# O   L  4 )/ )/	"//116%22448 $	. $	.BGw,&&*1	"'**,,' %D+A./5666 
 2779%%&&#fo*>*>>>bg%%&(gcrclO&/*>*>*@*@@@*9/*J(;;'e <  G  N"2"27B"E"E","?"?+5 #@ # # # N&'*:*:7B*M*M&MGRWWY''((C,?,??? w/"6"6"8"888 $BG,, 
 %77@@ B..wDAAG&-Ibgjjll####I....r-   c                     |                                 }|                                }||j        v r|                     |           d S ||j        v r|                     |           d S t	          d          )Nz6Cannot find the original id in the distributed context)rI  original_id_dist_ops_for_programset_original_idAssertionError)dist_op_descr  ry   op_idop_original_ids        r+   set_dist_op_desc_original_idr^    s    JJLLE((**N222$$U+++	<=	=	=$$^444 D
 
 	
r-   c                 f    | | S t          | t          t          f          rt          |           S | gS r?   )r@   r0   tuple)r   s    r+   to_listra    s5    }%$'' E{{7Nr-   c                    t           j                            || dt          j                                                   }t          |d          5 }|                    t          |                      d d d            d S # 1 swxY w Y   d S )Nz	_program.r  )	r   r   r  r   r  r  openwriterX   )r   r   r'   filenamefs        r+   debug_programrg    s    w||?? 2 ; ; = =?? H 
h		 	G                 s   #BB	Bc                 J    ddl m}  |            D ]}|j        | k    r|c S d S )Nr   )get_all_process_groups)r   ri  rI  )r  ri  gs      r+   ring_id_to_process_grouprk    sJ    555555##%%  47??HHH 4r-   c                 \    ddg}| j         D ]}|j        D ]}|D ]}||j        v r   dS  dS )N
_grad_gradtriple_gradTF)r{   r   r   )r   higher_order_op_suffixr   r   suffixs        r+   find_higher_order_backward_oprq    sl    *M:    ) 	  	 B0    RW$$4444 % 	 
 5r-   c                 t    t          | t                    sJ d| j        vsJ t          d | j        d          S )zU
    input:
        - var: variable
    return:
        number of element in var
    r5   c                     | |z  S r?   r6   r  s     r+   r  zget_var_numel.<locals>.<lambda>!  s
    q1u r-   r   )r@   r
   rG   r   r   s    r+   get_var_numelru    sE     c8$$$$$SY$$ci333r-   c                 P   t          | t          j        j                  r|                                 S t          | t          j        j                  r5t          | j        t                    r| j        S |                                 S t          dt          |            d          )Nzg'optimizer' must be object of class `paddle.optimizer.Optimizer` or `paddle.static.Optimizer`, but got r   )
r@   r   r  	Optimizerget_lrr   _learning_rater   r   r   r  s    r+   rx  rx  $  s    )V-788 
!!!	Iv}6	7	7 	
i.66 	.++++---I6:9ooI I I
 
 	
r-   c                 T   dd l }ddlm} g }d} |            }|j                            d          \  }}t          |          |z   }	d }
d}|                     |j        |j                  }
|
                    ||	f           |
	                    d           i }| D ]v}||j
        vrt          |j
                  dk    r<|j
                            |          }|dk    rd	nd
}|r.|j
        d         }|j        |                             d          \  }}t          |          |z   }|                     |j        |j                  }|                    ||f           |                    t!          |                              d                     |                    |                              d          }t          |          }||k    rt)          d| d| d          t+          d|j
         d           |                                 n|j
        d         }	 ||vrf|
                                \  }}t          |                    |                                                    }|||<   |                    |           nn||                             t!          |                              d                     ||                                          t+          d|j
         d           n|                                 x|
                                 d S )Nr      )_get_global_envi  :i   
   r   TFr   zutf-8z0Please check comm pair, the recv rank should be z	 but got r   zIt is able to instantiate z as sender now.z as receiver now.)socket
collectiver}  current_endpointrz  rA   AF_INETSOCK_STREAMbindlistenr  r/   r1   trainer_endpointsconnectsendrX   encoderecvdecoder   r   closeacceptrN   instantiate)all_process_groupscur_rankr  r}  has_recv_by_socket	magic_numgenvcur_rank_ipcur_rank_portcur_rank_recv_portserver_socket	buff_sizeclient_socketsr   r1   is_send	recv_rankrecv_rank_iprecv_rank_portconnect_portclient_socketr   	send_rank	recv_addrs                           r+   initialize_pg_in_full_moder  3  s   MMM------I?D!%!6!<!<S!A!AK]++i7MIMM&.&2DEEM%78999N+ -$ -$=...}"##q((!'--h77E#qjjddeG &)/2	/3/E0%** -n  #>22Y> &NF$6! ! %%|\&BCCC""3x==#7#7#@#@AAA$)))44;;GDD4yy9$$$f9ff_cfff   Y]5HYYY   ##%%%%)/2	 (:::3@3G3G3I3I0y"=#5#5i#@#@#G#G#I#IJJ/<t,*11$7777&y166MM0099   'y177999_9L___    	!!####r-   c                     |                      d          o-d|                     d          v od|                     d          vS )Nr  z/auto_parallel/rc
exclude_rcr.  r   r  s    r+   is_recompute_opr  w  sH    
N## 	8277>#:#::	8 7 77r-   c                 Z    |                      d          od|                     d          v S )Nr  r  r  r  s    r+   is_recompute_exclude_opr    s5    ;;~&& <277< < , r-   c           	          ddl m} |sd S |j        }|j        sd S g }t	          | t
          j        j                  rot          | d          rW| j	        j
        dv rIt          | j        d          r4| j        j        }t          |          dk    r|                                 n|j        }n|j        }|sd S |                                } |||j                  }|                                 |                    |          }	g }
d}d}|dz   t          |	          k     r|dk    rj|	|dz            }||j        vr|dz  }7|j        |         d	         }|r:t)          |          d
k    r'|
                    d
t)          |          dz   g           n||                    |	|         g|	|dz            g          \  }}}|r1|                    ||          }|
                    ||dz   g           nt1          j        d| d|dz    d           |dz  }|dz   t          |	          k     t5          |
          D ]U\  }}t7          |d
         |d                   D ]3}|j        |                             ddt;          |          z              4Vd S )Nr|  )RecomputeStategpt)GPTForPretrainingGPTForPretrainingAutocheckpointsr   r5   r   var_as_output_opsr   zCould not recompute op range [z] - [z] r  z/auto_parallel/rc_)passes.auto_parallel_recomputer  	recomputeenabler@   r   nnLayerhasattr	__class__r  r  r  r/   r<  r-  r   build_statssort_checkpointsvar_op_depsmaxrN   is_subgraph_update_segment_startr   debugr&  rF   	_set_attrrX   )r   lossesstrategyr   r  r  ckptsr   rc_stater  segments	start_idxpre_segment_end_idx	ckpt_nameop_idx_listflagmin_idxmax_idxrK   segmentjs                        r+   set_recompute_segmentsr    s   @@@@@@ "I 
 E%)) &E5!!	*( 
 	=11 I)E5zzA~~		)EE%   ""E~eUY//H++E22KHI
a-#k**
*
*??#IM2I 444Q	".y9:MNK ;s;//!33C$4$4q$8 9:::%-%9%9Y'(;y1}+E*F& &"D'7  "880  'A+ 67777RWRR7Q;RRR   	Q	- a-#k**
*
*0  ))  
7wqz71:.. 	 	AIaL"" 4s1vv =   	 r-   c                 D   |                     |          }|j        }|j        }| |j        vrt	          |||           }n| }|d         }|dk    rQ|j        |         dk    r@t          |j        |j        ||          }t          |          |                    |          fS dS )Nr   r5   r   )r   r   )	r   rJ   rI   r   r   rG   r   r/   r1   )	r  r   ry   r   rJ   rI   r  batch_size_axisgroup_rankss	            r+   get_input_split_infor    s    #DDSII#0L#0L|///),hOO"1oO 2? Ca G G%$	
 
 ;!2!27!;!;;;4r-   c                     | Ed | _         d | _        | j        r0t          | j        t          j        j                  rd| j        _        | S )NT)_parameter_list_param_groups
_grad_clipr@   r   r  ClipGradByGlobalNorm_async_add_nrz  s    r+   validate_optr    sT    $(	!"&	 	5J &)"@%
 %
 	5 15I -r-   c                     ddl m}m} ddlm}  |            j        } ||dg          }t          |          dk    rdnd gd t          t          | j                  dz
            D             z   } || ||          S )Nr   )r   shard_tensorr   )get_world_process_groupdpc                     g | ]}d S r?   r6   r  s     r+   r   z%set_data_parallel.<locals>.<listcomp>  s%     = = == = =r-   )		interfacer   r  r   r  r  r/   rF   rG   )r  r   r  r  world_ranksrJ   rP   s          r+   set_data_parallelr    s    55555555666666))++1K;{TF33Lk**Q..$$D9 = =CLL1,--= = = J <<444r-   c                     | j         sdS d | j                                        j        D             }t	          |          t	          t
                    z  s	| j         rdS dS )NFc                     g | ]	}|j         
S r6   r   )rB   r   s     r+   r   z*is_naive_data_parallel.<locals>.<listcomp>  s*        	  r-   T)data_parallel_original_serial_main_programr-  r   r9  __not_naive_data_parallel_op__)ry   ops_types     r+   is_naive_data_parallelr     sw    % u <IIKKO  H
 MMC >???

$ t5r-   c           
          |j         }|Nt          j        |j        |j        d t          t          |j                            D                       | _         |j        | _        |j        | _        d S )Nc                 2    g | ]}d t          |          z   S rC   rX   rB   rK   s     r+   r   z1_copy_tensor_dist_attr_to_cpp.<locals>.<listcomp>  "    EEEaS3q66\EEEr-   )	rJ   r   r   rG   r   rF   r/   rI   	annotated)cpp_dist_attrpy_dist_attrpy_process_meshs      r+   _copy_tensor_dist_attr_to_cppr    ss    "/O"%)%5!'EE5_-B)C)C#D#DEEE&
 &
"
 ".!:M*4Mr-   c                     ddl m} | j         }| ||j        |j                  |_         | j        |_        | j        |_        d S Nr   )r   )rG   r   )rJ   r   rG   r   rI   r  )r  r  r   cpp_process_meshs       r+   _copy_tensor_dist_attr_from_cppr    sf    ******$1#$/K"((4%
 %
 %
! !. :L*4Lr-   c           
      $   |j         }|Nt          j        |j        |j        d t          t          |j                            D                       | _         |j        | _        |j        | _        |j	        | _	        |j
        | _
        |j                                        D ]*\  }}|                     |          }t          ||           +|j                                        D ]*\  }}|                     |          }t          ||           +d S )Nc                 2    g | ]}d t          |          z   S r  r  r  s     r+   r   z-_copy_op_dist_attr_to_cpp.<locals>.<listcomp>/  r  r-   )rJ   r   r   rG   r   rF   r/   	impl_typeimpl_idxr  r  inputs_dist_attrsr   get_input_dist_attrr  outputs_dist_attrsget_output_dist_attr)r  r  r  r'   py_tensor_dist_attrcpp_tensor_dist_attrs         r+   _copy_op_dist_attr_to_cppr  )  s-   "/O"%)%5!'EE5_-B)C)C#D#DEEE&
 &
"
 +4M)2M!-!:M*4M%1%C%I%I%K%K Q Q!!,@@FF%&:<OPPPP%1%D%J%J%L%L Q Q!!,AA$GG%&:<OPPPPQ Qr-   c                    ddl m} | j         }| ||j        |j                  |_         | j        |_        | j        |_        | j        |_        | j        |_        | j        	                                D ]*\  }}|
                    |          }t          ||           +| j        	                                D ]*\  }}|                    |          }t          ||           +d S r  )rJ   r   rG   r   r  r  r  r  r  r   r  r  r  r   )r  r  r   r  r'   r  r  s          r+   _copy_op_dist_attr_from_cppr  =  s,   ******$1#$/K"((4%
 %
 %
! +4L)2L - :L*4L&3&E&K&K&M&M 
 
""*>>tDD' "5	
 	
 	
 	
 '4&F&L&L&N&N 
 
""*??EE' "5	
 	
 	
 	

 
r-   c                     | j                                         D ]!}t          |j        j        |j                   "| j                                        D ]!}t          |j        j        |j                   "d S r?   )_dist_tensors_for_programr}   r  r  r   rX  r  r  ry   r   r   s      r+   _copy_dist_attr_to_cppr	  V  s    #=DDFF 
 
%%/1F	
 	
 	
 	
  5<<>> 
 
!'):	
 	
 	
 	

 
r-   c                     | j                                         D ]!}t          |j        j        |j                   "| j                                        D ]!}t          |j        j        |j                   "d S r?   )r  r}   r  r  r   rX  r  r  r  s      r+   _copy_dist_attr_from_cppr  b  s    #=DDFF 
 
'%/1F	
 	
 	
 	
  5<<>> 
 
#'):	
 	
 	
 	

 
r-   c                    | j         D ]}|                                rR|                                >|                     |          }|                                j        }t          ||           |                                rR|                                >|                     |          }|                                j        }t          ||           d S r?   )
serial_ordered_nodesis_varr   get_tensor_dist_attr_for_graphr   r  is_opr   get_op_dist_attr_for_graphr  ry   noder  r  s       r+    _copy_dist_attr_to_cpp_for_graphr  n  s    1 C C;;== 	GTXXZZ3'FFtLLL HHJJ0M)-FFF::<< 	CDGGII1'BB4HHL GGII/M%m\BBBC Cr-   c                    | j         D ]}|                                rR|                                >|                     |          }|                                j        }t          ||           |                                rR|                                >|                     |          }|                                j        }t          ||           d S r?   )
r  r  r   r  r   r  r  r   r  r  r  s       r+   "_copy_dist_attr_from_cpp_for_graphr  z  s    1 E E;;== 	ITXXZZ3'FFtLLL HHJJ0M+M<HHH::<< 	EDGGII1'BB4HHL GGII/M'|DDDE Er-   c                     t                      rdS t          |j                  dk    sJ d| d            t          |j                  dk    sJ d| d            |                    |          j        }|                    |          j        }	||	k    sJ d| d|	 d            d }
 |
 fd	|j        D                       } |
 fd
|j        D                       }t           ||||t          j        ||||d          S )z@
    dependency: prior_op should be run before posterior_op
    Nr   z9first op of dependency should at least have one output. [r  z9second op of dependency should at least have one input. [z5two ops of dependency should have same mesh but got [z] and [c                     d | D             } t          |           dk    sJ d | D             }|                    d            |d         d         S )Nc                      g | ]}|j         	|S r6   )r	   rB   r   s     r+   r   zTinsert_dependencies_for_two_ops.<locals>._select_best_depend_var.<locals>.<listcomp>  s     <<<3+;<<<<r-   r   c                 0    g | ]}|t          |          fS r6   )ru  r  s     r+   r   zTinsert_dependencies_for_two_ops.<locals>._select_best_depend_var.<locals>.<listcomp>  s%    FFF#S-"4"45FFFr-   c                     | d         S )Nr   r6   r  s    r+   r  zRinsert_dependencies_for_two_ops.<locals>._select_best_depend_var.<locals>.<lambda>  s
    AaD r-   keyr5   )r/   sort)r|   vars_with_numelss     r+   _select_best_depend_varz@insert_dependencies_for_two_ops.<locals>._select_best_depend_var  se    <<t<<<4yy1}}}}FFFFF..111#A&&r-   c                 :    g | ]}                     |          S r6   rt  rB   r'   r   s     r+   r   z3insert_dependencies_for_two_ops.<locals>.<listcomp>  s#    ???T4???r-   c                 :    g | ]}                     |          S r6   rt  r$  s     r+   r   z3insert_dependencies_for_two_ops.<locals>.<listcomp>  s#    BBBT4BBBr-   F)rJ   r  syncr  use_nop)	is_sequential_runr/   r  r  r   rJ   insert_dependencies_for_varsr  r  )r   r   prior_opposterior_opry   r  r&  r  prior_op_meshposterior_meshr"  	first_var
second_vars   `            r+   insert_dependencies_for_two_opsr0    s     x())Q...OHOOO /.. |+,,111SLSSS 211 !==   ">>   N***gggVdggg +**' ' ' ('????X%>??? I )(BBBB\%ABBB J ("!!   r-   Tc                    |rt                      rdS t          |t                    r|g}t          |t                    r|g}|D ]}|                     |j                  sJ |D ]}|                     |j                  sJ |                    |d                   }||j        }|J d}
|
r|                     |dd|id|i          }n|                     |d||d	d|i          }|                    t          |           |s|d
gk    rt                      }d|_        d|_        ||_        ||_        |j        |_        |j                                        D ]G}|                     |          }|                    |          j        }|                    ||           H|j                                        D ]G}|                     |          }|                    |          j        }|                    ||           H|                    ||           |	|                    dd|	            |r|                                  |S )zc
    dependency: op that generates prior_vars should be run before op that generates post_vars
    Nr   TnopXOut)r   inputsoutputsdepend)r3  Depr5   defaultr  /)r(  r@   r
   has_varr'   r   rJ   _insert_op_without_syncr  r  r   r  r  r  r  r   r  r   rI   r  r  r  r  _sync_with_cpp)r   r   
prior_vars	post_varsry   oprolerJ   r  r&  r  r'  skip_insert_when_sequential_run	prior_varpost_varpost_dist_attr	depend_opdepend_op_dist_attrr  r   r8   r  s                        r+   r)  r)    s   $ ' +<+>+> *h'' " \
)X&&  K	 - -	}}Y^,,,,,, , ,}}X]++++++!BB9Q<PPN%2###G 
11Z I& 2 
 
		 11!  I& 2 
 
	 V,,,  
|t++.00'($(1%+7(+7('5'>$&^;;== 	O 	OM))M**C"CC    66}gNNNN'n==?? 	Q 	QN))N++C"CC    77PPPP11*	
 	
 	
 N,>,>,>??? r-   c                     d| j         v rdS dS )Nc_TFr  r  s    r+   is_dep_skip_oprI   	  s    rwt5r-   c                       fd}|S )Nc                      t           j                                        r | i |S t           j        j                                        5   | i |cd d d            S # 1 swxY w Y   d S r?   )r   	frameworkin_dynamic_moder   dygraphguard)argsr  funcs     r+   __impl__z!_dygraph_guard_.<locals>.__impl__(	  s    ++-- 	-4(((($**,, - -tT,V,,- - - - - - - - - - - - - - - - - -s   A  A$'A$r6   )rQ  rR  s   ` r+   _dygraph_guard_rS  '	  s#    - - - - - Or-   c                  P    t          t          j        d          d                   S )N!FLAGS_new_executor_sequential_run)r   r   	get_flagsr6   r-   r+   r(  r(  5	  s+    <==/	
  r-   c                     t          | j                  dk     rdg fS t          | j                  }t          |          |fS )Nr   r   )r/   r   get_sub_process_mesh)ry   sub_process_meshess     r+   get_pp_degreerZ  =	  sG    
<&''!++"u-l.IJJ!""$666r-   c                 ,   |                                  j        }g }t          |          D ]G\  }}d|                                v r,|j        r%|j        j        }||vr|                    |           Ht          |          }t          |d           }|S )Npd_opc                     | j         d         S r   )r   r  s    r+   r  z1get_sub_process_mesh_by_program.<locals>.<lambda>Q	  s    !-*: r-   r  )	r-  r   r&  r'   r   rJ   rN   rX  r   )dist_programall_opsr   r   r   rJ   rY  s          r+   get_sub_process_mesh_by_programr`  E	  s    ''))-GNW%% 4 4RbggiiBL<4L>11%%l333-n== : :   r-   c                    t                      }t          j        |           }|D ]}|t          |j                  z  }g }d}t	          |          D ]g\  }}t          t          |j                            t          |          k    r|                    |           Mt          |j                  |k     rd}h|r't          |          D ]}|                    |           |S rk   )	r9  r  r  r   r&  r/   rN   r   r<  )r   r   rY  pmglobal_pm_idx
has_sub_pmr   s          r+   rX  rX  W	  s    %%K~66  + +s2>***MJ/00  Rs2>""##s;'7'777  %%%%  ;..J (M** 	( 	(C""3''''r-   c                 X    d }t          | j                  D ]\  }}||j        v r|} n|S r?   )r&  r   r   )ry   r   pp_idxr   rJ   s        r+   get_pp_stagerg  m	  sJ    F&|'BCC  \<+++FE , Mr-   c                 ^    t           j                                        }t          ||           S r?   )r   r  r  get_pp_stage_by_rank)	pp_degreer  s     r+   get_pp_stage_by_pp_degreerk  v	  s&    !**,,H)444r-   c                     d }| j         D ]5}t          ||          }|||k    r d S ||k    sJ d| d|             |}6|S )Nz;Can't get pp_stage by process_mesh with different pp_stage z and )r   ri  )rJ   rj  pp_stage_for_process_meshr   pp_stages        r+   get_pp_stage_by_process_meshro  {	  s     $( - -'i88$0444tt8888xhxx]vxx 988 %-!!$$r-   c                 V    t           j                                        }||z  }| |z  }|S r?   )r   r  r  )r   rj  	word_sizepp_group_sizern  s        r+   ri  ri  	  s/    "1133I*M}$HOr-   r"  r  r  c                    g }g }i }| j         }|D ]b}| j                            |          }	|j                            |          }
|
j        }t          ||	          }|                    |           c|D ]b}| j                            |          }	|j                            |          }
|
j        }t          ||	          }|                    |           c|D ]}|j	        
                    |          ||<    |||fS )a  
    Get data used in inferring distributed attributes, including:
      1. DistTensorSpec for each input and output tensor of this dist_op.
      2. Operator attributes of this dist_op, e.g. transpose_x in matmul op.

    Args:
      dist_op: the DistributedOperator
      input_names: list, name of the dist_op's input tensors
      output_names: list, name of the dist_op's output tensors
      attr_names: list, attribute name of the dist_op's corresponding serial op

    Returns:
      input_specs: list, DistTensorSpec for each input tensor of the dist_op
      output_specs: list, DistTensorSpec for each output tensor of the dist_op
      attrs: dict, attribute map of the dist op

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('Depends on other ops.')
            >>> from paddle.distributed.auto_parallel.static.utils import wrap_data_for_completion

            >>> op_desc = dist_op.serial_op.desc
            >>> input_name_list = []
            >>> output_name_list = []

            >>> input_name_list.append(op_desc.input('X')[0]) # 'X' is the arg name for op
            >>> input_name_list.append(op_desc.input('Y')[0])
            >>> output_name_list.append(op_desc.output('Out')[0])

            >>> attr_name_list = ['trans_x', 'trans_y']
            >>> input_specs, output_specs, attrs = wrap_data_for_completion(
            ...        dist_op,
            ...        input_name_list,
            ...        output_name_list,
            ...        attr_name_list)

    )r  r   r  r   _var_recursiverG   r   rN   r   r   r   )r   r"  r  r  input_specsoutput_specsr/  r  r'   r   r   rY   	dist_spec	attr_names                 r+   wrap_data_for_completionry  	  s+   T KLE!I  & &",@@FFo,,T22y"<1ABB	9%%%%  ' '",AA$GGo,,T22y"<1ABB	I&&&& : :	$>..y99ie++r-   c                     | j         j                            |          j        }|r| j                            |          }n| j                            |          }t          ||          S r?   )r  r   rt  rG   r   r  r   r   )r   r'   is_inputrY   r   s        r+   get_dist_tensor_specr|  	  si    $*99$??EL H",@@FF",AA$GG,(8999r-   c                     | j         j        }t          |                                          dk    s
J d            |d         }|S )Nr   zinvalid grad_var_to_var)r  grad_var_to_varr/   r   )ry   grad_var_to_var_mapr~  s      r+   get_grad_var_to_varr  	  sM    &7G"''))**a///1J///)!,Or-   c                 \   ddl m}m} d}t          |                                 j                  D ]\  }}|                    d          r@|                    d          dk    r&g d}|j        	                                |v r|j        
                                }	d|	v sd|	v sd	|	v rVd|	v r|j                            d          n7d|	v r|j                            d          n|j                            d	          }
|j                                        }d
|v sd|v r8d
|v r|j                            d
          n|j                            d          }|
d         |                                v r|D ]}||
d                  ||<   |j        j        rk|                    |          rV|                    |          t#          |j                  z  r,|                    |          t#          |j                  z  r|}|dk    r|                                 j        |dz
           }|j                            d
          d         }|                                 j        |         }|j                            d
          d         }||                                vr	|||<   d S d S d S )Nr   )r  r  r5   r  r  )rz  r?  castc_concatconcatr  
all_gatherr3  Inputr  r4  outr   )/paddle.distributed.fleet.meta_optimizers.commonr  r  r&  r-  r   r.  r   r   r   r"  r$  r  r  r   ampr  rA   r  r  )r   r  r~  r  r  first_backward_op_idxr   r   reshard_op_typesr"  r5  r  r6  r  scale_loss_opscale_loss_var_namefirst_backward_opscale_loss_grad_var_names                     r+   update_grad_var_to_varr  	  s           W1133788 1( 1(R KK''&	M''+CCC      w||~~!111 g1133;&&+--k)) +-- c***  '+55 GMM'222!#s!3!3   "w3355L((E\,A,A !L00 u---W^^E22 
 !9 4 4 6 666") M M2A&)2L// L	(K((	( %%FO(<(<<	( %%FK(8(88		( %(! "",,..23H13LM+077>>qA#002267LM#4#9#@#@#G#G#J #?+?+?+A+AAA8KO4555 #"
 BAr-   c                     | j         }|D ];}|j        dk    r||_        |                                D ]}t          ||           <d S rT   )r   r  r{   set_all_ops_op_role)r   r  r_  r   rq  s        r+   r  r  0
  se    iG 4 4: BJ 	4 	4I	73333	44 4r-   c                 B   t           j        }t           j        }|r|sJ d|             |sJ d|             t          |           dk    sJ dt          |                        ||z  } || d         |d          } || d         |d          } || d	         |d          }	g }
t	          |          D ]N}|
|||z  |dz   |z           z  }
|
                    ||                    |
                    |	|                    O ||
d          S  || d          S )
a  fuse function for fusing weights

    (1) fuse_attention_qkv
        q => [q1,q2,q3,q4]
        k => [k1,k2,k3,k4] or [k1,k2] for GQA
        v => [v1,v2,v3,v4] or [v1,v2] for GQA
        fused weight => [q1,k1,v1,q2,k2,v2,q3,k3,v3,q4,k4,v4]
                or for GQA [q1,q2,k1,v1,q3,q4,k2,v2]
    (2) fuse_attention_ffn
        directly fuse weights to 1 parts
        [gate_weight], [up_weight] => [gate_weight, up_weight]

    Args:
        fuse_params (_type_): to be fused weights
        is_qkv (bool, optional): for attention qkv weights. Defaults to False.
        num_heads (_type_, optional): query heads. Defaults to None.
        num_key_value_heads (_type_, optional): key and value heads. Defaults to None.

    Returns:
        _type_: fused weights
    3num_heads should be number of heads for Q, but got Mnum_key_value_heads should be number of key_value_heads for K and V, but got r|  zKfuse_params length is not equal 3, it should be Q K V list. but got length r   r5   ra  r   r   )r   r  rz  r/   rF   rN   )fuse_paramsis_qkv	num_headsnum_key_value_heads	concat_fnsplit_fnnum_query_groupsq_listk_listv_list	qkv_pairsrK   s               r+   fuse_param_funcr  9
  s   0 I|H / 	
 	
M)MM	
 	
y # 	
 	
q\oqq	
 	
" ;1$$$lZ]^iZjZjll %$$ %(;;+a.)"===+a.*=BGGG+a.*=BGGG	*++ 	( 	(A$$A1A'AA I VAY'''VAY''''y,,,, y2....r-   c                    t           j        }t           j        }|r|sJ d|             |sJ d|             ||z  }g g g }
}	} || |d|z  z   d          }t          |          D ]o}||||dz   z  |dz   |dz   z  dz
           z  }|	                    ||dz   |dz   z  dz
                      |
                    ||dz   |dz   z  dz
                      p ||d           ||	d           ||
d          fS  || |d          S )au  split function for splitting weights

    (1) fuse_attention_qkv
        fused weight => [q1,k1,v1,q2,k2,v2,q3,k3,v3,q4,k4,v4]
                or for GQA [q1,q2,k1,v1,q3,q4,k2,v2]
        after split
        q => [q1,q2,q3,q4]
        k => [k1,k2,k3,k4] or [k1,k2] for GQA
        v => [v1,v2,v3,v4] or [v1,v2] for GQA
    (2) fuse_attention_ffn
        directly split weight to 2 parts
        [gate_weight, up_weight] => [gate_weight], [up_weight]

    Args:
        fused_param (_type_): len(fused_param)=1, only one weight to be split
        split_nums (int, optional): split_nums. Defaults to 2.
        is_qkv (bool, optional): for attention qkv weights. Defaults to False.
        num_heads (_type_, optional): query heads. Defaults to None.
        num_key_value_heads (_type_, optional): key and value heads. Defaults to None.

    Returns:
        _type_: split weights
    r  r  r   r5   ra  r   )r   r  rz  rF   rN   )fused_param
split_numsr  r  r  r  r  r  r  r  r  split_headsrK   s                r+   split_param_funcr  q
  s   < I|H : 	
 	
M)MM	
 	
y # 	
 	
q\oqq	
 	
" %(;;!#RhQ)<%<<2
 
 
 *++ 	M 	MAk%)*a!e8H18L-M.  F MM+q1u1AA1E&F&JKLLLMM+q1u1AA1E&F&JKLLLLIf2&&&If2&&&If2&&&
 	
 xZb9999r-   global_meshsub_mesh_dimc                    | j         }t          |          }||k    s|dk     r| |k    rt          d| d| d          |dk     r||z  }t          j        | j                                      |          }t          j        |||         |          }g }|D ]*}|                    t          || j
                             +|S )Nr   z"The sub_mesh_dim should between (-z, r  ra  )rG   r/   r   r   r'  r   r5  rz  rN   r   rO   )r  r  r   	mesh_ndimr   split_process_idssub_mesh_listsub_process_idss           r+   
split_meshr  
  s   "JJIy  ql]Y66JJJiJJJ
 
 	
 a	!(;233;;JGGKZ-L   M, 
 
)>??	
 	
 	
 	
 r-   c                    |                                  }|                                j                                        dk    rdS t          j                            |           t          j                            d| j	        | j
                  }|                    |                                            |                     |           dS )a  
    Update the subblock within a pylayer operation by modifying its output argument.

    This function optimizes a pylayer operation by removing unnecessary outputs from the 'cf.yield' step.

    Args:
        trivale_value (pir::Value): The output argument of the pylayer operation to be modified.

    Example:
        (1) Original pylayer operation:
            (%1, %2) = "pd_op.pylayer" (%0) {
                () = "cf.tuple_pop" [id:1]
                (%3, %4) = "dist_op.xxx" [id:2]
                () = "cf.yield" [id:3] (%3, %4)
            }
        (2) After calling `update_pylayer_output(%4)`, the updated pylayer operation removes the unused output:
            (%1) = "pd_op.pylayer" (%0) {
                () = "cf.tuple_pop" [id:1]
                (%3) = "dist_op.xxx" [id:2]
                () = "cf.yield" [id:3] (%3)
            }

    Args:
        trivale_value(pir::Value): The output argument of the pylayer op to be updated.
    r   N_fake_pylayer_out)r'   rG   rD  )rn  get_parent_block	parent_opr'   r   r   set_insertion_pointr   datarG   rD  set_typer   replace_all_uses_with)trivial_value	define_op
fake_values      r+   update_pylayer_outputr  
  s    4 --//I!!##-2244GG
J""9---## !! $  J
 **,,---''
33333r-   )r   r?   )NFN)FFN)NFFNFT)T)FNN)r   FNN)r  r   r   r   r=  	functoolsr   numpyr   r   paddle.base.frameworkr   paddle.base.libpaddler   paddle.base.wrapped_decoratorr   paddle.frameworkr   paddle.framework.io_utilsr   r	   paddle.staticr
   rJ   r   r   dist_attributer   r   r   op_proto_and_checker_makerr  kOpRoleAttrNamer  VarDescVarTypeREADERSTEP_SCOPESDENSE_TENSOR_ARRAYFEED_MINIBATCH
FETCH_LIST__no_shape_var_type__r  _g_gradient_clip_opsrk  r,   r2   r9   r<   rL   rR   rV   rZ   r^   re   ri   rn   rr   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r   r   r   r   r  r  r:  r;  rR  rj  r[  r\  rZ  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rU  r^  ra  rg  rk  rq  ru  rx  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r0  r)  rI  rS  dygraph_guardr(  rZ  r`  rX  rg  rk  ro  ri  r0   ry  r|  r  r  r  r  r  rA   r  r  r6   r-   r+   <module>r     s*	     				                 - - - - - - % % % % % %      " ! ! ! ! ! J J J J J J J J " " " " " " < < < < < < < < L L L L L L L L L L		(	/-==?? 	LL$L+L'L#  #.    	 	 	             	 	 	    *     ,# # #   "- - -2 2 2   0   . "  "  "F  &0 0 0f! ! !H* * *.  "Q Q Q Q&  4
 
 
*  .     N 7
 7
 7
 7
t)0 )0 )0Z <@6 6 6 6r' ' 'P P P   A A A  02 2 2 2jS S Sl     F  0B B BJ, , , ,^$ $ $N/ / /d- - -`      % % %2 2 2I I I# # #C C C" " "" " "
 
 
  *? ? ?&? ? ?0M M M`B B BJ! ! !:       >v  v  v r
 
 
$        	4 	4 	4
 
 
A A AH    H H HV  0  
5 
5 
5   	5 	5 	5
5 
5 
5Q Q Q(
 
 
2	
 	
 	
	
 	
 	
	C 	C 	C	E 	E 	E$ 	= = = =N 	$(W W W Wt     //  7 7 7  $  ,  5 5 5
% % %  C,C,.2C,@DC, C, C, C,L: : : :  BL BL BLJ4 4 4 DH5/ 5/ 5/ 5/t <: <: <: <:~K s    4$4 $4 $4 $4 $4r-   