
    Αi/                      % S SK Jr  S SKrS SKrS SKrS SKrS SKJr  S SKJ	r	  S SK
JrJrJrJr  S SKrS SKrS SKJr  S SKJrJrJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SK J!r!  S SK"J#r#J$r$J%r%J&r&J'r'J(r(  S SKJ)r)  S SK*J+r+J,r-  S SK.J/r0  S SK1J2r2  S SK3J4r4  S SK5J6r6  S SK7J8r8  S SK9J:r:J;r;J<r<J=r=J>r>J?r?  S SK@JArAJBrBJCrC  S SKDJErE  S SKFJGrGJHrH  S SKIJJrJ  SSKKJLrLJMrMJNrN  SSKOJPrPJQrQJRrRJSrSJTrTJUrUJVrV  SSKWJXrXJYrYJZrZJ[r[J\r\  SSK]J^r^J_r_  SSK`JaraJbrbJcrc  \(       at  S SKdJereJfrf  S S KgJhrh  S S!KJiri  S S"KjJkrkJlrlJmrmJnrn  S S#KoJprp  S S$K"Jqrq  S S%KJrrr  S S&KsJtrt  S S'KuJvrv  S S(KwJxrx  S S)KyJzrz  SS*K{J|r|J}r}J~r~JrJrJrJrJrJr  \S+   rS,\S-'    " S. S/\S0S19rS`S2 jrS3 r " S4 S5\EGR                  5      r   Sa             SbS6 jjr/ " S7 S8\5      rS9 rS: r ScS; jr " S< S=\5      r SaS> jrS? rS@ r            SdSA jr        SeSB jr   Sa           SfSC jjrSgSD jr " SE SF\J5      r " SG SH5      r " SI SJ\5      r " SK SL\5      r " SM SN\5      r " SO SP\5      r  Sh       SiSQ jjrSjSR jr " SS ST5      r " SU SV\-GRB                  5      r " SW SX5      r     Sk             SlSY jjrSmSZ jr " S[ S\5      r    Sn             SoS] jjrS^ rS_ rg)p    )annotationsN)OrderedDict)
MethodType)TYPE_CHECKINGAnyLiteral	TypedDict)_C_opsnnpir)amp_global_state)OptimizerState)PyLayer)unique_name)switch_to_static_graph)EagerParamBaseVariabledefault_main_programin_dygraph_modein_pir_modeuse_pir_api)fleet)Enginestrategyshard_tensor)ProcessMesh)$mark_as_sharding_propagation_skip_op)get_default_distributed_context)DistributedOperator)convert_to_dims_mappingfuse_param_funcget_dist_attr
split_meshsplit_param_functo_list)align	alignmentget_current_device_type)core)DistributedBatchSampler_InfiniteIterableSampler)	Optimizer   )_enable_auto_dp_fake_replicate_grad_to_partialin_auto_dp_mode)_cal_local_shape_dist_reshape_dtensor_from_local_NdMeshAlltoAll_only_reshard_mesh_shape_reshard_mesh_shape_specific_alltoall_dim)check_placements_equalget_shard_specplacemetns_to_dist_status
to_dim_mapto_placements)determinate_rng	rng_state)ShardingOptimizerStage1get_mesh_comm_listget_placement_with_sharding)CallableSequence)	TypeAlias)Tensor)	DTypeLikeNestedNumericSequence	PlaceLike
TensorLike)
GradScaler)Program)	Placement)DistributedInputSpec)
DataLoader)Metric)Layer)	
_AMPConfig_DPOptimizationConfig_FusedPassesConfig_GradientMergeConfig_MPOptimizationConfig_PipelineConfig_RecomputeConfig_ShardingConfig_SPOptimizationConfigtrainevalpredictrE   _Modec                  p    \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S
\S'   S\S'   S\S'   S\S'   S\S'   Srg)_Config   rY   shardingrT   fused_passesrU   gradient_mergerW   pipelinerR   amprX   	recomputerV   mp_optimizationrS   dp_optimizationrZ   sp_optimization N)__name__
__module____qualname____firstlineno____annotations____static_attributes__rl       d/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/auto_parallel/api.pyra   ra      s7    !!((,,!!##......rs   ra   F)totalc                *   [         R                  " 5       nU R                  5       (       aI  U R                  5       (       a/  UR	                  U R                  5       R                  5       5        U$ S n U$ UR	                  U R                  5       5        U$ N)r*   DenseTensoris_dist_is_initialized_share_data_with_local_value
get_tensor)tensor	lodtensors     rt   _to_lodtensorr      s      "I~~!!##&&v':':'<'G'G'IJ 	 I  	""6#4#4#67rs   c                L    U R                  U5      (       a  U [        U5      S  $ g rw   )
startswithlen)sprefixs     rt   _get_suffixr      s%    ||FVrs   c                  .    \ rS rSrSrS r\S 5       rSrg)DistAttr   aS  
DistAttr specifies how tensors are distributed or sliced on ProcessMesh.

Args:
    mesh(paddle.distributed.ProcessMesh): The `ProcessMesh` object describes the Cartesian topology of the used processes.
    sharding_specs(list[str|None]): The specification describing how to shard the Tensor.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=['x', 'y'])
        >>> dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y'])

        >>> print(dist_attr)

c                f   [        U[        R                  5      (       d  [        S5      e[        U[        5      (       d  [        S5      e[        S U 5       5      (       d   S5       eX l        / nU Hn  nUc  UR                  S5        M  XAR                  ;  a  [        SU SUR                   S35      eUR                  UR                  R                  U5      5        Mp     [        R                  R                  U 5        Xl        X0l        U R                  S	5        U R                  S
5        g )Nz?The mesh must be an instance of paddle.distributed.ProcessMesh.z/The sharding_specs must be an instance of list.c              3  X   #    U  H   n[        U[        5      =(       d    US L v   M"     g 7frw   )
isinstancestr).0dim_names     rt   	<genexpr>$DistAttr.__init__.<locals>.<genexpr>   s+      
* x%9T)99*s   (*z@The dimension name in sharding_specs must be an instance of str.zInvalid sharding dimension 'z%'. Available dimensions in mesh are: .process_meshdims_mapping)r   r*   r   
ValueErrorlistall_sharding_specsappend	dim_namesindexTensorDistAttr__init__r   r   mark_annotated)selfmeshsharding_specsr   r   s        rt   r   DistAttr.__init__   s/   $ 0 011Q  .$//NOO 
*
 
 
 	N N	N 

  .&H##B'>>1$6xj A==A^^<LAO  ##DNN$8$8$BC ' 	$$T* (N+N+rs   c                    U R                   $ )zL
Get sharding_specs of the dist_attr
Returns:
    list[str]: sharding_specs
)r   r   s    rt   r   DistAttr.sharding_specs   s     ###rs   )r   r   r   N)	rm   rn   ro   rp   __doc__r   propertyr   rr   rl   rs   rt   r   r      s!    ( ,D $ $rs   r   c                   Uc  [         R                  R                  5       n[         R                  R                  U5      nUc  [	        U SS5      n[         R                  R                  5       (       aP  [        U [        S5      [        R                  45      (       d   S5       eU R                  5       (       d   S5       eU nO{[        U [        5      (       a,  U R                  5       (       d  U R                  c   S5       eU nO:[        U [         R                  5      (       a  Uc  U nO[         R                  " XXES9n[         R                   " 5       (       a  [        U [        5      (       aj  S n[        R"                  " U4UUS	.UR$                  D6nUR&                  Ul        UR                  b#  UR                  n	UR)                  U" X5      5        U$ [         R                  " XaX$S
9n
UR&                  U
l        U
$ [         R                  R                  5       (       aD  [         R*                  R-                  XaU5      n
UR&                  U
l        UR.                  U
l        U
$ [1        XUR2                  5      n[5        XaU5      $ )a		  
Creates a distributed Tensor (i.e., Tensor with distributed attributes or DistTensor for short)
from the input data, which can be a scalar, tuple, list, numpy.ndarray, or paddle.Tensor.

If the ``data`` is already a Tensor, it will be transformed into a distributed Tensor.

Args:
    data(scalar|tuple|list|ndarray|Tensor): Initial data for the tensor.
        Can be a scalar, list, tuple, numpy.ndarray, paddle.Tensor.
    mesh(paddle.distributed.ProcessMesh): The `ProcessMesh` object describes the Cartesian topology of the used processes.
    placements(list[paddle.distributed.Placement]): the placements describe how to place the tensor on ProcessMesh, it can
        be Shard, Replicate and Partial.
    dtype(str|paddle.dtype|np.dtype, optional): The desired data type of returned tensor.
        It Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8',
        'complex64' , 'complex128'. Default: None. If None, the the dtype is inferred from ``data``
        except for python float number, in which case the dtype is inferred from ``get_default_type`` .
    place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be
        CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is
        string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs.
    stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. If
        ``stop_gradient`` is None, set the returned Tensor's ``stop_gradient`` identical as the
        ``data.stop_gradient`` when ``data`` has ``stop_gradient`` attribute and True otherwise.
        Default: None.

Returns:
    Tensor: A Tensor constructed from ``data`` with distributed attributes.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=['x', 'y'])

        >>> # dense tensor
        >>> a = paddle.to_tensor([[1,2,3],
        ...                       [5,6,7]])

        >>> # doctest: +REQUIRES(env:DISTRIBUTED)
        >>> # distributed tensor
        >>> d_tensor = dist.shard_tensor(a, mesh, [dist.Shard(0), dist.Shard(1)])

        >>> print(d_tensor)

Nstop_gradientTzinput tensor is not pir value.zAshard_tensor() input data only supported dense tensor type right.z:Get an uninitialized param with an unregistered init_func.)dtypeplacer   c                v   ^ ^ T R                    H  nUR                  5       (       d  M   S5       e   UU 4S jnU$ )NzLazy init not support partial reshard. Notice that: shard a param to partial won't save any memory, but will increase the communication cost!c                   > [         R                  " 5       TR                  R                  ;  a  g [	        [         R                  " 5       TR                  TR
                  S9n[        U5         T" X5        S S S 5        g ! , (       d  f       g = f)Nr   
placements)distget_rankr   process_idsr>   r   r?   )varblockrng_nameorigin_hookparams      rt   
_init_func8shard_tensor.<locals>.lazy_init_hook.<locals>._init_funcW  sc    }}e.@.@.L.LL.%*%7%7#(#3#3 H #8,#C/ -,,s   -	A??
B)r   
is_partial)r   r   	placementr   s   ``  rt   lazy_init_hook$shard_tensor.<locals>.lazy_init_hookO  sB    !&!1!1I(3355 [5 "20 "!rs   r   )r   r   r   )paddle	framework_current_expected_place_get_paddle_placegetattrr   r   typer   Valueis_dense_tensor_typer   rz   r   rF   	to_tensorin_dynamic_modefrom_tensor__dict__r   set_init_funcr
   r   persistabler:   ndimshard_tensor_static)datar   r   r   r   r   r~   r   
dist_paramorigin_init_funcdist_tensorr   s               rt   r   r      sJ   l }  88:..u5E ot<##%%$dSYY 788 	
,	
8 ((** 	
O	
* dN++D4H4H4J4J??. L. Ffmm,,F %%F dN++"0 (33!% //	J (.';';J$  ,#)#4#4 ((":@  --jK
 )/(<(<K%				%	%	'	'mm00zJ$*$8$8!"("4"4 (&++F"6@@rs   c                  :    \ rS rSr\ SS j5       r\S 5       rSrg)_moe_global_mesh_tensori  Nc                   [         R                  " 5       (       a  X   nUR                  5       (       a  UR                  n	UR	                  5       n
OUn
S n	U R                  [        R                  " U5      UR                  [        R                  " U5      [        R                  " U5      5        [         R                  R                  5       n[         R                  R                  U5      n[         R                  " U
UUUUS9nUR                  Ul        U$ U R                  [        R                  " U5      [        R                  " U5      [        R                  " U5      [        R                  " U5      5        [         R                  R                  UUUUUU5      nUS   R                  Ul        US   R                   Ul        U$ )Ndimsr   r   r   r   )r   r   ry   r   r|   save_for_backwardcopydeepcopyshaper   r   r   rF   r   r
   moe_global_mesh_tensorr   )ctxlocal_tensor_listlocal_mesh_listlocal_placementsr   r   global_dimsidxlocal_tensor
local_mesh	local_valr   global_tensorr   s                 rt   forward_moe_global_mesh_tensor.forward  s}    !!##,1L##%%)66
(557	(	!
!!d#""o../	 $$<<>E$$66u=E"MM !%M +7*D*DM'  !!d#j)o../	 !-->>! K ):!(<(J(JK%&7&:&F&FK#rs   c                J   [         R                  " 5       (       a  U R                  5       u  p#pEUc  UR                  5       $ [         R                  R                  5       n[         R                  R                  U5      n/ n[        U5       H\  u  pUR                  [         R                  " UR                  5       UU	UUS95        US   R                  5       R                  S5        M^     U$ U R                  5       u  nn
nn[         R                  R                  UUUUU
5      $ )Nr   r   T)r   r   saved_tensorr|   r   r   r   	enumerater   rF   r}   _unsafe_set_skip_check_meshr
   moe_sub_mesh_tensors)r   grad_tensorglobal_mesh
local_dimsr   r   r   outir   global_placementss              rt   backward _moe_global_mesh_tensor.backward  s   !!##  " GK_ &"//11((@@B((::5A%.%?MAJJ'446!+)3'7"' G&&(DDTJ &@ 
   "! ==55 ! rs   rl   rw   rm   rn   ro   rp   staticmethodr   r   rr   rl   rs   rt   r   r     s.     8 8t $ $rs   r   c                    U b  Ub  Uc  [        S5      e[        X5      n[        U5      nU[        U5      :  a  [        R
                  " 5       XB'   X44$ )NzMthe args global_mesh, global_placements and local_mesh_dim should all be set.)r   r$   r   r   r   	Replicate)r   r   sub_mesh_dimsub_mesh_listr   s        rt   $_get_sub_meshes_and_local_placementsr     sd     l26G6O[
 	
 {9M-.c*++)-)9&**rs   c                    [        U 5      n[        U5       HM  u  pEUR                  5       (       d  M  UR                  5       nX6   S:X  a  M6  X6   nXqR                  U   -  X6'   MO     U$ )Nr   )r   r   is_shardget_dimr   )local_shaper   r   global_shaper   r   	shard_dimlocal_dim_sizes           rt   _cal_global_shaper    sm     $L#J/!))+I&",)4N&4zz#&FL# 0 rs   c           	        [         R                  " U5      n[        XU5      u  pE[        R                  " UR
                  5      R                  UR                  5      n[        R                  " U[        R                  " 5       :H  5      nUS   R                  S:X  a  SnOXs   S   nX   n	[        R                  " 5       (       Ga)  US   R                  S:X  a  [        U S   R                  US   U5      n
OX   R                  5       R                  n
[!        XU5      n/ n[#        U 5       H  u  pUR%                  5       R'                  S5        [)        UR*                  U5      (       a  UR,                  XM   :w  aB  UR/                  [1        XU   U5      5        US   R%                  5       R'                  S5        M  UR/                  U5        M     [2        R5                  UUUUUUU5      $ [        R6                  R9                  5       (       ad  [!        U	R:                  X5      n[        R<                  R?                  U UUUUU5      nU S   R@                  Ul         U S   RB                  Ul!        U$ [E        S5      e)Nr   Tr   zEdtensor_from_local_list() are only supported in dynamic and pir mode.)#r   r   r   nparrayr   reshaper   wherer   r   sizer   r   r2   r|   r  r   r}   r   r9   r   r   r   reshardr   applyr   r   _local_shaper
   r   r   r   NotImplementedError)r   r   r   local_mesh_dimr   r   r   local_coordlocal_tensor_idxr   local_tensor_shaper   resharded_local_tensor_listr   r~   r   s                   rt   r   r     s\    z*J(L.)%O ((4++,44TZZ@K((;$--/9:K 1~a&6q9$6Lq>!#!1!!$**OA,>@P"
 "3@@BHH  ((:*M&(#"#45IA;;DA*6+<+<>NOO&&/*<<+22FA$68HI ,*,::4@+226: 6 ',,'
 	
 
			%	%	'	''%%t
 mm::
 %6a$8$F$F!"3A"6"B"B!S
 	
rs   c                  B    \ rS rSr\     SS j5       r\S 5       rSrg)_moe_sub_mesh_tensorsiU  Nc           	        U R                  [        R                  " U5      UU[        R                  " U5      UUR                  5        [        R
                  " 5       (       Ga  Uc  Uc  UR                  5       $ Ub  Uc  [        S5      eUR                  nXQR                  :w  a  [        S5      e[        XaR                  5      (       d   SU SUR                   S35       e[        UR                  XV5      n[        U5       HF  u  pU
R                  5       (       d  M  U
R                  5       nX   nXS   R                  U	   -  X'   MH     [        R                  R!                  5       n[        R                  R#                  U5      n/ n[        U5       Hm  u  nn[        R$                  " UR                  5       UUUUS9nUR'                  5       R)                  S5        UR*                  Ul        UR-                  U5        Mo     U$ [        R                  R/                  5       (       aP  [        R0                  R3                  UUUUU5      nU H%  nUR*                  Ul        UR4                  Ul        M'     U$ g )	NzAthe args global_mesh and global_placements should be set togetherzAthe global_mesh should be the same as dist_tensor's process_mesh.zthe global_placements (z,) is not equal to dist_tensor's placements (z).r   r   T)r   r   r   r   r   r   r|   r   r   r9   r   r2   r   r   r   r   r   r   rF   r}   r   r   r   r   r
   r   r   )r   r   r   r   r  r   r   ori_meshr   r   r   r  r  r   r   r   r   r   local_tensorss                      rt   r   _moe_sub_mesh_tensors.forwardV  s    	MM/*MM+&	
 !!##"'8'@"//11&*;*C$[  '33":"::$[  .%'='=   ..?-@@lmx  nD  nD  mE  EG  H 
 /%%{ '00@&ANC ))++$-$5$5$7	)4)?*Q-?-E-Ec-JJ $.	 'B ((@@B((::5A$&!%.%?MAz#)==#002(%/#3#$L !++-II$O1<1J1JL.%,,\: &@ )())++"MM>> !M !.-8-F-F*+6+B+B( !. !  ,rs   c                t   U R                  5       u  nnnnnn[        R                  R                  5       n[        R                  R	                  U5      nUn	[
        R                  " U	R                  5      R                  U	R                  5      n
[
        R                  " U
[        R                  " 5       :H  5      nUS   R                  S:X  a  SnOX   S   nX   n[        R                  " 5       (       ae  [        R                  R                  5       n[        R                  R	                  U5      n[        R                  " UR!                  5       UU	UUS9nU$ [        R                  R#                  5       (       a:  [%        UR&                  X5      n[        R(                  R+                  UUUUUU5      $ g )Nr   r   )r   r   r   r   r   r  r  r   r  r   r	  r   r   r
  r   rF   r|   r   r  r  r
   r   )r   r   r   r   r  r   r   r  r   r   r   r  r  
local_gradr   r   s                   rt   r   _moe_sub_mesh_tensors.backward  s    	
  88:  2259hht//088Dhh{dmmo=>q>!# *:1= 2
!!##$$<<>E$$66u=E"MM'')!!,M ! ))+++''K ==77 !  ,rs   rl   NNNNNr   rl   rs   rt   r  r  U  s<     H! H!T + +rs   r  c                   [         R                  " U5      n[        XU5      u  pE[        R                  R                  5       (       a  [        R                  U UUUUU5      $ [        R                  R                  5       (       aP  [        R                  R                  U UUUU5      nU H%  nU R                  Ul        U R                  Ul        M'     U$ [        S5      e)zO
Get the local part of the ``dist_tensor`` on the specific ``local_mesh_dim``.
z7moe_sub_mesh_tensors is only supported in dynamic mode.)r   r   r   r   r   r   r  r  r   r
   r   r   r   r  )r   r   r  r   r   r   r  r   s           rt   r   r     s     &78(L)%O ''))$**
 	
 
			%	%	'	'::
 *L)4)B)BL&'2'>'>L$ * !E
 	
rs   c                   [         R                  " 5       (       a]  U R                  5       SL a   U R                  5       (       a  [	        S5      e[         R
                  R                  R                  XU5      $ [         R                  R                  5       (       a   [         R                  R                  XU5      $ [        S5      e)NTz#The input should be a local tensor.z?dtensor_from_local() are only supported in dynamic or pir mode.)r   r   ry   rz   r   baser*   dtensor_from_localr   r   r
   RuntimeError)r   r   r   s      rt   r!  r!    s    !T)l.J.J.L.LBCC{{22

 	

 
			%	%	'	'}}//JOOM
 	
rs   c                b   [         R                  " 5       (       aH  U R                  5       SL a  [        S5      e[         R                  R
                  R                  XU5      $ [         R                  R                  5       (       a   [         R                  R                  XU5      $ [        S5      e)NF)The input should be a distributed tensor.z=dtensor_to_local() are only supported in dynamic or pir mode.)r   r   ry   r   r   r*   dtensor_to_localr   r   r
   r"  )r   r   r   s      rt   r%  r%    s     E)HII{{00JOO				%	%	'	'}}--kLLK
 	
rs   c                *    U " U0 UD6n[        XQU5      $ )ab  
Construct a Distributed Tensor from a function of arguments.

Args:
    fn (callable): A callable function that creates and returns a tensor, such as paddle.ones, paddle.zeros, etc.
    mesh(paddle.distributed.ProcessMesh): The `ProcessMesh` object describes the Cartesian topology of the used processes.
    placements(list[paddle.distributed.Placement]): the placements describe how to place the tensor on ProcessMesh, it can
        be Shard, Replicate and Partial.
    *args (tuple): A tuple of arguments to be passed to the ``fn`` function.
    **kwargs (dict): A dict of arguments to be passed to the ``fn`` function.

Returns:
    Tensor: A Tensor constructed from ``fn`` with distributed attributes.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist
        >>> # Create a distributed attribute
        >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
        >>> # Call the function dtensor_from_fn with dist_attr parameter
        >>> d_tensor = dist.dtensor_from_fn(paddle.ones, mesh, [dist.Replicate()], shape=[1])
        >>> print(d_tensor)

r   )fnr   r   argskwargsr~   s         rt   dtensor_from_fnr*    s!    B   Fj11rs   c           	     F   [        XU5      (       a  [        X R                  X5      $ [        R                  R                  5       (       GaU  [        X R                  SS9u  p4n[        R                  " 5       nX6l
        Xl        UR                  S5        UR                  S5        [        U5      S:  a*  UR                  5        H  u  pxUR                  Xx5        M     [        U5      S:  a=  / n	UR                  5        H  u  pzU	R!                  U5        M     UR#                  U	5        [%        XU5      nUb  [&        R(                  " XX+5      $ [+        XU5      (       a  [        X R                  X5      $ [        R,                  R                  R/                  X5      $ [1        5       (       a   [        R2                  R/                  XU5      $ [5        U [6        5      (       d   SU  S35       e[9        XU R                  5      n[;        5       n[=        5       nUR?                  5       RA                  [B        RD                  " SRG                  S	S
/5      5      U RH                  U R                  U RJ                  U RL                  U RN                  S9n[Q        X5      nUR?                  5       RS                  SSU /0SU/0S9n[U        U5      nUURV                  l        URV                  R                  S5        SURV                  l,        URV                  R[                  U R\                  5      nUUl/        UR                  S5        URV                  Ra                  UR\                  5      nUUl/        UR                  S5        URc                  U5        [e        U5        U$ )a  
Reshard a distributed ``paddle.Tensor`` with given distributed attributes.

Args:
    dist_tensor(Tensor): the distributed tensor to be resharded.
    mesh(paddle.distributed.ProcessMesh): The `ProcessMesh` object describes the Cartesian topology of the used processes.
    placements(list[paddle.distributed.Placement]): the placements describe how to place the tensor on ProcessMesh, it can
        be Shard, Replicate and Partial.

Returns:
    Tensor: A Distributed Tensor resharded with distributed attributes.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"])

        >>> # dense tensor
        >>> a = paddle.ones([10, 20])

        >>> # doctest: +REQUIRES(env:DISTRIBUTED)
        >>> # distributed tensor
        >>> d_tensor = dist.shard_tensor(a, mesh, [dist.Partial()])

        >>> out_d_tensor = dist.reshard(d_tensor, mesh, [dist.Replicate()])

        >>> print(out_d_tensor)

T)return_split_factorr   r   r   z@in dy2static mode, reshard's input should be Variable, but got []r   reshard_apitmp)namer   r   r   r   r   assignXOut)r   inputsoutputs)3r6   r3   r   r   r   r   r;   r   r*   r   multi_dims_mappingr   r   r   items_set_split_factorr   _set_partial_dimsr8   r5   r  r7   r   r  r   r
   r   r   r:   r   r   current_block
create_varr   generate_with_ignorable_keyjoinr   r   r   r   r!   	append_opr    	dist_attrchunk_idget_input_dist_attrr0  r   get_output_dist_attradd_dist_op_for_programr   )r   r   r   r   partial_statussplit_factorr?  dimsfr   _alltoall_dimr   main_programdefault_dist_ctxout_vartarget_dims_mappingtrans_opdist_opinput_dist_attroutput_dist_attrs                        rt   r  r  >  sJ   F  :>>[*;*;TNN'')) 6O((d6
2l '')	'3$!%  0  0|q '--/++C4 0~"D(..0C  1''--kL#"((:  {*== ..  {{''??	}}$$[
CC+x00 	
N{m[\]	
0 (+:J:JK+-:< ,,.9988-/0 ####!!#//%33 : 	
 6nK--/99+'WI& : 

 &h/)-&((8%&"!++??
 (;$&&~6",,AA',,O(;%''7009,X6 rs   c                  ^^^ Tc  [        S5      e[        T[        5      (       d  [        S5      e      S	S jn[        R                  " 5       (       a  Uc"  U R                  SS9 H  u  pgU" UT5        M     O*U R                  SS9 H  u  pgU" XgT5        U" UT5        M     Tb  U R                  UU4S j5        Tb  U R                  UU4S j5        U $ [        S5      e)
aP  
Converts all layer's parameters to DistTensor parameters according to
the `shard_fn` specified. It could also control the conversion of input
or output of the layer by specifying the `input_fn` and `output_fn`.
(i.e. convert the input to `paddle.Tensor` with distributed attributes,
convert output back to `paddle.Tensor` without distributed attributes.)

The `shard_fn` should have the following signature:

    def shard_fn(layer_name, layer, process_mesh) -> None

The `input_fn` should have the following signature:

    def input_fn(inputs, process_mesh) -> list(paddle.Tensor)

In general, the type of `input_fn` return value is paddle.Tensor with distributed attributes.

The `output_fn` should have the following signature:

    def output_fn(outputs, process_mesh) -> list(paddle.Tensor)

In general, the type of `output_fn` return value is paddle.Tensor with distributed attributes.

Args:
    layer (paddle.nn.Layer): The Layer object to be shard.
    process_mesh (paddle.distributed.ProcessMesh): The `ProcessMesh` information
        to be place the input `layer`.
    shard_fn (Callable): The function to shard layer parameters across
        the `process_mesh`. If not specified, by default we replicate
        all parameters of the layer across the `process_mesh`.
    input_fn (Callable): Specify how the input of the layer is sharded.
        The `input_fn` will be registered for the Layer as a `forward pre-hook`.
        By default we do not shard the input.
    output_fn (Callable): Specify how the output of the layer is sharded or
        convert it back to `paddle.Tensor` without distributed attributes.
        The `output_fn` will be registered for the Layer as `forward post-hook`.
        By default we do not shard or convert the output.
Returns:
    Layer: A layer that contains parameters/buffers
        that are all `paddle.Tensor` with distributed attributes.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"])

        >>> class MLP(paddle.nn.Layer):
        ...     def __init__(self):
        ...         super().__init__()
        ...         self.fc1 = paddle.nn.Linear(8, 8)
        ...         self.fc2 = paddle.nn.Linear(8, 8)
        ...
        ...     def forward(self, input):
        ...         return self.fc2(self.fc1(input))

        >>> def shard_fn(layer_name, layer, process_mesh):
        ...     if layer_name == 'fc1':
        ...         layer.weight = dist.shard_tensor(layer.weight, process_mesh, [dist.Shard(0)])

        >>> layer = MLP()
        >>> layer = dist.shard_layer(layer, mesh, shard_fn)
        >>> print(layer)

        >>> # This case need to be executed in multi-card environment
        >>> # export CUDA_VISIBLE_DEVICES=0,1
        >>> # python -m paddle.distributed.launch {test_case}.py
z,The argument `process_mesh` cannot be empty.z;The argument `process_mesh` is not `dist.ProcessMesh` type.c           	        U R                   R                  5        H  u  p#Ub}  UR                  5       (       dh  [        [	        UR
                  5      5       Vs/ s H!  n[        R                  R                  5       PM#     nnU R                  U[        X1U5      5        M  M     U R                  R                  5        H  u  p&Ub}  UR                  5       (       dh  [        [	        UR
                  5      5       Vs/ s H!  n[        R                  R                  5       PM#     nnU R                  U[        XaU5      5        M  M     g s  snf s  snf rw   )_parametersr7  ry   ranger   r   r   distributedr   add_parameterr   _buffersregister_buffer)layerr   keyr   rH  r   buffers          rt   "replicate_layer_params_and_buffers7shard_layer.<locals>.replicate_layer_params_and_buffers  s(     ++113JC  #3u{{#344 &&0024   ## j9  4 !>>//1KC!&..*:*: #3v||#455 &&0025   %% z:  2s   (E;(ET)include_selfc                   > T" UT5      $ rw   rl   )rH  r4  input_fnr   s     rt   <lambda>shard_layer.<locals>.<lambda>?  s    (6<"@rs   c                   > T" UT5      $ rw   rl   )rH  r4  r5  	output_fnr   s      rt   rb  rc  D  s    9Wl+Krs   zB`paddle.distributed.shard_layer` only supports dynamic graph mode.)rZ  znn.Layerr   r   returnNone)	r   r   r   r   r   named_sublayersregister_forward_pre_hookregister_forward_post_hookr  )rZ  r   shard_fnra  re  r]  r0  	sublayerss    ` ``   rt   shard_layerrm    s   \ GHH lK00I
 	
*	<  $)#8#8d#8#K29lK $L $)#8#8d#8#K,7 39lK $L ++@  ,,K  "P
 	
rs   c                `   [         R                  " 5       (       aD  [        U [         R                  5      =(       a#    [	        U S5      =(       a    U R                  5       $ [        U [         R                  R                  R                  R                  5      =(       a    U R                  5       SL$ )z
Check if an input is a dist_tensor in both dynamic and static modes.
Args:
    tensor: The input to check
Returns:
    bool: True if the input is a dist_tensor, False otherwise
ry   N)r   r   r   rF   hasattrry   r   	libpaddler   r   r?  )r~   s    rt   is_dist_tensorrq  O  s     vv}}- !	*! 	
 vv{{4488>>? /  "$.	
rs   c                     ^  \ rS rSrSS jrS rS rS rS rS r	S r
S	 rS
 rS rS rS rS rS rS rS r SU 4S jjrS rS rSrU =r$ )_ShardOptimizerid  c                @   Uc   S5       e[        U[        R                  R                  [        R                  R                  45      (       d   S5       e[        R
                  R                  R                  UR                  R                  5      Ul
        XR                  S'   SU l        [        US5      (       aG  UR                  b:  [        UR                  [        R                  R                   5      (       a  SU l        X l        S U l        S U l        X0l        U R"                  c  [+        S5      U l        [        U R"                  [*        [,        [.        [0        45      (       d   S5       e[        U R"                  [,        [.        [0        45      (       a5  U R3                  5         U R"                  R5                  U R$                  5        [        U R"                  [.        5      (       aG  [7        5       (       d8  U R8                  R:                   H  nU R"                  R=                  U5        M      [        U R"                  [0        5      (       a  U R8                  R:                   H  nU R"                  R?                  U5        M      U R8                  R:                   H  nU R"                  R=                  U5        M      S	[@        RB                  S
'   / U l"        / U l#        / U l$        S U l%        S U l&        SU l'        [Q        5       U l)        SU l*        SU l+        g )Nz)The argument `optimizer` cannot be empty.zR`paddle.distributed.ShardOptimizer` only supports AdamW and SGD optimizer for now.
_inner_optF
_grad_clipTr   zgshard_fn must be an instance of one of: _ShardingStage0, ShardingStage1, ShardingStage2, ShardingStage31skip_sharding3_output_reshard),r   r   	optimizerAdamWSGDr   layer_helperLayerHelper	__class__rm   helperr   _shard_clipro  rv  r   ClipGradByGlobalNorm	_shard_fn_sharding_axis_sharding_degreegradient_accumulation_steps_ShardingStage0ShardingStage1ShardingStage2ShardingStage3'_set_and_check_sharding_prop_from_param_set_sharding_axisr1   ru  _parameter_list_register_hook_for_param_grad_shard_parameterosenvironfuse_param_viewparam_storagegrad_storage_sharding_group	_mp_groupdo_tensor_fusion_onceStrategy	_strategyenable_tensor_fusionenable_sharding_overlap)r   ry  rk  r  r   s        rt   r   _ShardOptimizer.__init__e  s   $ 	
7	
$ ((..0@0@0D0DE
 
 	
 a	
 
 ";;33??((
	 '0l# I|,,$$09//1O1OPP#D!" $+F(>>!,Q/DNNNnnnM
 
 	
 v		
 
 NN^^^L
 
 88:NN--d.A.AB dnnn55o>O>O88<<UC 9 dnnn5588//6 988<<UC 9:=BJJ67!#%)"!$)!',$rs   c                   [         R                  R                  5       nU(       a+  UR                  U R                  R
                  5      U l        OeU R                  R                  (       a?  U R                  R                  R                  U R                  R
                  5      U l        O[        S5      eSU l	        U R                  R                  nU GH  nUR                  5       (       d  M  UR                  nUR                  n[        XPR                     [         R"                  5      (       d;  [%        U5       H,  u  pg[        U[         R"                  5      (       d  M&  X`l	        M.     [        XPR                     [         R"                  5      (       d   S5       eU(       a2  ['        UR(                  5      ['        UR(                  5      :  a  GM  GM  U R                  R                  (       aF  ['        UR(                  5      ['        U R                  R                  R(                  5      :  a  GMb  GMe  UR+                  U R                  5      U R                  :X  a  GM   S5       e   g )NzIThe global mesh or shard_fn mesh should be set for the sharding strategy.r   z2The placement on sharding_axis should be Replicatez>The sharding degree of all parameters must be equal currently.)r   autoget_meshget_dim_sizer  _sharding_mesh_dimr  _meshr   r  ru  r  ry   r   r   r   r   r   r   setr   dim_size)r   r   
param_listr   r   r   rF  r   s           rt   r  7_ShardOptimizer._set_and_check_sharding_prop_from_param  s   jj))+$/$<$<11%D! ^^!!$(NN$8$8$E$E11%D! [ 
   __44
E==??%%D))Jj)<)<=t~~NN&/
&;NC!)T^^<<.1+ '<
 ../  DCD  t''(3{/F/F+GG H%%t''(3NN((44,   MM$"5"56$:O:OO UO;  rs   c                   UR                   S   S:X  a  g UR                  nUR                  U R                  R                  R	                  5       ;   a  U R                  R                  UR                     nUR                  n[        U R                  [        [        45      (       ac  U R                  R                  X5      U R                  R                  UR                  '   X R                  R                  UR                     l        U R                  R                  R	                  5        GH  nU R                  R                  U   U   nUR                  5       (       a!  [        U[        R                  5      (       d  MV  [        R                  " 5       (       a  UR                  n[        U R                  [        [        [         45      (       a-  U R                  XAU5      U R                  R                  U   U'   OUR                  5       (       a  SU;  a  UR"                  nOK[%        ['        UR(                  R                   5      5       Vs/ s H  n[*        R,                  " 5       PM     nn[/        UUR(                  US9U R                  R                  U   U'   [        R                  " 5       (       d  GM  WU R                  R                  U   U   l        GM     g s  snf )Nr   r.   betar   r   )r   r0  ru  _master_weightskeysr   r  r  r  shard_master_weight_accumulatorsry   r   r   r   r   r  r   rU  r   r   r   r   r   )	r   r   target_namemaster_weightr[  accumulatororigin_accumulator_namer   rH  s	            rt   _shard_accumulator"_ShardOptimizer._shard_accumulator  sH    ;;q>Qjj::88==?? OO;;EJJGM',,K$..>>*JKKNN66uL //

; DO//

;@ ??00557C//77<[IK""$$ZSYY-O-O%%''*5*:*:' P  NN3{; --c2;? ==??S( &+%5%5

 &+3u/A/A/G/G+H%I&%I !NN,%I # &
 %'!&!3!3'1 OO11#6{C %%'' 1 --c2I 8,&s   K'c                   UR                  5       (       a  [        U R                  [        [        45      (       a  [        U[
        R                  5      (       d|  UR                  n[        R                  " 5       X R                  '   [        R                  " XR                  U5      nUR                  5       R                  UR                  5       5        g g g g rw   )ry   r   r  r  r  r   r   r   r   r   r  r  r   r}   r{   )r   r   new_placement	out_params       rt   _reset_placements!_ShardOptimizer._reset_placements  s    ==??zNN^^< 
  
 eSYY// % 0 059^^5E112 LL--}	   "33I4H4H4JK 0	 
?rs   c                    [        U[        5      (       a  UR                  S5      nU H0  nU R                  R	                  X/5        U R                  U5        M2     g )Nparams)r   dictgetru  _create_accumulatorsr  )r   r   
parametersps       rt   r  $_ShardOptimizer._create_accumulators%  sK    j$''#1J AOO00<##A& rs   c                   U R                   R                  X5        U R                  (       Ga*  U R                   H  nUR	                  5         SUl        M     U R                  (       d  [        [        U R                  5      5       H  nU R                  U   R                  5       U R                  R                  -  nU[        U R                  R                  S5      -  nXe-   n[         R"                  R%                  U R                  U   Xg5      nU R                  R&                  R)                  XR                  U   5      R+                  5         M     g g [-        U[.        5      (       d  US   nU H[  u  p[1        5       R2                  (       a,  [-        U R4                  [6        [8        45      (       a  S U	l        U R=                  U	5        M]     g )Nr   r  )ru  _finish_updater  r  zero_check_inr  rU  r   r  r  _numelr  nranksmaxrankr   r
   
view_sliceprocess_group
all_gatherwaitr   r   r   use_master_gradr  r  r  	main_gradr  )r   r   parameters_and_gradsr  r   
shard_sizebeginendslice_bufferr  rH  s              rt   r  _ShardOptimizer._finish_update.  s   &&uC$$$ $ 1 1""$()% !2 //s4#7#789A**1-446//667  'T-A-A-F-F)JJE,C#)==#;#;**1-u$L ((66AA$&8&8&;df : 0 2D99';H'E$ -#%55*NN^^$D; ; #'AK&&q) -rs   c           	         / nU H'  u  p4UR                  X0R                  SX45      45        M)     [        R                  " X5      $ Ngrad)r   r  r-   apply_gradients)r   params_gradsnew_params_gradsr   r  s        rt   r  _ShardOptimizer.apply_gradientsO  sF    'KE##vu;< ( ((@@rs   c                P   U R                   R                  5       n/ n[        U R                   R                  S   [        5      (       a%  U R                   R                   H
  nX#S   -  nM     OU R                   R                  nU HK  nUR
                  (       a  M  [        US5      (       a  UR                  b  Us  $ M:  UR                  c  MI  Us  $    [        S UR                  5        5       5      (       a  U$ [        U R                   R                  S   [        5      (       d  U R                   R                   H  nUR
                  (       a  M  [        US5      (       aO  UR                  b  [        SUR                   35      e[        R                  " U[        R                  S9Ul        Mv  UR                  b  [        SUR                   35      e[        R                  " XDR                  S9Ul        M     OU R                   R                    H  nUS    H  nUR
                  (       a  M  [        US5      (       aO  UR                  b  [        SUR                   35      e[        R                  " U[        R                  S9Ul        Mv  UR                  b  [        SUR                   35      e[        R                  " XDR                  S9Ul        M     M     U R#                  5         U R                   R%                  SS9  U R                   R                  5       $ )	z
Create and shard the optimizer states e.g., accumulators and master_weights before load_state_dict.
If training has already started or the optimizer states are already created and sharded, do nothing.
r   r  r  c              3  V   #    U  H  u  pUS ;  d  M  UR                  5       v   M!     g7f))master_weightsLR_SchedulerN)ry   )r   kvs      rt   r   -_ShardOptimizer.state_dict.<locals>.<genexpr>q  s,      
*:: AIIKK*s   ))z gradient should be None, but is )r   F)set_to_zero)ru  
state_dictr   r  r  r   ro  r  r  anyr7  r   r   
zeros_likefloat32r   _param_groupsstep
clear_grad)r   r  r  param_groupr   s        rt   r  _ShardOptimizer.state_dictX  s   
 __//1

doo55a8$??#>>(33
  ? 88JE""uk**??.%% / ::)%%    
"((*
 
 

  $//99!<dCC88&&5+..2(>u>OP  '-&7&7V^^'EO zz-(>uzzlK  "(!2!25!LEJ! 9$  $<<(2E** uk22 ??6","B5??BS T#  +1*;*;!+ !::1","B5::, O#  &,%6%6uKK%P
! 3  =$ 			""u"5))++rs   c           	     P   [        5       (       Ga  US   R                  5       (       Ga  US   R                  nUS   R                  nUS   nUR                  nSS jnSn[        R
                  R                  5       n	SU	R                  ;   aG  U	R                  S5      n
[        U
5       H'  nUR                  U" U5      R                  :X  d  M%  Un  O   Sn[        S U 5       5      (       aB  UR                  U" U5      R                  :X  a"  UR                  U" U5      R                  :w  a  SnU(       a  [        R                  R                  R                  UUR                   U" U5      [        R"                  " [        R$                  R&                  5      [        R"                  " [        R$                  R&                  5      /5      nUR                  n[        [)        U5      S-
  SS5       H^  n[+        X;   [        R"                  5      (       d  M&  [        R,                  " 5       X;'   [        R.                  " XUR                  U5      nM`     U R0                  S:  a  [3        5       (       a  XPR0                  -  nU(       aI  [        R                  R                  R                  XUR                   U[        R,                  " 5       /5      nUS   U4nU R4                  R7                  X5        U R8                  (       Ga"  [;        US   S	5      (       Ga  US   R<                  nUS   R<                  S:X  a  U R>                  U   RA                  5       U RB                  RD                  -  nU[G        U RB                  RH                  S5      -  nX-   n[J        RL                  RO                  U R>                  U   UU5      n[J        RP                  RS                  U R>                  U   UU RB                  SS
9nSU R>                  U   l*        g SU R>                  U   l*        g g g )Nr.   r   c                    [         R                  R                  5       nSUR                  ;   a  UR	                  SU 5      nU$ )u   
获得pp_idx的mesh
pp)r   r  r  r   get_mesh_with_dim)pp_idxr   s     rt   r  5_ShardOptimizer._append_optimize_op.<locals>.get_mesh  s9     zz**,4>>)11$?Drs   r  Fc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7frw   )r   r   Partial)r   r   s     rt   r   6_ShardOptimizer._append_optimize_op.<locals>.<genexpr>  s!      EO	
9dll33Z   ')Tr   last_idxgroupsync_opr   )+in_auto_parallel_align_modery   r   r   r   r  r  r   r  rU  r   r  r   auto_parallel	moe_utilsr3   r   r  
ReduceTypekRedSumr   r   r   r  r  r   ru  _append_optimize_opr  ro  r  r  r  r  r  r  r  r   r
   r  rV  r  is_sync)r   r   param_and_gradr   meshsr  	grad_meshr  ippr   	pp_degreer   change_meshr   r  r  r  r  tasks                      rt   r   #_ShardOptimizer._append_optimize_op  s{   '))q!))++'*55J"1%22E!!$D))I C**--/K{,,,'44T:	y)A((HQK,C,CC *
  K EO   ""hsm&?&??__(?(??"))33AAJJSMT__%<%<=T__%<%<=	 "__
3z?Q.B7jmT\\::$(NN$4JM<<.?.?LD 8 //!38I8I888))33AA**i$..2B1C -Q/6N++EB'''~a(*55$Q'00!!$--2**3/668//667  'T-A-A-F-F)JJE,C#)==#;#;**3/$L "--88**3/$"22 %	 9 D 7;D&&s+36;D&&s+3) 6 (rs   c                l    S[         R                  S'   SU l        U R                  R	                  5         g )Nrw  FLAGS_enable_tensor_fusionT)r  r  r  r  _enable_tensor_fusionr   s    rt   r  %_ShardOptimizer._enable_tensor_fusion  s)    36

/0$(!,,.rs   c                   [        US5      (       a"  UR                  R                  SS5      (       a  g SU l        [	        U[
        R                  R                  5      (       d  [        S[        U5       35      eXl
        g )Nconfig	to_staticFTz+`layers` must be `paddle.nn.Layer` but got )ro  r  r  r  r   r   r   rQ   r"  r   _layers)r   layerss     rt   _enable_sharding_overlap(_ShardOptimizer._enable_sharding_overlap   sg    68$$):):;)N)N'+$&&))//22=d6l^L  rs   c                   UR                  5       U R                  R                  -  nU[        U R                  R                  S5      -  nX2-   n[
        R                  R                  XU5      n[
        R                  R                  UU[
        R                  R                  R                  U R                  SS9R                  5         g )Nr   Fopr  r  )r  r  r  r  r  r   r
   r  rV  reduce_scatterReduceOpSUMr  )r   r  r  r  r  reduce_scattereds         rt   _reduce_scatter_gradients)_ShardOptimizer._reduce_scatter_gradients
  s    !((*d.B.B.I.II
S!5!5!:!:A>> !==33LM))!!**..&& 	* 	
 $&rs   c           	        U R                   (       d  [        S5      e0 nU R                   R                  5        H&  nUR                  SS9 H  nX![	        U5      '   M     M(     [        U R                  5      [        U R                  5      :w  a8  [        S[        U R                  5       S[        U R                  5       S35      e[        [        U R                  5      5       GH  nU R                  U R                  U   5        S nS n[        U R                  U   5      U R                  -  nS	[        R                  R                  5       R                  ;   a0  U[        R                  R                  5       R                  S	5      -  nU R                  U   R!                  5        H8  u  pU	S
   R#                  U" UU R                  U   U R$                  5      5        M:     U[        U R                  5      S-
  :  d  GM!  ['        [)        U R                  U   R+                  5       5      5      S
   n
UR-                  [	        U
5      5      nUR/                  U" U R0                  US-      U R$                  5      5        GM     g )Nz_Sharding overlap requires an initialized model. Call `_enable_sharding_overlap()` to set model.F)include_sublayersz"Length mismatch: fuse_param_view (z) vs grad_storage ()c                ^   ^ ^^ [         R                  R                  5       UUU 4S j5       nU$ )Nc                   > T=R                   S-  sl         TR                   T:X  a  TR                  5       TR                  -  nU[        TR                  S5      -  nX!-   n[
        R                  R                  TX#5      n[
        R                  R                  UT[
        R                  R                  R                  TSS9nUTl        g g )Nr.   r   Fr  )r  r  r  r  r  r   r
   r  rV  r  r  r  	comm_task)	rH  r  r  r  r  r  
comm_groupr  param_group_lens	         rt   	fuse_commT_ShardOptimizer._async_sharding_comm.<locals>.fuse_comm_hook_func.<locals>.fuse_comm)  s     !))Q.)#,,?%1%8%8%:j>O>O%O
 *S!-D D#0+1==+C+C(%,(  &11@@,(%11::>>",$)  A   26. @rs   r   autogradno_grad)r%  r  r$  r&  s   ``` rt   fuse_comm_hook_funcA_ShardOptimizer._async_sharding_comm.<locals>.fuse_comm_hook_func(  s*    ((*6 +6& ! rs   c                Z   ^ ^ [         R                  R                  5       UU 4S j5       nU$ )Nc                 ,  > TR                   (       d  TR                  5       TR                  -  nU[        TR                  S5      -  nX!-   n[
        R                  R                  TX#5      n[
        R                  R                  TUTSS9nSTl         g g )Nr   Fr  T)
r  r  r  r  r  r   r
   r  rV  r  )rH  r  r  r  r  r  r$  r  s         rt   r&  Z_ShardOptimizer._async_sharding_comm.<locals>.fuse_all_gather_hook_func.<locals>.fuse_comm@  s     )00%2%9%9%;z?P?P%P
 *S!-D D#0'-}}'?'?)5(  &11<<)(",$)	  =   15- 1rs   r(  )r  r$  r&  s   `` rt   fuse_all_gather_hook_funcG_ShardOptimizer._async_sharding_comm.<locals>.fuse_all_gather_hook_func?  s*    ((*5 +5" ! rs   r  r   r.   )r  r"  rl  r  idr   r  r  rU  r  r  r   r  r  r   r  r7  _register_backward_hookr  nextitervaluesr  ri  r  )r   param2layerrZ  r  r   r+  r0  r%  r0  viewfirst_params              rt   _async_sharding_comm$_ShardOptimizer._async_sharding_comm  s@   ||B  \\++-E%%%>%*BqE" ? . t##$D,=,=(>>4S9M9M5N4OObcfgkgxgxcybzz{|  s4//01A**4+<+<Q+?@!.!. D((+,t/O/OO  uzz**,666#ejj&9&9&;&H&H&NN   #2215;;=
W55''))!,,, > 3t++,q00"4(<(<Q(?(F(F(H#IJ $;8//-**1q51,,O 2rs   c           
     `  ^ U4S jnSn0 nU H  u  pgXEUR                   '   XC" U5      -  nM     [        R                  " U/US   S   R                  S9nSUl        [        R
                  n	[        R                  " U/U	S9n
SU
l        S U
l        0 nU GH  u  plU" U5      nUXVR                      S.XR                   '   XVR                      nUR                  nUR                  nSUl	        UR                  5       R                  5         [        R                  " UR                  5       UR                  UXR                  5       -   5      5        UUl	        [        R                  R!                  UUXR                  5       -   5      nUR#                  5       R%                  UR&                  5        [)        UUR*                  UR,                  5      nUR#                  5       R/                  UR#                  5       5        [        R                  " UR                  5       U
R                  UXR                  5       R                  5       -   5      5        [        R                  R!                  U
UXR                  5       R                  5       -   5      nUR#                  5       R%                  UR&                  5        [)        UUR*                  UR,                  5      nUUl        UR#                  5       R3                  5         [        R4                  R6                  R9                  5         GM     XU
4$ )Nc                   > [         R                  " U R                  5      n[        [	        5          [
        U R                     -  T-  nX-   S-
  U-  U-  $ Nr.   )r  prodr  r(   r)   r'   r   )r   r
  
align_sizesharding_degrees      rt   get_padded_size?_ShardOptimizer._build_fuse_param_view.<locals>.get_padded_sizex  s[    775--.D134%&!" 
 &*z9ZGGrs   r   )r   r   F)r   r   T)r0  r   zerosr   r  r  r  r#  r   r   r|   flatten_r1  _slicer  r
   r  r}   	_set_dimsr  r4   r   r   r{   r  _cleardevicecudaempty_cache)r   params_and_gradsrA  rB  total_buffer_sizeparam2indexr   rH  param_buffer
grad_dtypegrad_bufferviewsr  padded_sizer   param_shaper   	tmp_paramtmp_grads     `                rt   _build_fuse_param_view&_ShardOptimizer._build_fuse_param_views  s   
	H (HE&7

#!77 )
 ||$%-=a-@-C-I-I
  %^^
ll*;)<JO  $ +KE)%0K$ZZ0!E**
  

+E++K!//M"&E ))+MM""$##LLN* #0E00&I
   ",,U-?-?@+""  I
 //	0D0D0FGMM!!#""--/6688 }}//))+2244H
 !++D,=,=>*!!H
 'EO OO$$&MM**,o ,r [11rs   c                   U R                   (       Ga\  [        R                  R                  5       n[	        US5      nU HC  n[        R
                  " [        U5      5      n[        R                  " 5       U;   d  M=  XPl        ME     SUR                  ;   a  UR                  R                  S5      nUR                  U   U l        [	        US5      nU HC  n[        R
                  " [        U5      5      n[        R                  " 5       U;   d  M=  XPl        ME     SU l         U Vs/ s H  oS   PM	     n	nU R                  R                  R                   n
U
S:  a  Sn
U
S-  S-  nS/[#        U	5      -  nU	 Vs0 s H  oR$                  UR&                  _M     nnU	 Vs/ s H  oR)                  5       PM     nn[*        R,                  " XX/5      n[/        5       n[1        U5       H3  u  nnU H'  nUR3                  U/ 5      R5                  UU   5        M)     M5     UR7                  5        H  u  nnU R9                  UU R                  R:                  5      u  nnnU R<                  R5                  U5        U R>                  R5                  U5        U R@                  R5                  U5        M     U RB                  (       a  U RE                  5         U RF                  RH                  b  SU RF                  RH                  l%        U R                  U RF                  RH                  l&        SUR                  ;   a5  U R                  S:  a%  U R                  U RF                  RH                  l'        / n/ n[Q        [#        U R<                  5      5       GH  nU RB                  (       d  U RS                  U R@                  U   5        U R<                  U   R7                  5        GH  u  nnUS	   nUS
   nU R>                  U   RU                  5       U R                  R:                  -  nU[W        U R                  RX                  S5      -  nUU-   n [W        UU5      n![[        UURU                  5       -   U 5      n"U!U":  a  M  [\        R^                  Ra                  U R>                  U   U!U"5      n#[c        U#URd                  [        Rf                  " 5       /5      n#UU#l        URh                  U#l4        URj                  U#l5        SU#l6        URn                  U#l7        URh                  U#l4        URp                  U#l8        URr                  U#l9        URt                  U#l:        URv                  U#l;        UR5                  U#5        [\        R^                  Ra                  U R@                  U   U!U"5      n$[c        U$URd                  [        Rf                  " 5       /5      n$UR5                  U$5        GM     U RB                  (       d  GMq  UUS   l<        U R@                  U   Rz                  c  GM  U R@                  U   Rz                  R}                  5         GM     [        [        UU5      5      n%U%$ s  snf s  snf s  snf )a  
1. Tensor Fusion
    - Groups params/grads into contiguous param_storage/grad_storage buffers
    - Supports non-uniform partitioning across GPUs
    - Uses view_slice to access individual params/grads each step
2. Reduce_scatter Overlap
    - Overlaps grad reduce_scatter with backward
3. All_gather Overlap
    - Overlaps param all_gather with forward
    - Strategically scatters all_gather during forward
    (Launching all all_gather at once blocks overlap with other sync/comm ops)
dpmpFr      i   Tr.   r   r   r   )Ar  r   r  r  rA   	new_groupsortedr   r  
_dim_namesr   _shape
_mp_degreer  r  rc   comm_buffer_size_MBr   r0  r   r|   r*   eager_assign_group_by_sizer   r   
setdefaultr   r7  rW  r  r  r  r  r  r:  ru  rv  should_comm_on_shard_dimsharding_groupmp_grouprU  r  r  r  r  minr   r
   r  r4   r   r   r   	need_clipr   	trainableoptimize_attrregularizerdo_model_averageis_distributedr  r#  r  r   zip)&r   r  r   shard_groupsr  r$  mp_mesh_axis	mp_groupsp_gr  rb  
group_sizeis_sparse_gradientr   
shape_dictdense_paramsgroup_indices
var_groups	group_idxindicesr   rL  r  r  r  
new_params	new_gradsr0  r8  r   r  
rank_beginrank_endparam_begin	param_end	new_paramnew_gradr  s&                                         rt   _tensor_fusion_ShardOptimizer._tensor_fusion  s    %%% %%..0D-dD9L%!^^F5M:
==?e++5( & t&#44T:"&++l";.tT:	&E!%u!>J}}%/)3 ' */D&,89LSa&LJ9"&.."9"9"M"M"Q&&)#,t3d:J"'3z?!:?IJze**ekk1zJJ>HIjU..0jLI !;;:2JM %J&/&>"	7 A)))R8??QP ! '?
 0:/?/?/A+	+
 //$((//	#! 
 $$++O<""))-8!!((6 0B ++ ))+ ))5FJ**C<@<P<P**94??*t/B:>..DOO..7
	s4//01A//..t/@/@/CD"2215;;=
dWW&&q)002++223  (#d.B.B.G.G*KK
%
2!%4 6A	)+"MM44&&q);		 0&&^^%&	
 "&	*/*=*=	'&+oo	#(,	%&+oo	#*/*=*=	'*/*=*=	'(-(9(9	%-2-C-C	*+0+?+?	(!!), "==33%%a(+y /e004>>3C2D   *S >V +++*+
2'$$Q'11=%%a(22779g 2j  J	 :;K : KIs   %Y,8 Y1Y6c                   / nS nU GHg  u  pE[         R                  " UR                  5      nUnUR                  n[	        5       n	UR
                  R                  n
UR                   H;  nUR                  5       (       d  M  UR                  5       nU	R                  U5        M=     U" X5      nUR                  U R                     R                  5       (       dl  [        R                  " 5       X`R                  '   US:w  aE  XR                     S:w  a3  U	R                  U5        [        R                  " U5      X`R                  '   [        UR                  5       H  u  pXR                  :X  a  M  X   S:X  a  [        R                  " 5       Xm'   M7  UR                  5       (       a  MN  [        R                  " 5       Xm'   U" X5      nUR                  5       (       d  M  US:X  a  [        R                  " 5       Xm'   M  U	R                  U5        [        R                  " U5      Xm'   M     UR                  U:w  a!  [        R                   " XUR
                  U5      nUR#                  XG45        GMj     U$ )a  
Optimizes gradient placements for parameters in dynamic sharding mode to minimize redundant allreduce
operations during gradient clipping. This function adjusts tensor placements across mesh axes based
on priority rules, prioritizing sharding for dimensions marked in `_sharding_axis`.
For each axis in the mesh:
    1. Preserves existing `Shard(dim)` placements for any axis.
    2. Converts `Partial()` placements to Shard(dim) where possible, falling back to `Replicate()` if sharding isn't feasible.
    3. Maintains `Replicate()` placements unchanged.
Processes axes in order of `_sharding_axis` first before other mesh axes in their natural order.

    e.g.
        a) sharding_axis = 0, tensor rank = 2,
            placements: [Partial(), Partial(), Repliacate()] -> [Shard(0), Shard(1), Repliacate()]
        b) sharding_axis = 0, tensor rank = 2,
            placements: [Partial(), Shard(0), Partial() ] -> [Shard(1), Shard(0), Repliacate()]
c                `    [        [        U 5      5       H  nX   S:X  a  M  X!;  d  M  Us  $    g)Nr.   r   )rU  r   )tensor_shapeshard_dims_set
tensor_dims      rt   get_first_can_shard_dimR_ShardOptimizer._fused_comm_before_apply_optimize.<locals>.get_first_can_shard_dimj  s5    #C$56
+q03%% 7 rs   r   r.   )r   r   r   r  r  r   r   r   r   addr  r   r   Shardr   r   r  r   )r   r  r  r  r   r  new_placementsr  r  r  
mesh_shaper   r  	mesh_axiss                 rt   !_fused_comm_before_apply_optimize1_ShardOptimizer._fused_comm_before_apply_optimizeU  s   " 	 (KE!]]4??;NH,,L UN**00J "__	%%''!*!2!2!4J"&&z2 - 1NJ??4#6#67@@BB6:nn6F223#
3F3F(G1(L"&&z2:>**Z:PN#6#67 )2$//(B$	 3 33(A-040@N- ))++040@N-!8$"J !++--%+8<8HN5 +..z:8<

:8NN5) )C, .0<<.?.?P##U$56e (h  rs   c                   > [         R                  " 5       (       aS  [        U R                  [        5      (       a4  U R
                  (       a  U R                  U5      nOU R                  U5      n[        TU ]%  XX45      $ rw   )
r   r   r   r  r  r  r  r  super_apply_optimize)r   lossstartup_programr  param_group_idxr~  s        rt   r  _ShardOptimizer._apply_optimize  sp     !!##
NNN)
 )
 ((#22<@#EE   w&<
 	
rs   c                    SU R                   ;   a.  US:X  a  U R                   U   $ [        U R                   S   U5      $ [        e)Nru  )r   r   AttributeError)r   items     rt   __getattr___ShardOptimizer.__getattr__  sA    4==(|#}}T**4==6==  rs   c                    US:X  a#  [        U 5      R                   S3n[        U5      e[        U R                  X5      $ )Nru  z._inner_opt is READ ONLY)r   rm   r  setattrru  )r   r  valuemsgs       rt   __setattr___ShardOptimizer.__setattr__  s>    <$Z(())ABC %%t44rs   )r  ra  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r>  r  )rm   rn   ro   rp   r   r  r  r  r  r  r  r  r   r  r  r  r:  rW  r  r  r  r  r  rr   __classcell__r~  s   @rt   rs  rs  d  s    C-J4l61pL'*BAK,ZT<l/
ZxZ2xD LR j DE
$!5 5rs   rs  c                  p    \ rS rSrS rS rS r      SS jrSS jr        SS jr	SS jr
S	 rS
rg)_ShardingStageBasei  c                8    Xl         SU l        X l        SU l        g )Nr   Fr  r  r  r  )r   r   sharding_mesh_dims      rt   r   _ShardingStageBase.__init__  s    
"3$)!rs   c                    Xl         g rw   )r  )r   sharding_axiss     rt   r  %_ShardingStageBase._set_sharding_axis  s    +rs   c                    SU l         g )NT)r  r   s    rt   r  (_ShardingStageBase._enable_tensor_fusion  s
    $(!rs   c                   UR                  5       (       Ga  U R                  (       a  UR                  nO[        XR                  5      n[        U[        R                  5      (       Ga"  UR                  5       nUR                  5       S:X  d   S5       e[        U[        UR                  5      5      u  pV[        R                  R                  R                  R!                  UR"                  XV5      n[        R                  R                  R                  R%                  UR'                  5       U5      nUR)                  U5        [        R                  R                  R                  R+                  UR"                  / U/5      Ul        [        R.                  " 5       (       a*  UR                  5       (       a  [1        UUR"                  US9nU$ )N
pd_op.dataz.The master weight must be a result of data op.r  )ry   r  r   rB   r  r   r   r   get_defining_opr0  r<   r   r   r   r   rp  create_tensor_dist_attributer   cvt_to_dist_typer   set_typecreate_op_dist_attributer?  r   r  )	r   r   r  r   data_opdim_maprD  r?  	dist_types	            rt   r  &_ShardingStageBase.shard_master_weight  sy    ==??(("--
8..
 -33'779||~5 D5 +5M$7$7 8+' KK))--JJ**G 
 #KK1155FF!&&()	 &&y1KK))--FF**B ! %%''M,A,A,C,C '!++)!
 rs   c                   [        U[        UR                  5      5      u  pE[        R                  R
                  R                  R                  UR                  XE5      n[        R                  R
                  R                  R                  UR                  5       U5      nUR                  U5        [        R                  R
                  R                  R                  UR                  / U/5      nXR                  5       l        g rw   )r<   r   r   r   r   rp  r   r  r   r  r   r  r  r  r?  )	r   r~   r   r   r  rD  r?  r  op_dist_attrs	            rt   _init_dist_attr"_ShardingStageBase._init_dist_attr   s    ",ZV\\9J"KKK))--JJ
	 KK))-->>KKM9
	 		"{{,,00IIYK
 .: *rs   c                   UR                  5       (       aY  UR                  5       nUR                  5       S:X  a  U R                  XU5        U$ [        R
                  " XR                  U5      $ [        UUR                  US9$ )Nr  r  )ry   r  r0  r  r   r  r   r   )r   r~   r   r   r  s        rt   _apply_placement#_ShardingStageBase._apply_placement  su     >>'')BwwyL($$VJ?<<(:(:JGG##!
 	
rs   c                ,    [        XR                  5      $ rw   )r0   r  )r   r  s     rt   '_reshard_fake_replicate_grad_to_partial:_ShardingStageBase._reshard_fake_replicate_grad_to_partial  s    .t5H5HIIrs   c                   ^^ S mUU4S jn[        5       R                  (       a(  S Tl        TR                  U5        S[        5       l        g TR                  T5        g )Nc                    U R                  5       (       ag  S n[        U R                  5       H(  u  p#[        U[        R
                  5      (       d  M&  UnM*     Ub!  [        X5      n[        X R                  U5      $ U $ rw   )	ry   r   r   r   r   r  rB   r  r   )r  partial_mesh_axisr  r   r  s        rt   _reshard_gradG_ShardingStageBase._register_hook_for_param_grad.<locals>._reshard_grad"  sp    ||~~$(!,5doo,F(I!)T\\::,5) -G %0%@&N #4):):NKKKrs   c                   > [         R                  " U [         R                  5      nU R                  5         TR                  c  T" U5      Tl        g TR                  R                  T" U5      5        g rw   )r   castr  _clear_datar  add_)r  rV  r  r   s     rt   _main_grad_hookI_ShardingStageBase._register_hook_for_param_grad.<locals>._main_grad_hook0  sP    {{48H&"/"9$$]8%<=rs   T)r   r  r  register_hook$already_register_final_backward_hook)r   r   r  r  s    ` @rt   r  0_ShardingStageBase._register_hook_for_param_grad!  sJ    		> --"EO0FJC.rs   r  N)r   rF   r  rF   rf  rF   )r~   rF   r   rF   r   r   )r~   rF   r   rF   r   r   rf  rF   )r  rF   rf  rF   )rm   rn   ro   rp   r   r  r  r  r  r  r  r  rr   rl   rs   rt   r  r    sd    *,)'',2'	'R:

%+
9=
	
 J/rs   r  c                  F   ^  \ rS rSr S     SU 4S jjjrSS jrSrU =r$ )r  i@  c                2   > [         TU ]  X!5        SU l        g )Nr   )r  r   r  r   r  r   r~  s      rt   r   _ShardingStage0.__init__A  s     	1rs   c                R    US:X  a   [        5       (       a  U R                  U5      $ U$ r  )r1   r  )r   r[  r   r~   s       rt   __call___ShardingStage0.__call__G  s'    &=_..??GGrs   )r  rw   r  z	int | strr   zProcessMesh | Nonerf  rg  r[  r   r   rF   r~   rF   rf  rF   )rm   rn   ro   rp   r   r  rr   r  r  s   @rt   r  r  @  s2    GK!*2D	  rs   r  c                  J   ^  \ rS rSrSr S     SU 4S jjjrSS jrSrU =r$ )	r  iN  a  
A builtin shard_fn for shard_optimizer interface, users can pass it to shard_optimizer to implement sharding optimization with stage 1.

Args:
    sharding_mesh_dim(int|str): The sharding dimension in the mesh.
    mesh(None|paddle.distributed.ProcessMesh): If mesh is not None, the `ProcessMesh` object describes the Cartesian topology of the used processes for dense type parameters. Note: Currently, only one mesh configuration is supported for all dense parameters. If there is a need for multiple mesh configurations, please configure them yourself in the upper layer networking code.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"])

        >>> class MLP(paddle.nn.Layer):
        ...     def __init__(self):
        ...         super().__init__()
        ...         self.fc1 = paddle.nn.Linear(8, 8)
        ...         self.fc2 = paddle.nn.Linear(8, 8)
        ...
        ...     def forward(self, input):
        ...         return self.fc2(self.fc1(input))

        >>> # doctest: +REQUIRES(env:DISTRIBUTED)
        >>> layer = MLP()
        >>> batch = paddle.rand(shape=[8, 8])
        >>> opt = paddle.optimizer.AdamW(parameters=layer.parameters())
        >>> opt = dist.shard_optimizer(opt, dist.ShardingStage1("x", mesh))
        >>> for _ in range(5):
        >>>     loss = layer(batch)
        >>>     loss.backward()
        >>>     opt.step()
        >>>     opt.clear_grad()
        >>> # This case need to be executed in multi-card environment
        >>> # python -m paddle.distributed.launch --gpus=0,1 {test_case}.py
c                $   > [         TU ]  X!5        g rw   r  r   r  s      rt   r   ShardingStage1.__init__u      
 	1rs   c                   UR                  5       (       d  U$ U R                  (       d  SU;  a  [        X R                  5      nOK[	        [        UR                  R                  5      5       Vs/ s H  n[        R                  " 5       PM     nnUS:X  a   [        5       (       a  U R                  U5      nU R                  X2U5      $ s  snf )Nr  r  )ry   r  rB   r  rU  r   r   r   r   r   r1   r  r  r   r[  r   r~   r   rH  s         rt   r  ShardingStage1.__call__|  s    }}M ((V3->4U<O<OPJ +0E4F4F4L4L0M*N*NQ *N   &=_..AA&IF$$VJ??s   /Crl   rw   r  r  	rm   rn   ro   rp   r   r   r  rr   r  r  s   @rt   r  r  N  s@    $R $(2$2 !2 
	2 2@ @rs   r  c                  J   ^  \ rS rSrSr S     SU 4S jjjrSS jrSrU =r$ )	r  i  a  
A builtin shard_fn for shard_optimizer interface, users can pass it to shard_optimizer to implement sharding optimization with stage 2.

Args:
    sharding_mesh_dim(int|str): The sharding dimension name in the mesh.
    mesh(None|paddle.distributed.ProcessMesh): If mesh is not None, the `ProcessMesh` object describes the Cartesian topology of the used processes for dense type parameters. Note: Currently, only one mesh configuration is supported for all dense parameters. If there is a need for multiple mesh configurations, please configure them yourself in the upper layer networking code.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"])

        >>> class MLP(paddle.nn.Layer):
        ...     def __init__(self):
        ...         super().__init__()
        ...         self.fc1 = paddle.nn.Linear(8, 8)
        ...         self.fc2 = paddle.nn.Linear(8, 8)
        ...
        ...     def forward(self, input):
        ...         return self.fc2(self.fc1(input))

        >>> # doctest: +REQUIRES(env:DISTRIBUTED)
        >>> layer = MLP()
        >>> batch = paddle.rand(shape=[8, 8])
        >>> opt = paddle.optimizer.AdamW(parameters=layer.parameters())
        >>> opt = dist.shard_optimizer(opt, dist.ShardingStage2("x", mesh))
        >>> for _ in range(5):
        >>>     loss = layer(batch)
        >>>     loss.backward()
        >>>     opt.step()
        >>>     opt.clear_grad()
        >>> # This case need to be executed in multi-card environment
        >>> # python -m paddle.distributed.launch --gpus=0,1 {test_case}.py
c                $   > [         TU ]  X!5        g rw   r  r  s      rt   r   ShardingStage2.__init__  r  rs   c                2   UR                  5       (       a|  SU;  a  [        X R                  5      nOK[        [	        UR
                  R                  5      5       Vs/ s H  n[        R                  " 5       PM     nn[        UUR
                  US9$ U$ s  snf )Nr  r  )
ry   rB   r  rU  r   r   r   r   r   r   r  s         rt   r  ShardingStage2.__call__  s    ==??S 8..
 #3u'9'9'?'?#@AA NN$A    ''% 
 s   Brl   rw   r  r  r  r  s   @rt   r  r    s>    $R $(2$2 !2 
	2 2 rs   r  c                  V   ^  \ rS rSrSr S     S	U 4S jjjrS rS rS
S jrSr	U =r
$ )r  i  a  
A builtin shard_fn for shard_optimizer interface, users can pass it to shard_optimizer to implement sharding optimization with stage 3.

Args:
    sharding_mesh_dim(int|str): The sharding dimension name in the mesh.
    mesh(None|paddle.distributed.ProcessMesh): If mesh is not None, the `ProcessMesh` object describes the Cartesian topology of the used processes for dense type parameters. Note: Currently, only one mesh configuration is supported for all dense parameters. If there is a need for multiple mesh configurations, please configure them yourself in the upper layer networking code.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"])

        >>> class MLP(paddle.nn.Layer):
        ...     def __init__(self):
        ...         super().__init__()
        ...         self.fc1 = paddle.nn.Linear(8, 8)
        ...         self.fc2 = paddle.nn.Linear(8, 8)
        ...
        ...     def forward(self, input):
        ...         return self.fc2(self.fc1(input))

        >>> # doctest: +REQUIRES(env:DISTRIBUTED)
        >>> layer = MLP()
        >>> batch = paddle.rand(shape=[8, 8])
        >>> opt = paddle.optimizer.AdamW(parameters=layer.parameters())
        >>> opt = dist.shard_optimizer(opt, dist.ShardingStage3("x", mesh))
        >>> for _ in range(5):
        >>>     loss = layer(batch)
        >>>     loss.backward()
        >>>     opt.step()
        >>>     opt.clear_grad()
        >>> # This case need to be executed in multi-card environment
        >>> # python -m paddle.distributed.launch --gpus=0,1 {test_case}.py
c                $   > [         TU ]  X!5        g rw   r  r  s      rt   r   ShardingStage3.__init__  r  rs   c                   UR                  5       (       a}  U R                  bp  / n[        [        U R                  R                  5      5       H'  nUR                  [        R                  " 5       5        M)     UR                  X R                  5        UR                  5       (       ad  [        XR                  5      n[        R                  " XR                  U5      nUR                  5       R                  UR                  5       5        g g rw   )is_denser  rU  r   r   r   r   r   	_to_dist_ry   rB   r  r  r   r}   r{   )r   r   r   rH  r  shard_params         rt   r  ShardingStage3._shard_parameter  s    >>

 6J3tzz//01!!$.."23 2OOJ

3==??8**N ,,))>K //0F0F0HI rs   c                |   UR                  5       (       a  UR                  n[        X R                     [        R
                  5      (       a!  [        R                  " 5       X R                  '   [        R                  " XR                  U5      nUR                  5       R                  UR                  5       5        g g rw   )ry   r   r   r  r   r  r   r  r   r}   r{   )r   r   r  r  s       rt   _unshard_parameter!ShardingStage3._unshard_parameter	  s    ==??"--N.)<)<=tzzJJ6:nn6F223U,>,>OI//	0D0D0FG rs   c                   UR                  5       (       d  U$ US:X  a  [        5       (       a  [        S5      eSU;  a9  UR                  n[	        S U 5       5      (       a  [        X R                  5      nO9UR                  R                   Vs/ s H  n[        R                  " 5       PM     nnU R                  X2U5      $ s  snf )Nr  z3Sharding Stage 3 does not support auto dp mode yet.r  c              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7frw   )r   r   r   )r   r  s     rt   r   *ShardingStage3.__call__.<locals>.<genexpr>#	  s     E*Q:a00*r  )ry   r1   r"  r   r   rB   r  r   r   r   r   r  r  s         rt   r  ShardingStage3.__call__	  s    }}M&=_..E  ))JE*EEE8..
 5:4F4F4L4LM4Lq$..*4LJM$$VJ?? Ns   Crl   rw   r  r  )rm   rn   ro   rp   r   r   r  r  r  rr   r  r  s   @rt   r  r    sL    $R $(2$2 !2 
	2 2J"H@ @rs   r  c                    [        XU5      $ )a  

Warp the global view optimizer to distributed view.

Note:
    The `shard_fn` should have the following signature:
        def shard_fn(accumulator_name, param, accumulator) -> sharded_accumulator

Args:
    optimizer (paddle.optimizer.Optimizer): The optimizer to be sharded.
    shard_fn (Callable, optional): The function to shard accumulators. If not specified,
       we simply pass down the dist attr of the params.

Returns:
    An optimizer with distributed view.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist
        >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
        >>> class MLP(paddle.nn.Layer):
        ...     def __init__(self):
        ...         super().__init__()
        ...         self.fc1 = paddle.nn.Linear(8, 8)
        ...         self.fc2 = paddle.nn.Linear(8, 8)
        ...
        ...     def forward(self, input):
        ...         return self.fc2(self.fc1(input))
        >>> layer = MLP()
        >>> batch = paddle.rand(shape=[8, 8])
        >>> opt = paddle.optimizer.AdamW(parameters=layer.parameters())
        >>> opt = dist.shard_optimizer(opt)
        >>> for _ in range(5):
        >>>     loss = layer(batch)
        >>>     loss.backward()
        >>>     opt.step()
        >>>     opt.clear_grad()
        >>> # This case need to be executed in multi-card environment
        >>> # python -m paddle.distributed.launch --gpus=0,1 {test_case}.py

)rs  )ry  rk  r  s      rt   shard_optimizerr  ,	  s    ` 90KLLrs   c                ,    S n[        X5      U l        U $ )a  

Warp the global view grad_scaler to distributed view.

Args:
    scaler (paddle.amp.GradScaler): The GradScaler to be sharded.

Returns:
    A GradScaler with distributed view.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist
        >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
        >>> class MLP(paddle.nn.Layer):
        ...     def __init__(self):
        ...         super().__init__()
        ...         self.fc1 = paddle.nn.Linear(8, 8)
        ...         self.fc2 = paddle.nn.Linear(8, 8)
        ...
        ...     def forward(self, input):
        ...         return self.fc2(self.fc1(input))
        >>> layer = MLP()
        >>> batch = paddle.rand(shape=[8, 8])
        >>> opt = paddle.optimizer.AdamW(parameters=layer.parameters())
        >>> layer, opt = paddle.amp.decorate(layer, opt, level='O2')
        >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
        >>> scaler = dist.shard_scaler(scaler)
        >>> opt = dist.shard_optimizer(opt)
        >>> for _ in range(5):
        >>>     with paddle.amp.auto_cast(True):
        >>>         loss = layer(batch)
        >>>     scaled = scaler.scale(loss)
        >>>     scaled.backward()
        >>>     scaler.step(opt)
        >>>     scaler.update()
        >>>     opt.clear_grad()
        >>> # This case need to be executed in multi-card environment
        >>> # python -m paddle.distributed.launch --gpus=0,1 {test_case}.py

c                   U R                   (       d  g U R                  [        U5         nUS   [        R                  L a  [        S5      eUS   [        R                  L a  [        S5      eS nS n[        R                  " [        R                  " S/5      R                  [        R                  5      5      U l        0 n[        USS 5      (       a  [        UR                   S   ["        5      (       a  UR                    H  nUS    H  nUR%                  5       nUc  M  [        USS 5      " 5       (       d  M2  Uc  UR&                  b  UR&                  nO Uc.  UR)                  5       (       a  UR&                  b  UR&                  nUR&                  U;  a  U/XXR&                  '   M  XXR&                     R+                  U5        M     M     OUR,                   H  nUR%                  5       nUc  M  [        USS	 5      " 5       (       d  M2  Uc  UR&                  nUc!  UR)                  5       (       a  UR&                  nUR&                  U;  a  U/XXR&                  '   M  XXR&                     R+                  U5        M     UR/                  5        GHZ  u  p/ n/ n[        R                  " [        R                  " S/5      R                  [        R                  5      5      n[        R                  " [        R                  " S/5      R                  [        R                  5      5      n[        R                  " [        R                  " S/5      R                  [        R                  5      5      nU R0                  R3                  5       (       a  U R0                  R5                  5       nOU R0                  nU
 H  nUR6                  [8        R:                  R<                  R>                  [        R@                  [8        R:                  R<                  RB                  [        RD                  4;   a  UR+                  U5        M  UR+                  U5        M     [G        U5      (       a  [H        RJ                  " UU5      u  pS
[        RL                  RO                  5       ;   aD  [        RP                  " US5      n[        RR                  " U5      n[        RP                  " US5      n[H        RT                  " X5      n[G        U5      (       a  [H        RJ                  " UU5      u  pS
[        RL                  RO                  5       ;   aD  [        RP                  " US5      n[        RR                  " U5      n[        RP                  " US5      n[H        RT                  " X5      n[V        RX                  " XURZ                  5      n[H        RT                  " U R                  U5      U l        GM]     U R                  R&                  U:X  aN  UR]                  5        H9  n[V        RX                  " U R                  UU R                  RZ                  5      n	M;     OZUb  [_        US5      (       d  [a        S5      e[V        RX                  " U R                  UU R                  RZ                  5      U l        [        R                  US'   g )NstatezMunscale_() has already been called on this optimizer since the last update().z(unscale_() is being called after step().r   r  r  rz   c                     gNFrl   rl   rs   rt   rb  6shard_scaler.<locals>.unscale_method.<locals>.<lambda>	  s    rs   c                     gr  rl   rl   rs   rt   rb  r  	  s    Urs   xpuint32boolranksz:Invalid current_process_mesh: must be a valid ProcessMesh.)1_enable_optimizer_statesr2  r   UNSCALEDr"  STEPPEDr   r   r  r  astypebool_
_found_infr   r   r  r  
_grad_ivarr   rz   r   r  r7  _scalery   r|   r   r*   VarDescVarTypeFP16float16BF16bfloat16r   r
   check_finite_and_unscale_rI  
get_devicer  sum
bitwise_orr   r  r   r  ro  r   )r   ry  optimizer_statesrc_meshcurrent_process_meshmesh2param_gradsr  r   tgt_gradrH  param_gradstemp_param_grads_halftemp_param_grads_fp32temp_found_inftemp_found_inf_halftemp_found_inf_fp32
temp_scaler  r   s                      rt   unscale_method$shard_scaler.<locals>.unscale_method	  s   ||00I?7#~'>'>>_  W%)?)??IJJ# **288QC=+?+?+IJ9ot44##A&:
 :
 #00"8_E$//1H ,#$&7  
 %, ( 5 5 A'/'<'<H 08 ( 8 8 : : ( 5 5 A3;3H3H0#008HHGOj,-B-BC,-B-BCJJ (1 - 1: #22 ++-(*;]KMM'#+#8#8,4$4466/7/D/D,,,4DDCK*()>)>?()>)>?FFxP! 3$ /446NA$&!$&!#--bhhsm.B.B288.LMN"("2"2!$$RXX.# #)"2"2!$$RXX.# {{""$$![[557
![[
#::LL((--NNLL((--OO	"  *006)006 $ ()))/)I)I)*& FMM4466,2KK-w-) -3JJ7L,M),2KK-v-) "(!2!2"" ()))/)I)I)*& FMM4466*0+++W+' +1**5H*I'*0+++V+' "(!2!2"" "\\.*C*CN %//PDOE 7J ??''+?? 0 5 5 7LLOO\4??3M3M !8
 $+7$g4 4 !P  #ll$**DO
 $2#:#: rs   )r   _unscale)scalerr(  s     rt   shard_scalerr,  _	  s    Z[;z !8FOMrs   c                  B    \ rS rSr% SrS\S'   S\S'   S\S'   S
S jrS	rg)
FusePassesi/
  z8
A helper class for users to configure the fuse passes.
r  enablegemm_epiloguedropout_addNc                    SU l         SU l        SU l        UbD  UR                  5        H/  u  p#[	        X5      (       a  [        XU5        M#  [        SU 35      e   g g )NFzUnknown fuse pass )r/  r0  r1  r7  ro  r  r   )r   config_dictr[  r  s       rt   r   FusePasses.__init__8
  sa    " ")//1
4%%Du-$'9#%?@@	 2 #rs   )r1  r/  r0  rw   )rm   rn   ro   rp   r   rq   r   rr   rl   rs   rt   r.  r.  /
  s      L	Ars   r.  c                     ^  \ rS rSrSrSSU 4S jjjrS r\SS j5       r\SS j5       r	\SS j5       r
\SS j5       r\SS	 j5       r\SS
 j5       rSrU =r$ )r  iD
  a  
The `Strategy` object is used to configure the parallelization
and optimization strategies for static graph. Currently supports
configuring ``sharding``, ``fused_passes``, ``gradient_merge``
and ``pipeline``. More strategies will be supported in the future.

``sharding`` is used to configure the sharding states of the optimizer,
for saving the GPU memory.

``fused_passes`` is used to configure the fusion of the computation in
the model.

``gradient_merge`` is used to configure the gradient merge strategy in
training.

``pipeline`` is used to configure the pipeline parallelism strategy.

Args:
    config(dict|None, optional): The user-defined configurations.
        If ``config`` is None, use default configurations. If it is
        a dict, the items inside the dict will be used to set the
        configurations, and the others remain the default values.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> strategy = dist.Strategy()

        >>> strategy.sharding.enable = True
        >>> strategy.sharding.stage = 2
        >>> strategy.sharding.degree = 2

        >>> strategy.gradient_merge.enable = True
        >>> strategy.gradient_merge.k_steps = 2
        >>> strategy.gradient_merge.avg = False

        >>> strategy.pipeline.enable = True
        >>> strategy.pipeline.schedule_mode = "1F1B" # default is "1F1B"
        >>> strategy.pipeline.micro_batch_size = 2
c                  > Ub?  [        U[        5      (       a  [        R                  " U5      U l        O[        SU 35      e0 U l        [        R                  R                  n[        TU ])  X R                  5        U R                  R                  [        R                  R                  S 5      n[        R                  " U5      U l        U R                  R                  [        R                  R                  S 5      n[        R                   " U5      U l        U R                  R                  [        R                  R$                  S 5      n[        R&                  " U5      U l        U R                  R                  [        R                  R*                  S 5      n[        R,                  " U5      U l        U R                  R                  [        R                  R0                  S 5      n[3        U5      U l        U R                  R                  [        R                  R6                  S 5      n[        R8                  " U5      U l        U R                  R                  [        R                  R<                  S 5      n[        R>                  " U5      U l         U R                  R                  [        R                  RB                  S 5      n[        RD                  " U5      U l#        U R                  R                  [        R                  RH                  S 5      n[        RJ                  " U5      U l&        U R                  R                  SS5      U l'        g )Nz%Expected a dictionary. But received: 
full_graphT)(r   r  r   r   _config_dictr   auto_strategy	constantsBASEr  r   r  SHARDINGShardingConfig	_shardingGRADIENT_MERGEGradientMergeConfig_gradient_mergePIPELINEPipelineConfig	_pipelineAMP	AMPConfig_ampFUSED_PASSESr.  _fused_passes	RECOMPUTERecomputeConfig
_recomputeMP_OPTIMIZATIONMPOptimizationConfig_mp_optimizationDP_OPTIMIZATIONDPOptimizationConfig_dp_optimizationSP_OPTIMIZATIONSPOptimizationConfig_sp_optimization_full_graph)r   r  categoryr3  r~  s       rt   r   Strategy.__init__q
  s   &$''$(MM&$9! ;F8D  !#D **//#4#45''++##,,d
 '55kB''++##22D
  -@@M''++##,,d
 '55kB''++M,C,C,G,GN!++K8	''++##00$
 (4 ''++##--t
 (77D''++##33T
 !. B B; O''++##33T
 !. B B; O''++##33T
 !. B B; O,,00tDrs   c           	        SSK n[        R                  R                  n[        R                  R	                  U5      nUR                  5        H  n[        X[        X5      5        M     UR                  R                  U R                  l	        SUR                  R                  ;   a  SU R                  l        SUR                  R                  ;   a  SU R                  l        UR                  " UR                  5      U l        UR                  " UR"                  5      U l        UR                  " UR&                  5      U l        UR                  " UR*                  5      U l        UR                  " UR.                  5      U l        UR                  " UR2                  5      U l        UR                  " UR6                  5      U l        UR                  " UR:                  5      U l        g)z_
NOTE(lizhiyu): This is a template function to get `dist.Strategy` from `fleet.auto.Strategy`.
r   Nfused_gemm_epilogue_passTfused_dropout_add_pass)r   r9  r:  r;  get_category_default_configr  r  r   rd   r/  rI  fused_passes_listr0  r1  r   rg   rG  rc   r>  re   rA  rf   rD  rh   rL  ri   rO  rj   rR  rk   rU  )r   legacy_strategyr   rW  base_configr[  s         rt   _from_legacy_strategyStrategy._from_legacy_strategy
  sq    	 **//#--II
 ##%CDw<= &$3$@$@$G$G!&++==> 04D,$++==> .2D*MM/"5"56	'?'?@#}}_-K-KL'?'?@--(A(AB $o.M.M N $o.M.M N $o.M.M Nrs   c                    U R                   $ )z
Whether to use AST mode.
)rV  r   s    rt   r7  Strategy.full_graph
  s    
 rs   c                    U R                   $ )a  
``sharding`` is used to configure the sharding states of the optimizer,
containing following configs:

    ``enable`` (bool): whether to enable sharding. Default: False.

    ``stage`` (int): can be set to 1, 2 or 3. 1 indicates the optimizer states segmentation,
    2 indicates optimizer states and gradient segmentation, 3 indicates the segmentation
    of optimizer states, gradient and parameters. Default: 1.

    ``degree`` (int): the number of segmentation pieces. Default: 8.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> strategy = dist.Strategy()

        >>> strategy.sharding.enable = True
        >>> strategy.sharding.stage = 2
        >>> strategy.sharding.degree = 2
)r>  r   s    rt   rc   Strategy.sharding
  s    4 ~~rs   c                    U R                   $ )a  
``gradient_merge`` is used to configure the gradient merge strategy in
training, containing following configs:

    ``enable`` (bool): whether to enable gradient merge. Default: False.

    ``k_steps`` (int): the number of steps for merging gradients. Default: 1.

    ``avg`` (bool): whether to average the gradients of each step. Default: True.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> strategy = dist.Strategy()

        >>> strategy.gradient_merge.enable = True
        >>> strategy.gradient_merge.k_steps = 2
        >>> strategy.gradient_merge.avg = True
)rA  r   s    rt   re   Strategy.gradient_merge
  s    0 ###rs   c                    U R                   $ )a  
``fused_passes`` is used to configure the fusion of the computation in
the model, containing following configs:

    ``enable`` (bool): whether to enable fused passes. Default: False.

    ``gemm_epilogue`` (bool): whether to fuse ``matmul`` and ``add`` computation
    in the ``Linear`` layer. Default: False

    "dropout_add" (bool): whether to fuse ``dropout`` and ``add`` computation. Default: False.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> strategy = dist.Strategy()

        >>> strategy.fused_passes.enable = True
        >>> strategy.fused_passes.gemm_spilogue = True
        >>> strategy.fused_passes.dropout_add = True
)rI  r   s    rt   rd   Strategy.fused_passes  s    2 !!!rs   c                    U R                   $ )a  
``pipeline`` is used to configure the pipeline parallelism,
containing following configs:

    ``enable`` (bool): whether to enable pipeline parallelism. Default: False.

    ``schedule_mode`` (str): the scheduling mode of pipeline parallelism. Default: "1F1B".

    ``micro_batch_size`` (int): the size of each micro-batch inside a mini-batch. Default: 1.

    ``accumulate_steps`` (int): number of steps for accumulating. Default: 1.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> strategy = dist.Strategy()

        >>> strategy.pipeline.enable = True
        >>> strategy.pipeline.micro_batch_size = 2
)rD  r   s    rt   rf   Strategy.pipeline&  s    2 ~~rs   c                    U R                   $ )a6  
``amp`` is used to configure the amp,
containing following configs:

    ``enable`` (bool):  whether to enable AMP. Default: False.
    ``dtype``, (str): the data type of AMP. Default: "float16".
    ``level``, (str): the level of AMP. Default: "O1".
    ``init_loss_scaling``, (float): the initial value of loss scaling. Default: 32768.0
    ``incr_every_n_steps``, (int): the number of steps for increasing loss scaling. Default: 1000
    ``decr_every_n_nan_or_inf``, (int): the number of steps for decreasing loss scaling. Default: 2
    ``incr_ratio``, (float): the ratio for increasing loss scaling. Default: 2.0
    ``decr_ratio``, (float): the ratio for decreasing loss scaling. Default: 2.0
    ``use_dynamic_loss_scaling``, (bool): whether to use dynamic loss scaling. Default: False
    ``custom_white_list``, (list): the list of names for which AMP will be applied. Default: []
    ``custom_black_list``, (list): the list of names for which AMP will not be applied. Default: []
    ``custom_black_varnames``, (list): the list of names for which AMP will not be applied. Default: []
    ``use_fp16_guard``, (bool): whether to use fp16 guard. Default: False
    ``use_bf16_guard``, (bool): whether to use bf16 guard. Default: False
    ``use_master_grad``, (bool): whether to use master grad. Default: False

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist

        >>> strategy = dist.Strategy()

        >>> strategy.amp.enable = True
        >>> strategy.amp.dtype = "float16"
        >>> strategy.amp.level = "O2"
)rG  r   s    rt   rg   Strategy.ampA  s    D yyrs   )rG  r8  rR  rV  rI  rA  rO  rD  rL  r>  rU  rw   )r  z_Config | Nonerf  rg  rf  r  )rf  zauto_strategy.ShardingConfig)rf  z!auto_strategy.GradientMergeConfig)rf  r.  )rf  zauto_strategy.PipelineConfig)rf  zauto_strategy.AMPConfig)rm   rn   ro   rp   r   r   r`  r   r7  rc   re   rd   rf   rg   rr   r  r  s   @rt   r  r  D
  s    *X9E 9Ev OD      6 $ $2 " "4  4 ! !rs   r  c                  $   \ rS rSrSr     S               SS jjrSS jrSS jrSS jrS r	SSS	 jjr
SSS
 jjrSSS jjrSSS jjrS rS r\S S j5       rSS jr  S!     S"S jjrS rS#S jrS rS rS rS rS rS rSrg)$	DistModelif  aL	  
`DistModel` is the model converted from a ``paddle.nn.layer`` with distributed
tensors as its parameters. It contains the static graph converted from a
``paddle.nn.layer`` whose parameters are distributed tensors (constructed
from ``paddle.distributed.shard_tensor``), and provides the APIs for training,
evaluation and prediction with the static graph.

It is suggested to generate DistModel by ``paddle.distributed.to_static``,
not directly by ``paddle.distributed.DistModel``.

Please first set the DistModel to "train", "eval" or "predict" mode with
``train()/eval()/predict()`` method and then use the ``__call__`` method for
training, evaluation and prediction respectively.

For more details of the usage, please refer to the sample code in
``paddle.distributed.to_static``.

Args:
    layer(paddle.nn.Layer): The layer in dygraph mode, whose parameters
        are distributed tensors generated by ``shard_tensor``.
    loader(ShardDataLoader|paddle.io.DataLoader): The data loader used in dygraph mode,
        used to infer inputs_spec and labels_spec.
    loss(Loss|Callable|None, optional): The loss function for training
        or evaluating the model. Can be a `paddle.nn.Layer` instance or
        any callable function. If loss is not None, DistModel will be set
        to "train" (when the optimizer is also not None) or "eval" mode
        (when optimizer is None) in default. If it is None, DistModel will
        be set to "predict" mode in default. Default: None.
    optimizer(paddle.optimizer.Optimizer|None, optional): The optimizer
        for training. If both optimizer and loss are set, DistModel will
        be set to "train" mode in default. Default: None.
    strategy(paddle.distributed.Strategy|None, optional): Configs for
        parallel strategies and optimization settings (e.g. sharding,
        pipeline parallelism). Default: None.
    input_spec(list[list[paddle.distributed.DistributedInputSpec]]|None, optional):
        The custom input specs specify the shape, dtype, and name information
        of model inputs and labels. If it is not None, the input specs and
        label specs will be inferred from the custom input specs. The custom
        input specs should be a list containing two sublists: the first
        sublist represents theinput specs, and the second sublist represents
        the label specs. Default: None.
Nc                   U R                  U5      U l        UR                  5       R                  5        VV	s0 s H  u  pXR                  _M     sn	nU l        U R
                  R                  5        VV	s0 s H  u  pX_M	     sn	nU l        [        R                  " S5      (       ac  [        R                  R                  R                  [        R                  5      R                  S5        [        R                   R#                  SS9  U(       a  UR$                  R&                  (       a  [)        U[*        5      (       a  [-        US5      (       a  [-        US5      (       a  [/        5       (       a  [)        UR0                  [2        5      (       d   S5       e[)        UR0                  [2        5      (       a/  UR0                  n
UR4                  n[7        XU R                  5      nO[        R8                  " S5        [;        XXFU R                  S	9U l        S U l        0 U l         Ub)  US
   U R<                  l!        US   U R<                  l"        O[)        U[F        5      (       a=  U R<                  RI                  U5      u  U R<                  l!        U R<                  l"        O^URJ                  RL                  nU R<                  RO                  URP                  S U5      u  U R<                  l!        U R<                  l"        [R        RT                  RV                  RY                  S5      S   U l-        U RZ                  (       d<  Ub  Ub  U R]                  5         g Ub  U R_                  5         g U Ra                  5         g g s  sn	nf s  sn	nf )NPOD_NAMEz0Distribute training by paddle.distributed.launchT)is_collectiver  ru  zKThe shard_fn should be ShardingStage1 when stage1 tensor fusion is enabled.z7Sharding tensor fusion only support ShardingStage1 now.)r   r   r.   FLAGS_enable_pir_api)1_DistModel__convert_strategy_inner_strategyr  r7  r0  _structured_to_parameter_name_parameter_to_structured_namer  getenvr   utils	log_utils
get_loggerloggingINFOinfor   initrc   r  r   rs  ro  r   r  r  ru  r@   warningr   _engine_mode_feed_name_list_inputs_spec_labels_specShardDataloader"_prepare_data_spec_from_dataloaderbatch_sampler
batch_size_prepare_data_specdatasetr   r   r   	get_flags_in_pir_moder\   r]   r^   )r   rZ  loaderr  ry  r   metrics
input_specr  r  rk  	inner_optr  s                rt   r   DistModel.__init__  s     $66x@"'"2"2"4":":"<.
"<$!AvvI"<.
* "??EEG.
GTQADG.
* 99Z  JJ  ++GLL9>>B JJOO$O/ !!669o66	;//	<00i11>BB 8B )--~>>$..%00	3)=)=	 M d6J6J
 
! !(21DLL%(21DLL%00 ??G))  --88J //j)) #KK11;;"

 " !!$)9

!		 "G.
.
s   N 6Nc                    U R                   R                  S   (       d  U R                   R                  SSS9  SU l        U R                   R	                  S5        [
        R                  " 5         g)z
Set the DistModel to "train" mode. In "train" mode,
executing ``__call__`` method will update the
parameters of the model and return the loss.
r\   Fmodeinit_parametersNr  _has_prepared_prepare_programr  to_moder   disable_staticr   s    rt   r\   DistModel.train  sR     ||))'2LL))w)N
W%rs   c                    U R                   R                  S   (       d  U R                   R                  SSS9  SU l        U R                   R	                  S5        [
        R                  " 5         g)zc
Set the mode of DistModel to "eval". In "eval" mode,
executing ``__call__`` will return the loss.
r]   Fr  Nr  r   s    rt   r]   DistModel.eval  sR    
 ||))&1LL))vu)M
V$rs   c                6   U R                   R                  S   (       dD  U R                   R                  [        R                  " U R                   R
                  5      SSSS9  SU l        U R                   R                  S5        [        R                  " 5         g)z
Set the mode of DistModel to "predict". In "predict" mode,
executing ``__call__`` returns a dict that contains the
outputs of the model.
r^   NFr  )
r  r  preparer   r   r  r  r  r   r  r   s    rt   r^   DistModel.predict  ss     ||)))4LL  dll778 %	 !  
Y'rs   c                |    Uc  U R                   c  [        S5      eUc  U R                   nUS;  a  [        S5      eU$ )Nz;Please set the mode or call train()/eval()/predict() first.r[   z.mode can only be 'train', 'eval' or 'predict'.)r  r   r   r  s     rt   __validate_modeDistModel.__validate_mode  sJ    <DJJ.M  <::D33MNNrs   c                Z    U R                  U5      nU R                  R                  U5      $ )ax  
Get the distributed main program of the specified ``mode``. Each
'mode' has its own distributed main program, ``dist_main_program``
returns the corresponding distributed main program of ``mode``.

Args:
    mode (str|None, optional): Can be 'train' , 'eval' , 'predict' or None.
        'train' : Return the distributed main program for training.
        'eval' : Return the distributed main program for evaluation.
        'predict' : Return the distributed main program for prediction.
        None : The current mode of the DistModel will be used.
        Default : None.

Returns:
    The distributed main program of ``mode``.
)_DistModel__validate_moder  get_dist_main_programr  s     rt   dist_main_programDistModel.dist_main_program  s)    " ##D)||11$77rs   c                Z    U R                  U5      nU R                  R                  U5      $ )a-  
Get the corresponding distributed startup program of ``mode``,
which is used for initializing the parameters.

Args:
    mode (str|None, optional): Can be 'train' , 'eval' , 'predict' or None.
        'train' : Return the distributed startup program for training.
        'eval' : Return the distributed startup program for evaluation.
        'predict' : Return the distributed startup program for prediction.
        None: The current mode of the DistModel will be used.
        Default : None.

Returns:
    The distributed startup program of ``mode``.
)r  r  get_dist_startup_programr  s     rt   dist_startup_programDistModel.dist_startup_program3  s)      ##D)||44T::rs   c                Z    U R                  U5      nU R                  R                  U5      $ )a  
Get the corresponding serial main program of ``mode``, containing
the whole variables and operators of the given ``layer``.

Args:
    mode (str|None, optional): Can be 'train', 'eval', 'predict' or None.
        'train' : Return the main program for training.
        'eval' : Return the main program for evaluation.
        'predict' : Return the main program for prediction.
        None : The current mode of the DistModel will be used.
        Default : None.

Returns:
    The serial main program of ``mode``.
)r  r  get_serial_main_programr  s     rt   serial_main_programDistModel.serial_main_programF  s)      ##D)||33D99rs   c                Z    U R                  U5      nU R                  R                  U5      $ )a  
Get the corresponding serial startup program of ``mode``.

Args:
    mode (str|None, optional): Can be 'train' , 'eval' , 'predict' or None.
        'train' : Return the serial startup program for training.
        'eval' : Return the serial startup program for evaluation.
        'predict' : Return the serial startup program for prediction.
        None : The current mode of the DistModel will be used.
        Default : None.

Returns:
    The serial startup program of ``mode``.
)r  r  get_serial_startup_programr  s     rt   serial_startup_program DistModel.serial_startup_programY  s)     ##D)||66t<<rs   c                   U R                   U R                  ;  d  U R                  U R                      / :X  a1  U R                  R                  5       U R                  U R                   '   U R                  U R                      n[	        U5      [	        U5      :w  a  [        SU S35      e/ n/ n[        U5       Hi  u  pV[        U[        R                  5      (       a4  [        U5      nUc  UR                  U5        ME  UR                  U5        MX  UR                  U5        Mk     / n[        U5       H  u  pYXT;  d  M  UR                  U	5        M     [        [        X5      5      $ )Nz@The input data and feed_list are not consistent.The model takes z	 as input)r  r  r  get_feed_name_listr   r   r   r   r   rF   r   r   r  ro  )
r   	data_listfeed_name_list	feed_listno_data_idsr   r   feed_varfeed_name_list_with_data	feed_names
             rt   _make_feedsDistModel._make_feedsk  s:   JJd222##DJJ/25/3||/N/N/PD  ,--djj9~#i.0##1"2)= 
 	 #9-IC$..(.#&&s+$$X.  & . $& '7NC%(//	: 8 C0<==rs   c           	        SS K nUc  g [        R                  " 5       n[        R                  R                  n[        R                  R                  U5      nUR                  5        H  n[        X6[        X5      5        M     UR                  R                  UR                  l
        [        UR                  SS5      (       a%  UR                  R                  R                  S5        [        UR                  SS5      (       a%  UR                  R                  R                  S5        UR                  " UR                  5      Ul        UR                  " UR                  5      Ul        UR                  " UR                   5      Ul        UR                  " UR"                  5      Ul        [%        US5      (       a!  UR                  " UR&                  5      Ul        [%        US5      (       a!  UR                  " UR*                  5      Ul        [%        US	5      (       a!  UR                  " UR.                  5      Ul        [%        US
5      (       a!  UR                  " UR2                  5      Ul        U$ )Nr   r0  FrZ  r1  r[  rL  rO  rR  rU  )r   r9  r  r:  r;  r\  r  r  r   rd   r/  r]  r   r   rg   rc   re   rf   ro  rL  rh   rO  ri   rR  rj   rU  rk   )r   r   r   inner_strategyrW  r_  r[  s          rt   __convert_strategyDistModel.__convert_strategy  s   &//1 **//#--II
 ##%CN)?@ &-5-B-B-I-I##*8((/5AA''99@@* 8((-??''99@@( "]]8<<8"&--0A0A"B(,h6M6M(N%"&--0A0A"B8\**'+}}X5H5H'IN$8/00-1]])).N* 8/00-1]])).N* 8/00-1]])).N* rs   c                @   U R                   c  [        S5      eU R                   S:X  a9  U R                  R                  b  U R                  R                  c  [        S5      eU R                   S:X  a"  U R                  R                  c  [        S5      e/ n[        U5       Hy  n[        U[
        [        45      (       a  U[        U5      -  nM.  [        U[        R                  [        R                  45      (       a  X#/-  nMd  [        S[        U5       35      e   U R                  U5      nU R                  R                  U5      nXPl        U R                   S:X  a   SU R                   ;   a  U R                   S   $ g S	U R                   ;   a  U R                   S	   $ g )
Nz+Please call train()/eval()/predict() first.r\   z7Please set optimizer and loss function before training.r]   z+Please set loss function before evaluation.z:The inputs of DistModel should be list or tensor, but got r^   r5  r  )r  r   r  
_optimizer_lossr   r   tupler   rF   r*   rx   	TypeErrorr   r  runouts)r   r(  r  	feed_itemfeedsr  s         rt   r  DistModel.__call__  sg   ::JKK:: ||&&.$,,2D2D2L M  ::||!!) !NOO	dI)dE]33T)_,	It7G7G'HII[(	PQUV_Q`Pab  $   +||&	::"DII%yy++"yy((rs   c                    U R                   R                  R                  U5        Uc"  [        U R                   R                  5      S-
  nU R                   R                  R                  U5        g)a  
Get the value of the variable with the given name.

Args:
    value (pir.Value): The pir Value to fetch.
    name (str|None, optional): The user-defined name of
        the fetched result. If None, the order of the Value
        in the fetch list will be used. Default: None.
Nr.   )r  _pir_fetch_valuesr   r   _pir_user_defined_fetch_names)r   r  r0  s      rt   _fetch_valueDistModel._fetch_value  sR     	&&--e4<t||556:D2299$?rs   c                `   [        5       (       aQ  [        R                  R                  5       nU R	                  U R
                  R                  S9R                  X5      nO2U R	                  U R
                  R                  S9R                  U5      nU R                  U5      nU R
                  R                  Gb  U(       Ga  [        R                  R                  R                  5          U R
                  R                  R                  5        GH  u  pgU GH  nUR                  5       u  u  p[        UR                  5       5      nU GHJ  n[!        X5      nUc  M  X\   nUR#                  5       (       d   SU SU S35       eUR$                  nUR&                  nSU;   a  UR)                  5       4[+        U
5      -  nOZ[+        U
5      S:X  a!  SnU
S	   R,                  nU
S
   R,                  nOSnSnSn[/        UR)                  5       [+        U
5      UUUS9n[1        [+        U
5      5       HP  n[3        UU   UU5      n[        R4                  " UU   UR)                  5       5        UUU
U   R6                  U-   '   MR     UR9                  U5        GMM     GM     GM     SSS5        UR                  5        Vs/ s H$  nUU R:                  ;   a  U R:                  U   OUPM&     nn[=        [?        U[        URA                  5       5      5      5      nU$ ! , (       d  f       N{= fs  snf )a  
Get the state dict of model and optimizer.

Args:
    mode (str): Can be ['opt', 'param', 'all'],
        'opt' :  The return value only contains the variable in the optimizer.
        'param' : The return value only contains the variable in the network, not the variable in the optimizer.
        'all' : The return value contains the variable in the network and optimizer.
        Default: 'all'
r  Nkey  value: is not a dist tensor._pow_acc   Tr   r.   F)
split_numsis_qkv	num_headsnum_key_value_heads)!r   r   staticglobal_scoper  r  r  r  _build_distributed_state_dictfused_ffn_qkvr   dygraphguardr7  r   r  r   ry   r   r   r|   r   local_num_headr%   rU  r!  r1  r0  poprx  r  ro  r6  )r   r  split_fusionscopelocal_state_dictdist_state_dictr[  pat_list
fusion_mapfused_paramori_params_metaorigin_paramsr   suffixr  r   r   r   r  r  r  r   r   r  mapping_namess                            rt   r  DistModel.state_dict  s$    ==MM..0E#55\\''  6  j%   $55\\''  6  j  <<=MN <<%%1l$$**,%)\\%?%?%E%E%GMC&.
<F<L<L<N97+(,_-A-A-C(D%2E%0%DF%1(7(>',}} !"&*5'?U$V!" (-'9'9-2-=-=
#-#7+0+=+=+?*AC(7E& +&C (+?';q'@154C,-5**8. )2 ?N,-?**8. )< 2748	>B(;*:(-(:(:(<363G/52;<O+&C */s?/C)DA2D(+Aj3&K %+MM(+A0H0H0J%&
 )4 %4(7(:(?(?&(H%& *E !0 3 3E :W &3 '/ &H -t %))+
 , ::: 2215 , 	 
 tO$:$:$<=>
  -,h
s   ,A0L EL?+L+
L(c                   U R                  U R                  R                  S9n[        5       (       a  [	        U5      nO,[	        X R                  R
                  U R                     5      nS n0 n[        R                  R                  R                  5          UR                  5        H%  u  pgXc;   d   SU SU S35       eU" XsU   5      XV'   M'     SSS5        U$ ! , (       d  f       U$ = f)zW
Args:
    local_state_dict(Dict[str, libpaddle.Tensor]): The state dict from program.
r  c                   [        U [        R                  [        R                  [        R
                  R                  45      (       d   e[        U [        R                  5      (       d  [        R                  " U 5      n [        U [        R                  5      (       d   SU  S[        U 5       S35       e[        U R                  5      [        US   5      :X  d   SU R                   SUS    S35       eU R                  n[        [        R                  " US   5      R                  US	   5      US
   S9n[        US   U5      n[        XU5      nUR                  5       R                  U R                  :X  d/   SUR                  5       R                   SU R                   35       e[        R                  " XR                  5       5        U$ )Nzlocal tensor:z type z is not paddle.Tensor.r   zlocal tensor shape z! not equal to dims_mapping shape r   r  process_shaper   )r   z! not equal to local_tensor.shape:)r   r   rF   r  ndarrayr   r   r   r   r   r  r  r=   r!  r|   r1  )r   r?  r  r   r   r   s         rt   build_distributed_tensorIDistModel._build_distributed_state_dict.<locals>.build_distributed_tensor[  s   v}}bjj&++:L:LM    lFMM::%}}\:lFMM:: ~VD4F3GG]^: |))*c)N2K.LL %l&8&8%99Z[des[tZuuvwL (--L?34<<o. $K0	D 'y'@$GJ,\LK++-33|7I7II %k&>&>&@&F&F%GGhiui{i{h|}I MM,(@(@(BCrs   zvar z not in dist attrs:r   N)r  r  r  r   r#   _dist_contextsr   r   r  r  r7  )r   r  r  
dist_attrsr  global_state_dictvar_namer~   s           rt   r  'DistModel._build_distributed_state_dictM  s    
 !228J8J2K==&'89J '!<<#>#>tzz#JJ	6 [[  &&($4$:$:$< - 8*$7
|1E- /Gx0/!+	 %= ) !  )( ! s   :C
C+c                    0 nU R                  U R                  R                  S9nU R                  SS9nSnU R                  (       a   U R                  R
                  R                  OSnU R                  R                  b	  U(       a  SnUR                  5        H  u  pxUR                  5       (       d   SU SU S35       eXt;   a  XG   n	UR                  UU   R                  :X  da  [        UR                  U	R                  5      (       d<   SUR                   S	U	R                   S
UR                   S	U	R                   S3	5       eXpR                  ;   a  U R                  U   OUn
[        UR                  5       5      X*'   M     U R                  R                   Gb  ["        R$                  R&                  R)                  5          U R                  R                   R                  5        GH  u  pU GHz  nUR                  5       u  u  p/ nUR                  5        H4  u  px[+        US   R,                  U5      nUc  M#  UR/                  U5        M6     [1        U5      S:X  a  Ms  U GH   n/ nU H_  nUR,                  U-   U;  a(  [2        R4                  " UR,                  U-    S35          O%UR/                  UUR,                  U-      5        Ma     [1        U5      [1        U5      :X  d  M  SU;   a  US   nOB[1        U5      S:X  a!  SnUS   R6                  nUS   R6                  nOSnS nS n[9        UUUUS9n[        U5      X.U-   '   U H  nUR;                  UU-   5        M     GM     GM}     GM     S S S 5        [=        5       (       a0  UR?                  U["        R@                  RC                  5       U5        g UR?                  U["        R@                  RC                  5       5        g ! , (       d  f       N|= f)Nr  F)r  Tr  r  r  zprocess_mesh: != z or placements:z
 not matchr   z is not in state_dict.r  r  r.   )r  r  r  )"r  r  r  r  rv  rc   r  r  r7  ry   r   r9   r   rw  r   r|   r  r   r   r  r  r   r0  r   r   warningswarnr  r"   r  r   set_state_dictr  r  )r   r  r  r  cur_state_dictcopy_tensorr  r  r  cur_v
param_namer[  r  r  r  r  suffix_namesr  concat_tensorsori_pfused_wr  r  r  s                           rt   r  DistModel.set_state_dict  s    228J8J2Ke< ##   ))>> 	
 <<"".3GK$$&DA99;;J$qc3I JJ;"&)~~*,"8LL%"2"2# #
 $ANN#348J8J7K?[\[g[gZhhlmrm}m}l~  I  J  ::: 2215 
 ,99I+J(! '& <<%%1$$**,%)\\%?%?%E%E%GMC&.
<F<L<L<N97+')$4$:$:$<DA%01C1H1H!%LF%1 , 3 3F ; %= |,1$&2F-/N)8#(::#6>N#N$,MM+0::+>*??U(V%& %*$2$9$9(8f9L(M%& *9  #>2c/6JJ#-#7.<Q.?G'*?';q'@154C,-5**8. )2 ?N,-?**8. )< 2748	>B(;.=(6/52;<O	/&G %2'$: !1v1E F .=E$4$8$8$H .=M '3 '/ &H -p ==,, &--"<"<"> ,, &--"<"<">{ -,s   A6O/B)O//B
O//
O=c                   U R                   R                  nUc  U$ [        U[        R                  R
                  R                  R                  5      (       a  UR                  n[        U[        5      (       d   S5       eU$ )NzUThe optimizer should be ShardingOptimizerStage1 when stage1 tensor fusion is enabled.)	r  r  r   r   r  rg   	decoratorOptimizerWithMixedPrecisionr@   )r   ry  s     rt   _get_shard_stage1_optimizer%DistModel._get_shard_stage1_optimizer  sy    LL++	MM''CC
 
 ",,I)%<== 	
c	
= rs   c           	        U R                   (       a   U R                   R                  R                  OSnU(       d   S5       eU R                  5       nUc   S5       eUR	                  5        Vs/ s H#  nXPR
                  ;   a  U R
                  U   OUPM%     nn[        [        U[        UR                  5       5      5      5      nU" XA5        UR	                  5        Vs/ s H#  nXPR                  ;   a  U R                  U   OUPM%     nn[        [        U[        UR                  5       5      5      5      nU$ s  snf s  snf )NFz:Can only convert state_dict when tensor fusion is enabled.z!The optimizer should not be None.)rv  rc   r  r  r  rw  r  ro  r   r6  rx  )r   r  optimizer_functionr  ry  r  parameter_namesstructured_namess           rt   !_convert_state_dict_tensor_fusion+DistModel._convert_state_dict_tensor_fusion  sZ    ##   ))>> 	 $ 	
H	
# 446	$I&II$  __&
 ' ::: 2215 ' 	 
 #otJ4E4E4G/HIJ
91  __&
 ' ::: 2215 ' 	 
 #.Z5F5F5H0IJK
-

s   .*D: *D?c                *    S nU R                  X5      $ )Nc                &    U R                  U5        g rw   )(convert_state_dict_with_rank_unique_namery  r  s     rt   r  ODistModel._convert_state_dict_with_rank_unique_name.<locals>.optimizer_function  s    >>zJrs   r  r   r  r  s      rt   )_convert_state_dict_with_rank_unique_name3DistModel._convert_state_dict_with_rank_unique_name  s     	K 55
 	
rs   c                *    S nU R                  X5      $ )Nc                &    U R                  U5        g rw   ).convert_state_dict_without_tensor_fusion_paramr  s     rt   r  UDistModel._convert_state_dict_without_tensor_fusion_param.<locals>.optimizer_function'  s    DDZPrs   r  r  s      rt   /_convert_state_dict_without_tensor_fusion_param9DistModel._convert_state_dict_without_tensor_fusion_param&  s     	Q 55
 	
rs   c                *    S nU R                  X5      $ )Nc                &    U R                  U5        g rw   )+convert_state_dict_with_tensor_fusion_paramr  s     rt   r  RDistModel._convert_state_dict_with_tensor_fusion_param.<locals>.optimizer_function/  s    AA*Mrs   r  r  s      rt   ,_convert_state_dict_with_tensor_fusion_param6DistModel._convert_state_dict_with_tensor_fusion_param.  s     	N 55
 	
rs   c                *    S nU R                  X5      $ )Nc                &    U R                  U5        g rw   )#convert_state_dict_with_origin_namer  s     rt   r  JDistModel._convert_state_dict_with_origin_name.<locals>.optimizer_function7  s    99*Ers   r  r  s      rt   $_convert_state_dict_with_origin_name.DistModel._convert_state_dict_with_origin_name6  s     	F 55
 	
rs   )r  r  r  rv  r  rx  rw  r  r  )rZ  rQ   r  zShardDataloader | DataLoaderr  !Layer | Callable[..., Any] | Nonery  Optimizer | Noner   Strategy | Noner  zlist[Metric] | Noner  'list[list[DistributedInputSpec]] | Nonerf  rg  )rf  rg  rw   )r  z_Mode | Nonerf  rL   )r(  zSequence[Any] | Tensorrf  r   )r   T)r  zLiteral['opt', 'param', 'all']r  r  rf  dict[str, Tensor])r  r8  rf  rg  )rm   rn   ro   rp   r   r   r\   r]   r^   r  r  r  r  r  r  ru  r   r  r  r  r  r  r  r  r   r&  r,  r2  rr   rl   rs   rt   rp  rp  f  s   )^ 37&*$('+>BUU -U 0	U
 $U "U %U <U 
Un 
  $	8(;&:&=$>B+Z $ $L@" 05!\,\ \ 
	\|2!heN"#J



rs   rp  c           	     v   [        U[        5      (       Gac  [        5       (       GdS  UR                  nUR                  nUR
                  nUGb+  Uc  [        R                  " 5       OUnUc   S5       e[        U[        5      (       a3  SUR                  l
        SUR                  l        XtR                  l        O[        U[        5      (       a3  SUR                  l
        SUR                  l        XtR                  l        Ow[        U[        5      (       aW  SUR                  l
        SUR                  l        XtR                  l        UR                   H  nUR!                  U5        M     O[#        S5      eUb  UR$                  (       a  ['        XX#XES9n	U	$ [(        R*                  R-                  U SS	9n U $ )
a  
Converts the ``layer`` with distributed tensor (constructed from
``paddle.distributed.shard_tensor``) to a static graph. ``to_static``
returns a DistModel instance containing the static graph for
distributed training, evaluation and prediction.

Args:
    layer(paddle.nn.Layer): The layer in dygraph mode, the parameters
        or its inputs can be distributed tensors.
    loader(ShardDataloader|paddle.io.DataLoader): The data loader used in dygraph mode,
        used to infer inputs_spec and labels_spec.
    loss(Loss|Callable|None, optional): The loss function for training
        or evaluating the model. Can be a `paddle.nn.Layer` instance or
        any callable function. Default: None.
    optimizer(paddle.optimizer.Optimizer|_ShardOptimizer|None, optional):
        The optimizer for training. It can `paddle.optimizer.Optimizer`
        or `_ShardOptimizer` wrapped by `shard_optimizer`. Default: None.
    strategy(paddle.distributed.Strategy|None, optional): Configs for
        parallel strategies and optimization settings (e.g. sharding,
        pipeline parallelism). Default: None.
    input_spec(list[list[paddle.distributed.DistributedInputSpec]]|None, optional):
        The custom input specs specify the shape, dtype, and name information
        of model inputs and labels. If it is not None, the input specs and
        label specs will be inferred from the custom input specs. The custom
        input specs should be a list containing two sublists: the first
        sublist represents theinput specs, and the second sublist represents
        the label specs. Default: None.

Returns:
    DistModel: A ``DistModel`` instance converted the input ``layer``.

Examples:
    .. code-block:: python

        >>> import numpy as np
        >>> import paddle
        >>> import paddle.distributed as dist
        >>> from paddle import nn
        >>> from paddle.distributed import Replicate, Shard

        >>> BATCH_SIZE = 4
        >>> BATCH_NUM = 4
        >>> IMAGE_SIZE = 16
        >>> CLASS_NUM = 8
        >>> class RandomDataset(paddle.io.Dataset): # type: ignore[type-arg]
        ...     def __init__(self, images, labels, num_samples):
        ...         self.images = images
        ...         self.labels = labels
        ...         self.num_samples = num_samples
        ...     def __getitem__(self, idx):
        ...         return self.images[idx], self.labels[idx]
        ...     def __len__(self):
        ...         return self.num_samples

        >>> class DemoNet(nn.Layer):
        ...     def __init__(self, mesh):
        ...         super().__init__()
        ...         self._mesh = mesh
        ...         self.linear_0 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE)
        ...         self.linear_1 = nn.Linear(IMAGE_SIZE, CLASS_NUM)
        ...         self.relu = nn.ReLU()
        ...         # shard the weights of this layer
        ...         self.linear_0.weight = dist.shard_tensor(
        ...             self.linear_0.weight,
        ...             self._mesh,
        ...             [Shard(1)],
        ...             stop_gradient=False,
        ...         )
        ...         self.linear_1.weight = dist.shard_tensor(
        ...             self.linear_1.weight,
        ...             self._mesh,
        ...             [Shard(0)],
        ...             stop_gradient=False,
        ...         )
        ...     def forward(self, x):
        ...         out = self.linear_0(x)
        ...         out = self.relu(out)
        ...         out = self.linear_1(out)
        ...         return out

        >>> # doctest: +REQUIRES(env:DISTRIBUTED)
        >>> images = np.random.rand(BATCH_SIZE, IMAGE_SIZE).astype('float32')
        >>> labels = np.random.rand(BATCH_SIZE, CLASS_NUM).astype('float32')
        >>> dataset = RandomDataset(images, labels, BATCH_SIZE)
        >>> loader = paddle.io.DataLoader(dataset, batch_size=BATCH_SIZE)

        >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
        >>> layer = DemoNet(mesh)
        >>> opt = paddle.optimizer.SGD(
        ...     learning_rate=0.1, parameters=layer.parameters()
        ... )
        >>> loss_fn = nn.MSELoss()
        >>> dist_loader = dist.shard_dataloader(loader, meshes=[mesh])
        >>> dist_model = dist.to_static(
        ...     layer, dist_loader, loss_fn, opt
        ... )
        >>> # training
        >>> dist_model.train()
        >>> for batch_id, (image, label) in enumerate(dist_loader()):
        ...     # in train mode, executing the __call__ method will
        ...     # update the parameters of the model and return the
        ...     # loss
        ...     loss = dist_model(image, label)

        >>> # evaluation
        >>> dist_model.eval()
        >>> for batch_id, (image, label) in enumerate(dist_loader()):
        ...     # in eval mode, executing the __call__ method will
        ...     # return the loss
        ...     loss = dist_model(image, label)

        >>> # prediction
        >>> dist_model.predict()
        >>> for batch_id, (image, label) in enumerate(dist_loader()):
        ...     # in predict mode, executing the __call__ method will
        ...     # return a dict that contains the outputs of the model,
        ...     # where the value of "out0" is the first output.
        ...     outs = dist_model(image)

        >>> # This case need to be executed in multi-card environment
        >>> # export CUDA_VISIBLE_DEVICES=0,1
        >>> # python -m paddle.distributed.launch {test_case}.py
z Sharding degree can not be None.Tr.      r  zdOnly sharding stage 1, 2 and 3 can to_static for now. User-defined shard_fn will be supported later.)r  F)r7  )r   rs  r   r  r  ru  r   r  r  rc   r/  stagedegreer  r  r  r  r  r7  rp  r   jitr  )
rZ  r  r  ry  r   r  rk  rA  r   
dist_models
             rt   r  r  ?  s   F )_--kmm&&#44((	*2*:t}}H
 #. 2. (N33+/!!(*+!!'+:!!(Hn55+/!!(*+!!'+:!!(Hn55+/!!(*+!!'+:!!(&66E//6 7 *z  8..4H

 

$$Uu$=rs   c                   [         R                  " 5       (       a  U R                  5       SL a  [        S5      eU R                  nU R
                  n[        R                  " 5       /[        U5      -  n[        XU5      n[        U [        5      (       a/  [        R                  " UR                  5       40 U R                  D6$ [         R                  " UR                  5       5      $ [         R                   R#                  5       (       aK  [         R$                  R'                  U R)                  5       U R*                  5      nU R-                  U5        U $ [/        S5      e)a  
Converts a distributed tensor to a dense tensor. ``unshard_dtensor``
first make the ``dist_tensor`` be ``Replicate`` state on all processes and
then converts it to a dense ``paddle.Tensor``. It can be treated as a
reverse operation of ``shard_tensor``.

Args:
    dist_tensor (paddle.Tensor): The distributed tensor which is constructed
        from a dense tensor with ``shard_tensor``.

Returns:
    paddle.Tensor: The original dense tensor of the input ``dist_tensor``.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed as dist
        >>> from paddle.distributed import Replicate, Shard

        >>> # doctest: +REQUIRES(env:DISTRIBUTED)
        >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
        >>> original_tensor = paddle.rand([4, 1024, 512])
        >>> dist_tensor = dist.shard_tensor(original_tensor, mesh, [Shard(0)])
        >>> # dense_tensor's shape is the same as original_tensor
        >>> dense_tensor = dist.unshard_dtensor(dist_tensor)
Fr$  z;`unshard_dtensor()` only supported in dynamic and pir mode.)r   r   ry   r   r   r   r   r   r   r  r   r   r   r|   r   rF   r   r   r   create_shaped_typer   r   r  r  )r   r   r   replicate_placementsr_dist_tensordense_tensor_types         rt   unshard_dtensorrD    s)   8   E)HII'' ++
 $ 01C
OC3GHk>22!--**,&& 
 ==!;!;!=>>				%	%	'	'"JJ99 1 1
 	./ "I
 	
rs   c                      \ rS rSrSr    S           SS jjrS rS rS rS r	S	 r
S
 rS r SS jrS rS rS rSrg)r  i+  a  
ShardDataloader converts a dataloader to a new dataloader which provided two capabilities:
1. split dataloader by shard_dim to do data parallel.
2. reshard the output of dataloader to distributed tensor.
if is_dataset_splitted is True, just need to do reshard.

Args:
    dataloader (paddle.io.DataLoader): The dataloader to be sharded.
    meshes (ProcessMesh|list[ProcessMesh]|tuple[ProcessMesh]): The mesh list of the dataloader.
        Identify which mesh the input is on. if len(meshes) == 1 or type(meshes) == ProcessMesh,
        all the inputs are on the same mesh.
    input_keys (list[str]|tuple[str]): if the iteration result of dataloader is a dict of tensors,
        input_keys is the keys of this dict, identify which tensor is located on which mesh,
        one-to-one correspondence with meshes. i.e. dict[input_keys[i]] is on meshes[i].
        Default: None, which means the outputs is a list, and the i'th input is on meshes[i].
    shard_dims (list|tuple|str|int]): The mesh dimension to shard the dataloader.
        Users can specify the shard_dim of each mesh or specify a single shard_dim for all meshes.
        Default: None, which means the data loader will not be split, i.e. mp.
    is_dataset_splitted (bool): Whether the dataset has been split.
    dense_tensor_idx (list): A paired 2D list specifies the index of the dense_tensor in the output of dataloader.
        It allows users to identify which elements within each output batch are dense_tensor.
        first dense_tensor: the dense_tensor return by dataloader.
        second dense_tensor: num_or_sections specifies how to split first tensor: evenly (if a number) or unevenly (if a list).
        Default: None, meaning all outputs are dist_tensors.
        Note: For dense_tensor_idx settings, the idx must be paired.
Nc                   USL a  Uc  [        S5      e[        U5      U l        U R                  b  [        U R                  5      S:X  a  [        S5      e[        R
                  " 5       nU R                  U5      (       a  [        SU SU R                   35      e[        U R                  5      S:H  U l        X0l        U R                  U5      U l
        U R                  U5      u  pUcJ  [        U R                  S   5      S   n[        U R                  S   5      S   n	Sn
UR                  U	5      nO"UR                  X5      n
UR                  U	5      nUSL d  Uc#  Xl        UR                  R                   U l        GO[#        UR                  [$        5      (       a4  UR                  R                   U l        UR                  U l        Xl        GOs['        UR                  R                   U-  5      U l        [#        UR                  [(        5      (       a  SnSnO,UR                  R*                  nUR                  R,                  n[%        UR.                  U R                   UU
UUS	9U l        UR                  R0                  U R                  l        [2        R4                  R7                  UR.                  U R                  UR8                  UR:                  UR<                  UR>                  UR@                  URB                  URD                  URF                  URH                  URJ                  URL                  S
9U l        SU R                  l'        S U l(        X`l)        g )NTz7shard_dims must be set when is_dataset_splitted is Truer   zmeshes must be setzprocess_id z* is in more than one mesh, the meshes are r.   F)r  r  num_replicasr  shuffle	drop_last)r  r  r  placesreturn_list
collate_fnnum_workersuse_buffer_readerprefetch_factoruse_shared_memorytimeoutworker_init_fnpersistent_workers)*r   r&   _meshesr   r   r   _process_id_in_multi_meshes_all_inputs_in_one_mesh_input_keys_process_shard_dims_shard_dims_get_mesh_and_shard_dimr  get_rank_by_dim_and_process_id_dataloaderr  r  r   r+   intr,   rH  rI  r  
_acc_stepsr   iorO   r  rJ  rK  rL  rM  rN  rO  rP  rQ  rR  _persistent_workers
pin_memoryr5  dense_tensor_idx)r   
dataloadermeshes
input_keys
shard_dimsis_dataset_splittedrb  
process_idr   r  dp_rankdp_world_sizerH  rI  s                 rt   r   ShardDataloader.__init__G  s    $&:+=I  v<<3t||#4#9122]]_
++J77j\)STXT`T`Sab  (+4<<'8A'=$%33J?66zB<4<<?+A.D 0 0 34Q7IG --i8M99)PG --i8M$&**<)(66AADO
002IJJ(66AADO!+!9!9D)!((33mCDO *224LMM!	$22::&44>>	!8"**??*#"D -7,D,D,O,OD)%yy33"**"00$..!((&22%00&22",">"> * : :",">">"**)88#-#A#A  4  D  ',#	 0rs   c                   [        U[        [        45      (       d  Uc  / n[        [	        U R
                  5      5       Hi  n[        U R
                  U   [        [        45      (       a-  UR                  U/[	        U R
                  U   5      -  5        MX  UR                  U5        Mk     U$ [	        U5      [	        U R
                  5      :w  a-  [        S[	        U5       S[	        U R
                  5       35      eU$ )Nz6shard_dims must be the same length as meshes, but got r  )
r   r]  r   rU  r   rT  r   r  r   r   )r   rf  resr   s       rt   rX  #ShardDataloader._process_shard_dims  s    j3*--1CC3t||,-dll1oe}==JJ
|c$,,q/.BBCJJz*	 .
 J:#dll"33 LSQ[_L]]abefjfrfrbsatu  rs   c                   [        [        U R                  5      5       H  n[        U R                  U   [        [
        45      (       aq  [        [        U R                  U   5      5       HJ  nXR                  U   U   R                  ;   d  M$  U R                  U   U   U R                  U   U   4s  s  $    M  XR                  U   R                  ;   d  M  U R                  U   U R                  U   4s  $    g)N)NN)rU  r   rT  r   r   r  _process_idsrY  )r   rh  r   js       rt   rZ  'ShardDataloader._get_mesh_and_shard_dim  s    s4<<()A$,,q/D%=99s4<<?34A!\\!_Q%7%D%DD#||Aq143C3CA3Fq3III 5 a!=!==<<?D,<,<Q,??? * rs   c                   Sn/ nU R                    HB  n[        U[        [        45      (       a  UR	                  U5        M1  UR                  U5        MD     [        [        U5      5      nU H  nXR                  ;   d  M  US-  nM     US:  $ Nr   r.   )rT  r   r   r  extendr   r  rp  )r   rh  countflatten_meshesr   unique_meshess         rt   rU  +ShardDataloader._process_id_in_multi_meshes  s    LLD$u..%%d+%%d+	 ! S01!D...
 " qyrs   c                ,    [        U R                  5      $ rw   )r   r\  r   s    rt   __len__ShardDataloader.__len__  s    4##$$rs   c                    S U l         U $ rw   r5  r   s    rt   __iter__ShardDataloader.__iter__      	rs   c                   U R                   (       a  U R                  S   OU R                  U   nUb'  [        5       (       d  [        R                  " S5      /nO[        R
                  " 5       /nU R                   (       a  U R                  S   OU R                  U   n[        S[        UR                  5      5       H'  nUR                  [        R
                  " 5       5        M)     XC4$ rt  )rV  rY  r1   r   r  r   rT  rU  r   r`  r   )r   r   r  r   r   rH  s         rt   _get_mesh_and_placement'ShardDataloader._get_mesh_and_placement  s     ++ Q!!%( 	
  ):):**Q-J..*+J ++ LLOe$ 	
 q#dkk*+Adnn./ ,rs   c                   U R                   (       a'  U R                  S   /U-  nU R                  S   /U-  nOU R                  U   n[        U[        [
        45      (       a  [        U5      U:X  d   eOU/U-  nU R                  U   n[        U[        [
        45      (       a  [        U5      U:X  d   eOU/U-  n/ n[        U5       H  nXF   b'  [        5       (       d  [        R                  " S5      /nO[        R                  " 5       /n[        S[        X6   R                  5      5       H'  nUR                  [        R                  " 5       5        M)     UR                  U5        M     X54$ rt  )rV  rT  rY  r   r   r  r   rU  r1   r   r  r   r`  r   )	r   r   lengthrd  rf  r   r   r   rH  s	            rt   )_get_meshes_and_placements_for_list_input9ShardDataloader._get_meshes_and_placements_for_list_input  sA   ''ll1o&/F**1-.7J\\%(F&4-006{f,,, F*))%0J*tUm44:&000(\F2

vA}(1B1B!ZZ]O	!^^-.	1c&)"2"234  !12 5i(  !!rs   c           	         / n[        [        U5      5       Hc  nUb  Xd;   d!  [        X   [        R                  5      (       d  UR                  X   5        MA  UR                  [        X   X&   X6   5      5        Me     U$ rw   )rU  r   r   r   rF   r   r!  )r   list_tensorsrd  r   rb  	dist_datarq  s          rt   _dtensors_from_list_input)ShardDataloader._dtensors_from_list_input  sv     	s<()A ,1F??  1  &$JM * rs   c           	        [        U[        [        45      (       Gai  U R                  SL a$  [	        U5      [	        U R
                  5      :X  d   e/ n[        [	        U5      5       GH  nX   n[        U[        [        45      (       a_  U R                  U[	        U5      5      u  nnU R                  c  S OU R                  U   nUR                  U R                  XEXg5      5        M  [        U[        R                  5      (       ad  U R                  b&  U R                  U   / :w  a  UR                  U5        M  U R                  U5      u  pUR                  [        XHU5      5        GM  [        S[!        U5       35      e   U$ [        U["        5      (       GaS  U R$                  c  UR'                  5       OU R$                  n	U R                  SL a$  [	        U	5      [	        U R
                  5      :X  d   e0 n[)        U	5       H  u  p:X   n[        U[        [        45      (       aR  U R                  U[	        U5      5      u  nnU R                  c  S OU R                  U   nU R                  XEXg5      X*'   Mv  [        U[        R                  5      (       aK  U R                  b  U R                  U   / :w  a  XBU
'   M  U R                  U5      u  p[        X   X5      X*'   M  XBU
'   M     U$ [        U[        R                  5      (       a  U R                  S5      u  p[        XU5      $ [        S[!        U5       35      e)NFzUnsupported input_data type r   zUnsupported batch_data type )r   r   r  rV  r   rT  rU  r  rb  r   r  r   rF   r  r!  r   r   r  rW  r  r   )r   
batch_datadist_batch_datar   
input_datard  r   _dense_tensor_idxr   re  r[  s              rt   
_get_batchShardDataloader._get_batch  s   j4-00++u4:#dll*;;;; O3z?+']
j4-88 FF3z?"  008 !2215 &
 $**66&

  
FMM::--9 11!4:'..z:+/+G+G+J('...zL %6tJ7G6HI ? ,D #"
D)) ##+ !%% 
 ++u4:#dll*;;;; O#J/'_
j4-88 FF3z?"  008 !2215 &
 ,0+I+I"J,O(  
FMM::--9 11!4:/9,+/+G+G+J(/A&OT0, ,6C(; 0< #"
FMM22#;;A>D%j
CC;D<L;MNOOrs   c                    U R                   c  U R                  R                  5       U l         [        U R                   5      nU R	                  U5      $ rw   )r5  r\  r  r4  r  )r   r  s     rt   __next__ShardDataloader.__next__d  s>    99((113DI$))_
z**rs   c                    S U l         U $ rw   r~  r   s    rt   r  ShardDataloader.__call__j  r  rs   )	rV  r\  rW  rT  rY  r  r  rb  r5  NNFN)rc  zpaddle.io.DataLoaderrd  z4ProcessMesh | list[ProcessMesh] | tuple[ProcessMesh]re  zlist[str] | tuple[str] | Nonerf  zlist | tuple | str | int | Nonerg  r  rb  list[list[int]] | Nonerw   )rm   rn   ro   rp   r   r   rX  rZ  rU  r{  r  r  r  r  r  r  r  rr   rl   rs   rt   r  r  +  s    > 596:$)37S1(S1 ES1 2	S1
 4S1 "S1 1S1j 	 %
 &"8 BF"TPl+rs   r  c                "    [        U UUUUU5      $ )a!  
Convert the dataloader to a ShardDataloader which provided two capabilities:
1. split dataloader by shard_dim to do data parallel if it it not None.
2. reshard the output of dataloader to distributed tensor.
if is_dataset_splitted is True, it means that the dataset has been split by users, and just need to do reshard.
only if is_dataset_splitted is False and shard_dims is not None, it will do split.

Args:
    dataloader (paddle.io.DataLoader): The dataloader to be sharded. the output of dataloader
        must be a list or dict of paddle.Tensor with 2 elements, i.e. [input_data, label] or
        {"input_data": input_data, "label": label}, input_data and label can be a list to support multiple inputs.
    meshes (ProcessMesh|list[ProcessMesh]|tuple[ProcessMesh]): The mesh list of the dataloader.
        Identify which mesh the input is on. if len(meshes) == 1 or type(meshes) == ProcessMesh,
        all the inputs are on the same mesh.
    input_keys (list[str]|tuple[str]): if the iteration result of dataloader is a dict of tensors,
        input_keys is the keys of this dict, identify which tensor is located on which mesh,
        one-to-one correspondence with meshes. i.e. dict[input_keys[i]] is on meshes[i].
        Default: None, which means the outputs is a list, and the i'th input is on meshes[i].
    shard_dims (list(str)|tuple(str)|list(int)|tuple(int)|str|int]):
        The mesh dimension to shard the dataloader.
        Users can specify the shard_dim of each mesh or specify a single shard_dim for all meshes.
        Default: None, which means the data loader will not be split, i.e. mp.
    is_dataset_splitted (bool): Whether the dataset has been split, Default: False.
    dense_tensor_idx (list): A paired 2D list specifies the index of the dense_tensor in the output of dataloader.
        It allows users to identify which elements within each output batch are dense_tensor.
        first dense_tensor: the dense_tensor return by dataloader.
        second dense_tensor: num_or_sections specifies how to split first tensor: evenly (if a number) or unevenly (if a list).
        Default: None, meaning all outputs are dist_tensors.
        Note: For dense_tensor_idx settings, the idx must be paired.
Returns:
    ShardDataloader: The sharded dataloader.

Examples:
    .. code-block:: python
        :name: example-1

        >>> import os
        >>> import numpy as np
        >>> import paddle
        >>> import paddle.distributed as dist
        >>> from paddle.io import BatchSampler, DataLoader, Dataset

        >>> # doctest: +REQUIRES(env:DISTRIBUTED)
        >>> mesh0 = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=['x', 'y'])
        >>> mesh1 = dist.ProcessMesh([[4, 5], [6, 7]], dim_names=['x', 'y'])

        >>> paddle.seed(1024)
        >>> np.random.seed(1024)
        >>> class RandomDataset(Dataset): # type: ignore[type-arg]
        >>>     def __init__(self, seq_len, hidden, num_samples=8):
        ...         super().__init__()
        ...         self.seq_len = seq_len
        ...         self.hidden = hidden
        ...         self.num_samples = num_samples
        ...         self.inputs = [np.random.uniform(size=[self.seq_len, self.hidden]).astype("float32") for _ in range(num_samples)]
        ...         self.labels = [np.array(index, dtype="float32") for index in range(num_samples)]

        ...     def __getitem__(self, index):
        ...         return self.inputs[index], self.labels[index]

        ...     def __len__(self):
        ...         return self.num_samples

        >>> class MlpModel(paddle.nn.Layer):
        ...     def __init__(self):
        ...         super(MlpModel, self).__init__()
        ...         self.w0 = dist.shard_tensor(
        ...             self.create_parameter(shape=[8, 8]),
        ...             mesh0, [dist.Replicate(), dist.Shard(1)])
        ...         self.w1 = dist.shard_tensor(
        ...             self.create_parameter(shape=[8, 8]),
        ...             mesh1, [dist.Replicate(), dist.Shard(0)])

        ...     def forward(self, x):
        ...         y = paddle.matmul(x, self.w0)
        ...         y = dist.reshard(y, mesh1, [dist.Shard(0), dist.Shard(2)])
        ...         z = paddle.matmul(y, self.w1)
        ...         return z

        >>> model = MlpModel()
        >>> dataset = RandomDataset(4, 8)
        >>> sampler = BatchSampler(
        ...     dataset,
        ...     batch_size=2,
        ... )
        >>> dataloader = DataLoader(
        ...     dataset,
        ...     batch_sampler=sampler,
        ... )
        >>> dist_dataloader = dist.shard_dataloader(
        ...     dataloader=dataloader,
        ...     meshes=[mesh0, mesh1],
        ...     shard_dims="x"
        ... )
        >>> opt = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters())
        >>> dist_opt = dist.shard_optimizer(opt)
        >>> def loss_fn(logits, label):
        ...     # logits: [bs, seq_len, hidden], label: [bs]
        ...     loss = paddle.nn.MSELoss(reduction="sum")
        ...     logits = paddle.sum(logits, axis=[1, 2])
        ...     return loss(logits, label)

        >>> RUN_STATIC = eval(os.environ['RUN_STATIC'])
        >>> def run_dynamic():
        ...     for step, (input, label) in enumerate(dist_dataloader()):
        ...         logits = model(input)
        ...         loss = loss_fn(logits, label)
        ...         print("step:{}, loss:{}".format(step, loss))
        ...         loss.backward()
        ...         dist_opt.step()
        ...         dist_opt.clear_grad()

        >>> def run_static():
        ...     dist_model = dist.to_static(
        ...         model, dist_dataloader, loss_fn, opt
        ...     )
        ...     dist_model.train()
        ...     for step, (input, label) in enumerate(dist_dataloader()):
        ...         print("label:", label)
        ...         loss = dist_model(input, label)
        ...         print("step:{}, loss:{}".format(step, loss))

        >>> if RUN_STATIC == 0:
        ...     run_dynamic()
        ... else:
        ...     run_static()

        >>> # This case need to be executed in multi-card environment
        >>> # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        >>> # RUN_STATIC=1 python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" {test_case}.py
        >>> # RUN_STATIC=0 python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" {test_case}.py

    .. code-block:: python
        :name: example-2

        >>> import paddle
        >>> import paddle.distributed as dist
        >>> from paddle.io import BatchSampler, DataLoader, Dataset
        >>> import numpy as np
        >>> mesh0 = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=['dp', 'mp'])
        >>> mesh1 = dist.ProcessMesh([[4, 5], [6, 7]], dim_names=['dp', 'mp'])
        >>> class RandomDataset(Dataset): # type: ignore[type-arg]
        ...     def __init__(self, seq_len, hidden, num_samples=8):
        ...         super().__init__()
        ...         self.seq_len = seq_len
        ...         self.hidden = hidden
        ...         self.num_samples = num_samples
        ...         self.inputs1 = [
        ...             np.random.uniform(size=[self.seq_len, self.hidden]).astype(
        ...                 "float32"
        ...             )
        ...             for _ in range(num_samples)
        ...         ]
        ...         self.inputs2 = [
        ...             np.random.uniform(size=[self.seq_len, self.hidden]).astype(
        ...                 "float32"
        ...             )
        ...             for _ in range(num_samples)
        ...         ]
        ...         self.labels = [
        ...             np.array(index, dtype="float32") for index in range(num_samples)
        ...         ]
        ...     def __getitem__(self, index):
        ...         return {
        ...             "inputs": [self.inputs1[index], self.inputs2[index]],
        ...             "label": self.labels[index],
        ...         }
        ...     def __len__(self):
        ...         return self.num_samples

        >>> dataset = RandomDataset(4, 8)
        >>> sampler = BatchSampler(
        ...     dataset,
        ...     batch_size=2,
        ... )
        >>> dataloader = DataLoader(
        ...     dataset,
        ...     batch_sampler=sampler,
        ... )
        >>> dist_dataloader = dist.shard_dataloader(
        ...     dataloader=dataloader,
        ...     meshes=[mesh0, mesh1],  # or [[mesh0, mesh0], mesh1]
        ...     shard_dims="dp",
        ...     input_keys=["inputs", "label"],
        ... )
)r  )rc  rd  re  rf  rg  rb  s         rt   shard_dataloaderr  p  s$    F  rs   c                 Z    [         R                  R                  R                  S5      S   $ )N%FLAGS_enable_auto_parallel_align_mode)r   r   r   r  rl   rs   rt   r  r  =  s*    ;;  **/-/ /rs   c                     [        5         g)a<  
Enables an automated Data Parallel (DP) setup for auto-parallel training.

This function simplifies the process of implementing vanilla (standard) Data
Parallelism within the auto-parallel framework. By calling ``enable_auto_dp()``,
users can achieve data parallel training without needing to manually configure
``paddle.distributed.shard_dataloader`` (or a similar distributed dataloader
interface) for DP-specific data sharding or distribution. This mode automates
the setup required for DP communication and data handling.

The function works by setting the related environment variable
to ``1``. This signals to the auto-parallel system that it should
automatically manage the data parallelism aspects of the training process
according to a predefined strategy.

A significant advantage of this automated DP mode is its inherent robustness
and ability to handle scenarios that can be challenging for manual or other
standard DP configurations. For instance, it is particularly effective for:

- Training models where input data may have non-uniform shapes across
  different data parallel ranks (e.g., certain video generation models
  like Wanx). In such cases, where traditional DP might lead to program
  hangs due to shape mismatches during communication, this automated mode
  employs strategies (like adjusting data representation and gradient
  synchronization) to ensure smooth training.

In essence, ``enable_auto_dp()`` provides two key benefits:

1. **Simplified DP Setup:** Automates the configuration for basic data
   parallelism, reducing manual setup effort (e.g., no need for manual
   ``shard_dataloader`` DP configuration).
2. **Robustness for Complex Cases:** Effectively handles advanced scenarios
   like non-uniform input shapes.

Note:
    This function should typically be called at the very beginning of your
    training script, prior to initializing Paddle's distributed environment
    or any auto-parallel components. The underlying auto-parallel framework,
    including its data loading and optimizer components, must be designed to
    recognize and act upon the environment variable.

Examples:
    .. code-block:: python

        >>> import numpy as np
        >>> import paddle
        >>> from paddle import nn
        >>> import paddle.distributed as dist
        >>> from paddle.io import Dataset, DataLoader

        >>> # doctest: +REQUIRES(env:DISTRIBUTED)
        >>> dist.enable_auto_dp()

        >>> BATCH_SIZE = 32
        >>> CLASS_NUM = 10
        >>> INPUT_DIM = 256
        >>> STEPS = 100

        >>> class RandomDataset(Dataset):  # type: ignore[type-arg]
        ...     def __init__(self, num_samples):
        ...         rank = dist.get_rank() if dist.get_world_size() > 1 else 0
        ...         np.random.seed(42 + rank)
        ...         self.num_samples = num_samples
        ...     def __getitem__(self, idx):
        ...         x = np.random.rand(INPUT_DIM).astype('float32')
        ...         y = np.random.randint(0, CLASS_NUM, (1,)).astype('int64')
        ...         return x, y
        ...     def __len__(self):
        ...         return self.num_samples

        >>> class SimpleNet(nn.Layer):
        ...     def __init__(self):
        ...         super().__init__()
        ...         self.net = nn.Sequential(
        ...             nn.Linear(INPUT_DIM, 102400),
        ...             nn.Linear(102400, INPUT_DIM),
        ...             nn.Linear(INPUT_DIM, CLASS_NUM),
        ...         )
        ...     def forward(self, x):
        ...         return self.net(x)

        >>> model = SimpleNet()
        >>> optimizer = paddle.optimizer.AdamW(learning_rate=1e-3, parameters=model.parameters())
        >>> dataset = RandomDataset(num_samples=STEPS * BATCH_SIZE)
        >>> loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)

        >>> model.train()
        >>> for step, (x, y) in enumerate(loader):
        ...     y.stop_gradient = True
        ...     loss = paddle.mean(model(x))
        ...     loss.backward()
        ...     optimizer.step()
        ...     model.clear_gradients()
        ...     if step % 5 == 0:
        ...         print(f"[step {step}] loss: {loss.item():.4f}")

        >>> # This case need to be executed in multi-card environment
        >>> # export CUDA_VISIBLE_DEVICES=0,1
        >>> # python -m paddle.distributed.launch {test_case}.py

N)r/   rl   rs   rt   enable_auto_dpr  C  s    L rs   )r~   zpaddle.Tensor)NNN)r   z+Tensor | TensorLike | NestedNumericSequencer   r   r   Sequence[Placement]r   zDTypeLike | Noner   zPlaceLike | Noner   zbool | Nonerf  rF   )r   )r'  zCallable[..., Tensor]r   r   r   r  r(  r   r)  r   rf  rF   )r   rF   r   r   r   r  rf  rF   )rZ  rQ   r   r   rk  z0Callable[[str, Layer, ProcessMesh], None] | Nonera  1Callable[[Any, ProcessMesh], list[Tensor]] | Nonere  r  rf  rQ   rn  r>  )ry  r-   rk  z.Callable[[str, Tensor, Tensor], Tensor] | Noner  r]  rf  rs  )r+  rK   rf  rK   r  )rZ  rQ   r  z#ShardDataloader | DataLoader | Noner  r4  ry  r5  r   r6  r  r7  rf  rp  )r   rF   rf  rF   r  )rc  rO   rd  z#ProcessMesh | Sequence[ProcessMesh]re  zSequence[str] | Nonerf  z0Sequence[str] | Sequence[int] | str | int | Nonerg  r  rb  r  rf  r  )
__future__r   r   r}  r  r  collectionsr   typesr   typingr   r   r   r	   numpyr  r   paddle.distributedrV  r   r
   r   r   paddle.amp.auto_castr   paddle.amp.grad_scalerr   paddle.autogradr   paddle.baser   paddle.base.dygraph.baser   paddle.base.frameworkr   r   r   r   r   r   r    paddle.distributed.auto_parallelr   r   r9  *paddle.distributed.auto_parallel.interfacer   r   -paddle.distributed.auto_parallel.process_meshr   2paddle.distributed.auto_parallel.static.completionr   4paddle.distributed.auto_parallel.static.dist_contextr   /paddle.distributed.auto_parallel.static.dist_opr    -paddle.distributed.auto_parallel.static.utilsr!   r"   r#   r$   r%   r&   3paddle.distributed.fleet.utils.tensor_fusion_helperr'   r(   r)   paddle.frameworkr*   "paddle.io.dataloader.batch_samplerr+   r,   paddle.optimizerr-   auto_dp_utilsr/   r0   r1   r  r2   r3   r4   r5   r6   r7   r8   placement_typer9   r:   r;   r<   r=   randomr>   r?   rc   r@   rA   rB   collections.abcrC   rD   typing_extensionsrE   rF   paddle._typingrG   rH   rI   rJ   
paddle.amprK   rL   rM   7paddle.distributed.auto_parallel.static.dist_input_specrN   	paddle.iorO   paddle.metricrP   	paddle.nnrQ   r:  rR   rS   rT   rU   rV   rW   rX   rY   rZ   r_   rq   ra   r   r   r   r   r   r   r  r   r  r   r!  r%  r*  r  rm  rq  rs  r  r  r  r  r  r  r,  r.  
BaseConfigr  rp  r  rD  r  r  r  r  rl   rs   rt   <module>r     sV   #   	  #  9 9   ! " " 1 1 # # ;  % N F P  
 " ' 
    /  2+  &-, %$
 
 
 9:E9:	/)5 	/&
>$t"" >$P #"!%OA
5OA
OA $OA 	OA
 OA OA OAdag aH+" 9;G
TxG xx KO#
L
$
"2"2
"2 $"2 	"2
 "2 "2Pww*w8Kwwz BFBFCGT
T
T
 ?T
 @	T

 AT
 T
n
*c5i c5Ls/ s/l( =@' =@@?' ?DY@' Y@| @D'(0M0M<0M "%0M 	0MfL`A A*_}'' _D	V
 V
v 37.2"& $:>kk/k ,k  	k
 k 8k k\;
|B BP
 (,CG %/3JJ/J %J A	J
 J -J JZ/frs   