
    Αio                        S SK Jr  S SKJrJr  \(       a  S SKJr  S SKrS SKJ	r
  S SKrS SKJr  S SKJr  SS jr " S S	5      r " S
 S5      rS rS rSSSSSSS.               SS jjrg)    )annotations)TYPE_CHECKINGCallable)IterableNFullyShardAuto
FullyShardc                 `    [        [        R                  R                  R                  SS5      $ )N_in_auto_parallel_F)getattrpaddlebase	framework
global_var     c/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/fsdp/fully_shard.pyin_auto_parallel_moder      s'    ((*> r   c                  N    \ rS rSr% SrS\S'   SrS\S'   SrS\S'   SrS\S	'   S
r	g)MixedPrecisionPolicy%   Nzpaddle.dtype | Noneparam_dtypereduce_dtypeoutput_dtypeTboolcast_forward_inputsr   )
__name__
__module____qualname____firstlineno__r   __annotations__r   r   r   __static_attributes__r   r   r   r   r   %   s.    '+K$+(,L%,(,L%, $$r   r   c                  $    \ rS rSr% SrS\S'   Srg)OffloadPolicy-   Tr   
pin_memoryr   N)r   r   r    r!   r'   r"   r#   r   r   r   r%   r%   -   s    Jr   r%   c                    [        U 5      $ Nr	   modulemeshreshard_after_forwardshard_placement_fn	mp_policyoffload_policyignored_paramss          r   _fully_shard_manual_parallelr2   1   s     fr   c                    [        X5        U $ r)   r   r*   s          r   _fully_shard_auto_parallelr4   =   s     6 Mr   )r,   r-   r.   r/   r0   r1   c                   Uc
  [        5       nUc
  [        5       nU(       a  [        U5      O	[        5       nU UUUUUU4n[        5       (       a  [	        U6 $ [        U6 $ )a	  
Apply fully sharded data parallel (FSDP) to the given module.

This function wraps the input module with fully sharded data parallelism, which shards
model parameters, gradients, and optimizer states across multiple devices. It supports
both auto_parallel mode and manual_parallel mode.

Args:
    module (Layer): The neural network module to be wrapped with fully sharded data parallelism.
    mesh (dist.ProcessMesh, optional): The process mesh defining the device arrangement for sharding.
        Defaults to None, which uses the default mesh.
    reshard_after_forward (bool | int | None, optional): Controls when to reshard the parameters after forward pass.
        If True or 1, reshard after each forward pass. If False or 0, keep sharded.
        If None, use default strategy. Defaults to None.
    shard_placement_fn (Callable[[paddle.Tensor], dist.Shard | None] | None, optional):
        A function that determines how each tensor should be sharded. Takes a tensor as input
        and returns a Shard placement or None. If None, uses default sharding strategy.
        Defaults to None.
    mp_policy (MixedPrecisionPolicy | None, optional): Mixed precision policy configuration.
        If None, creates a default MixedPrecisionPolicy. Defaults to None.
    offload_policy (OffloadPolicy | None, optional): Offload policy configuration for CPU offloading.
        If None, creates a default OffloadPolicy. Defaults to None.
    ignored_params (Iterable[paddle.Tensor] | None, optional): Parameters that should not be sharded.
        These parameters will be kept in full precision and not distributed. Defaults to None.

Returns:
    module: A wrapper module that applies FSDP to the input module.

Examples:
    .. code-block:: python

        >>> # type: ignore
        >>> # doctest: +REQUIRES(env:DISTRIBUTED)
        >>> # python -m paddle.distributed.launch --device=0,1 train.py
        >>> import paddle
        >>> import paddle.distributed as dist
        >>> from paddle.distributed.fsdp import fully_shard

        >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
        >>> model = paddle.nn.Linear(10, 10)
        >>> inputs = paddle.rand(shape=[10, 10])
        >>> inputs = dist.shard_tensor(inputs, mesh, [dist.Shard(0)])
        >>> opt = paddle.optimizer.AdamW(parameters=model.parameters())
        >>> model = fully_shard(model, mesh)
        >>> tr_loss = model(inputs)
        >>> tr_loss.backward()
        >>> opt.step()
        >>> opt.clear_grad()
)r   r%   setr   r4   r2   )	r+   r,   r-   r.   r/   r0   r1   ignored_params_setargss	            r   fully_shardr9   J   sr    x (*	&-N35 
 	D )400+T22r   )returnr   )r+   paddle.nn.Layerr,   zdist.ProcessMeshr-   zbool | int | Noner.   z3Callable[[paddle.Tensor], dist.Shard | None] | Noner/   zMixedPrecisionPolicy | Noner0   zOffloadPolicy | Noner1   zIterable[paddle.Tensor] | Noner:   r;   )
__future__r   typingr   r   collections.abcr   r   paddle.distributeddistributeddist,paddle.distributed.auto_parallel.fully_shardr   Ipaddle.distributed.fleet.meta_parallel.sharding.group_sharded_fully_shardr
   r   r   r%   r2   r4   r9   r   r   r   <module>rD      s    # *(%  G
% % 	
  "/3-1+/59Q3Q3 Q3 -	Q3
Q3 +Q3 )Q3 3Q3 Q3r   