
    ΑiY                    t    S SK Jr  S SKJrJr  S SKrS SKJr  S SK	J
r
  \(       a  S SKJrJr   " S S\
5      rg)    )annotations)TYPE_CHECKINGAnyN)Layer)	PlacementProcessMeshc                  F   ^  \ rS rSrSr      SU 4S jjrSS jrSrU =r$ )
LocalLayer   a"  
The `LocalLayer` class is a specialized `Layer` for managing distributed tensors during
forward and backward passes in a parallelized training environment. It converts distributed tensors
to local tensors for computation and then back to distributed tensors as output, ensuring seamless
integration with distributed parallelism frameworks.

Args:
    out_dist_attrs (list[tuple[ProcessMesh, list[Placement]]]):
        A list where each entry is a tuple containing the `ProcessMesh` and the list of `Placement`
        attributes for the corresponding output tensors. These attributes define the distribution
        strategy for the outputs.
    grad_dist_attrs (list[tuple[ProcessMesh, list[Placement]]]):
        Similar to `out_dist_attrs` but for gradient tensors. The tuple in the list can be None, indicating that the dist_attr of the gradient tensor is same as the corresponding input tensor.

Examples:
    .. code-block:: python

        >>> from __future__ import annotations

        >>> import paddle
        >>> import paddle.distributed as dist
        >>> from paddle import Tensor
        >>> from paddle.distributed import ProcessMesh

        >>> class CustomLayer(dist.LocalLayer):
        ...     def __init__(self, out_dist_attrs, grad_dist_attrs):
        ...         super().__init__(out_dist_attrs, grad_dist_attrs)
        ...         self.local_result = paddle.to_tensor(0.0)

        ...     def forward(self, x):
        ...         mask = paddle.zeros_like(x)
        ...         if dist.get_rank() == 0:
        ...             mask[1:3] = 1
        ...         else:
        ...             mask[4:7] = 1

        ...         x = x * mask
        ...         mask_sum = paddle.sum(x)
        ...         mask_sum = mask_sum / mask.sum()
        ...         self.local_result = mask_sum
        ...         return mask_sum

        >>> # doctest: +REQUIRES(env:DISTRIBUTED)
        >>> dist.init_parallel_env()
        >>> mesh = ProcessMesh([0, 1], dim_names=["x"])
        >>> dist_attrs = [
        ...     (mesh, [dist.Partial(dist.ReduceType.kRedSum)]),
        ... ]
        >>> local_input = paddle.arange(0, 10, dtype="float32")
        >>> local_input = local_input + dist.get_rank()
        >>> input_dist = dist.auto_parallel.api.dtensor_from_local(
        ...     local_input, mesh, [dist.Shard(0)]
        ... )
        >>> custom_layer = CustomLayer(dist_attrs, dist_attrs)
        >>> output_dist = custom_layer(input_dist)

        >>> local_value = custom_layer.local_result
        >>> gathered_values: list[Tensor] = []
        >>> dist.all_gather(gathered_values, local_value)

        >>> print(f"[Rank 0] local_loss={gathered_values[0]}")
        [Rank 0] local_loss=1.5
        >>> print(f"[Rank 1] local_loss={gathered_values[1]}")
        [Rank 1] local_loss=6.0
        >>> print(f"global_loss (distributed)={output_dist}")
        global_loss (distributed)=7.5

        >>> # This case needs to be executed in a multi-card environment
        >>> # export CUDA_VISIBLE_DEVICES=0,1
        >>> # python -m paddle.distributed.launch {test_case}.py
c                :   > [         TU ]  5         Xl        X l        g )N)super__init__out_dist_attrsgrad_dist_attrs)selfr   r   	__class__s      l/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/auto_parallel/local_layer.pyr   LocalLayer.__init__d   s    
 	,.    c           	        [        U5      n[        U5      [        U R                  5      :X  d*   S[        U5       S[        U R                  5       S35       e[        [        U5      5       H  nX   R	                  5       (       d  M  U R                  U   cn  [
        R                  " 5       (       a  X   R                  X   R                  pTO[X   R                  5       R                  X   R                  5       R                  pTO#U R                  U   S   U R                  U   S   pT[        R                  R                  R                  X   XE5      X'   M     [        R                  " U /UQ70 UD6n[
        R                   R#                  U5      n[        U5      [        U R$                  5      :X  d*   S[        U5       S[        U R$                  5       S35       e/ n[        [        U5      5       H_  nUR'                  [        R                  R                  R)                  Xs   U R$                  U   S   U R$                  U   S   5      5        Ma     [
        R                   R+                  Xh5      $ )a  
Overrides the base `Layer`'s `__call__` method. Transforms distributed tensors to local tensors
before computation, invokes the parent class's `__call__` method, and then transforms the
outputs back to distributed tensors based on the specified distribution attributes.
zThe number of inputs (z0) does not match the number of grad_dist_attrs (z).r      zThe number of outputs (z8) does not match the number of distribution attributes ()listlenr   rangeis_distpaddlein_dynamic_modeprocess_mesh
placements	dist_attrdistauto_parallelapidtensor_to_localr   __call__utilsflattenr   appenddtensor_from_localpack_sequence_as)	r   inputskwargsidxmesh	placementoutputs	list_outs	dist_outss	            r   r%   LocalLayer.__call__m   sO    f6{c$"6"677 	
$S[M1abefjfzfzb{a||~	
7 V%C{""$$'',4--//"K44"K22 ( #K113@@"K113>> ( ,,S1!4,,S1!4 $
 #0044EEK' &. ..99&9LL((1	9~T%8%8!99 	
%c)n%55mnqrv  sF  sF  oG  nH  HJ  K	
9 	Y(C""&&99N'',Q/'',Q/ ) ||,,W@@r   )r   r   )r   )list[tuple[ProcessMesh, list[Placement]]]r   r4   returnNone)r+   r   r,   r   r5   r   )	__name__
__module____qualname____firstlineno____doc__r   r%   __static_attributes____classcell__)r   s   @r   r
   r
      s8    FP/A/ C/ 
	/0A 0Ar   r
   )
__future__r   typingr   r   r   paddle.distributeddistributedr!   	paddle.nnr   r   r   r
    r   r   <module>rD      s,    # %  ! 9BA BAr   