
    x-jY                    r    d dl mZ d dlmZmZ d dlZd dlmZ d dl	m
Z
 erd dlmZmZ  G d de
          ZdS )    )annotations)TYPE_CHECKINGAnyN)Layer)	PlacementProcessMeshc                  ,     e Zd ZdZd fdZddZ xZS )
LocalLayera  
    The `LocalLayer` class is a specialized `Layer` for managing distributed tensors during
    forward and backward passes in a parallelized training environment. It converts distributed tensors
    to local tensors for computation and then back to distributed tensors as output, ensuring seamless
    integration with distributed parallelism frameworks.

    Args:
        out_dist_attrs (list[tuple[ProcessMesh, list[Placement]]]):
            A list where each entry is a tuple containing the `ProcessMesh` and the list of `Placement`
            attributes for the corresponding output tensors. These attributes define the distribution
            strategy for the outputs.
        grad_dist_attrs (list[tuple[ProcessMesh, list[Placement]]]):
            Similar to `out_dist_attrs` but for gradient tensors. The tuple in the list can be None, indicating that the dist_attr of the gradient tensor is same as the corresponding input tensor.

    Examples:
        .. code-block:: python

            >>> from __future__ import annotations

            >>> import paddle
            >>> import paddle.distributed as dist
            >>> from paddle import Tensor
            >>> from paddle.distributed import ProcessMesh

            >>> class CustomLayer(dist.LocalLayer):
            ...     def __init__(self, out_dist_attrs, grad_dist_attrs):
            ...         super().__init__(out_dist_attrs, grad_dist_attrs)
            ...         self.local_result = paddle.to_tensor(0.0)

            ...     def forward(self, x):
            ...         mask = paddle.zeros_like(x)
            ...         if dist.get_rank() == 0:
            ...             mask[1:3] = 1
            ...         else:
            ...             mask[4:7] = 1

            ...         x = x * mask
            ...         mask_sum = paddle.sum(x)
            ...         mask_sum = mask_sum / mask.sum()
            ...         self.local_result = mask_sum
            ...         return mask_sum

            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
            >>> dist.init_parallel_env()
            >>> mesh = ProcessMesh([0, 1], dim_names=["x"])
            >>> dist_attrs = [
            ...     (mesh, [dist.Partial(dist.ReduceType.kRedSum)]),
            ... ]
            >>> local_input = paddle.arange(0, 10, dtype="float32")
            >>> local_input = local_input + dist.get_rank()
            >>> input_dist = dist.auto_parallel.api.dtensor_from_local(
            ...     local_input, mesh, [dist.Shard(0)]
            ... )
            >>> custom_layer = CustomLayer(dist_attrs, dist_attrs)
            >>> output_dist = custom_layer(input_dist)

            >>> local_value = custom_layer.local_result
            >>> gathered_values: list[Tensor] = []
            >>> dist.all_gather(gathered_values, local_value)

            >>> print(f"[Rank 0] local_loss={gathered_values[0]}")
            [Rank 0] local_loss=1.5
            >>> print(f"[Rank 1] local_loss={gathered_values[1]}")
            [Rank 1] local_loss=6.0
            >>> print(f"global_loss (distributed)={output_dist}")
            global_loss (distributed)=7.5

            >>> # This case needs to be executed in a multi-card environment
            >>> # export CUDA_VISIBLE_DEVICES=0,1
            >>> # python -m paddle.distributed.launch {test_case}.py
    out_dist_attrs)list[tuple[ProcessMesh, list[Placement]]]grad_dist_attrsreturnNonec                d    t                                                       || _        || _        d S )N)super__init__r   r   )selfr   r   	__class__s      l/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/distributed/auto_parallel/local_layer.pyr   zLocalLayer.__init__d   s1    
 	,.    inputsr   kwargsc           	     0   t          |          }t          |          t          | j                  k    s0J dt          |           dt          | j                   d            t          t          |                    D ]}||                                         r| j        |         mt          j                    r||         j        ||         j        }}ne||         	                                j        ||         	                                j        }}n&| j        |         d         | j        |         d         }}t          j        j                            ||         ||          ||<   t          j        | g|R i |}t
          j                            |          }t          |          t          | j                  k    s0J dt          |           dt          | j                   d            g }t          t          |                    D ]c}|                    t          j        j                            ||         | j        |         d         | j        |         d                              dt
          j                            ||          S )	a/  
        Overrides the base `Layer`'s `__call__` method. Transforms distributed tensors to local tensors
        before computation, invokes the parent class's `__call__` method, and then transforms the
        outputs back to distributed tensors based on the specified distribution attributes.
        zThe number of inputs (z0) does not match the number of grad_dist_attrs (z).Nr      zThe number of outputs (z8) does not match the number of distribution attributes ()listlenr   rangeis_distpaddlein_dynamic_modeprocess_mesh
placements	dist_attrdistauto_parallelapidtensor_to_localr   __call__utilsflattenr   appenddtensor_from_localpack_sequence_as)	r   r   r   idxmesh	placementoutputs	list_outs	dist_outss	            r   r(   zLocalLayer.__call__m   s    f6{{c$"677777S[[befjfzb{b{ 877 V%% 	 	Cc{""$$ ',4-// 	"3K4"3K2 ( #3K1133@"3K1133> ( ,S1!4,S1!4 $D
 #04EE3Ky s .9999&99L((11	9~~T%8!9!9999 Kc)nn  K  Knqrv  sF  oG  oG  K  K  K :99 	Y(( 	 	C"&99cN',Q/',Q/     |,,Wi@@@r   )r   r   r   r   r   r   )r   r   r   r   r   r   )__name__
__module____qualname____doc__r   r(   __classcell__)r   s   @r   r
   r
      sf        F FP/ / / / / /0A 0A 0A 0A 0A 0A 0A 0Ar   r
   )
__future__r   typingr   r   r   paddle.distributeddistributedr$   	paddle.nnr   r   r   r
    r   r   <module>r?      s    # " " " " " % % % % % % % %  ! ! ! ! ! !       :99999999BA BA BA BA BA BA BA BA BA BAr   