
    Αi{                       S SK Jr  S SKJr  S SKrS SKJr  S SKJr  S SK	J
r  \(       a  S SKJr  S SKJr  S S	KJr  S S
KJr  \R$                  SS4           SS jjr\R$                  SS4           SS jjr
g)    )annotations)TYPE_CHECKINGN)stream)ReduceOp)_reduce_scatter_base)Tensor)task)Group)	_ReduceOpTc           	     P   U[         R                  [         R                  [         R                  [         R                  [         R
                  4;  a  [        S5      eU[         R                  :X  a  [        R                  R                  R                  5       S:  ar  Uc(  [        R                  R                  R                  5       OUnU R                  SUR                  -  5        [         R"                  " U U[         R
                  UUSS9$ [         R"                  " U UUUUSS9$ )aH  
Reduces, then scatters a list of tensors to all processes in a group

Args:
    tensor (Tensor): The output tensor on each rank. The result will overwrite this tenor after communication. Support
        float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type.
    tensor_list (List[Tensor]]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type
        should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
    op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD|ReduceOp.AVG, optional): The reduction used. If none is given, use ReduceOp.SUM as default.
    group (Group, optional): Communicate in which group. If none is given, use the global group as default.
    sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default.

Returns:
    Return a task object.

Warning:
    This API only supports the dygraph mode.


Examples:
    .. code-block:: python

        >>> # doctest: +REQUIRES(env: DISTRIBUTED)
        >>> import paddle
        >>> import paddle.distributed as dist

        >>> dist.init_parallel_env()
        >>> if dist.get_rank() == 0:
        ...     data1 = paddle.to_tensor([0, 1])
        ...     data2 = paddle.to_tensor([2, 3])
        >>> else:
        ...     data1 = paddle.to_tensor([4, 5])
        ...     data2 = paddle.to_tensor([6, 7])
        >>> dist.reduce_scatter(data1, [data1, data2])
        >>> print(data1)
        >>> # [4, 6] (2 GPUs, out for rank 0)
        >>> # [8, 10] (2 GPUs, out for rank 1)

zInvalid ``op`` function. Expected ``op`` to be of type ``ReduceOp.SUM``, ``ReduceOp.Max``, ``ReduceOp.MIN``, ``ReduceOp.PROD`` or ``ReduceOp.AVG``.iR  g      ?Fopgroupsync_opuse_calc_stream)r   AVGMAXMINPRODSUMRuntimeErrorpaddlebasecorenccl_versiondistributed
collective_get_global_groupscale_nranksr   reduce_scatter)tensortensor_listr   r   r   s        o/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/communication/reduce_scatter.pyr!   r!   !   s   \ 
   b
 	
 
X\\fkk..;;=E } ));;= 	
 	cELL()$$||!
 	
        c           	         U[         R                  [         R                  [         R                  [         R                  4;  a  [        S5      e[        U UUUUSS9$ )a  
Reduces, then scatters a flattened tensor to all processes in a group.

Args:
    output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
    input (Tensor): Input tensor that is of size output tensor size times world size. Its data type
        should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
    op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM.
    group (ProcessGroup, optional): The process group to work on. If None,
        the default process group will be used.
    sync_op (bool, optional): Whether this op is a sync op. The default value is True.

Returns:
    Async task handle, if sync_op is set to False.
    None, if sync_op or if not part of the group.

Examples:
    .. code-block:: python

        >>> # doctest: +REQUIRES(env: DISTRIBUTED)
        >>> import paddle
        >>> import paddle.distributed as dist

        >>> dist.init_parallel_env()
        >>> rank = dist.get_rank()
        >>> data = paddle.arange(4) + rank
        >>> # [0, 1, 2, 3] (2 GPUs, for rank 0)
        >>> # [1, 2, 3, 4] (2 GPUs, for rank 1)
        >>> output = paddle.empty(shape=[2], dtype=data.dtype)
        >>> dist.collective._reduce_scatter_base(output, data)
        >>> print(output)
        >>> # [1, 3] (2 GPUs, out for rank 0)
        >>> # [5, 7] (2 GPUs, out for rank 1)

zInvalid ``op`` function. Expected ``op`` to be of type ``ReduceOp.SUM``, ``ReduceOp.Max``, ``ReduceOp.MIN`` or ``ReduceOp.PROD``.Fr   )r   r   r   r   r   r   _reduce_scatter_base_stream)outputinputr   r   r   s        r$   r   r   s   sZ    T 
(,,hmmX\\JJ P
 	
 ' r%   )r"   r   r#   zlist[Tensor]r   r   r   Group | Noner   boolreturnr	   )r(   r   r)   r   r   r   r   r*   r   r+   r,   ztask | None)
__future__r   typingr   r    paddle.distributed.communicationr   'paddle.distributed.communication.reducer   6paddle.distributed.communication.stream.reduce_scatterr   r'   r   paddle.base.corer	   &paddle.distributed.communication.groupr
   r   r   r!    r%   r$   <module>r5      s    #    3 < %<A LLOOO 	O 	O
 O 
Oj LL555 	5 	5
 5 5r%   