
    BjG                     \   d dl Z d dlZd dlmZ d dlmZ d dlmZmZ dddZ	d Z
ej        fdZd ej        fdZd ej        fd	Zej        ej        fd
Zej        ej        fdZej        fdZej        fdZej        fdZddej        fdZej        ej        fdZ G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d d e          Z G d! d"e          Z G d# d$e          Z dS )%    N)Function)groupReduceOp
suggestionc                B    d|  d}|r	|d| dz  }t          |          )N torch.distributed.nn.functional.z& is not supported under torch.compile.z Use 	 instead.)RuntimeError)namer   msgs      b/var/www/html/Carbon-Document/venv/lib/python3.11/site-packages/torch/distributed/nn/functional.py_not_supported_under_compiler      sC    W4WWW   -,z,,,,
s

    c                 L    t          j        d|  d| dt          d           d S )Nr	   z is deprecated, use r
      )category
stacklevel)warningswarnFutureWarning)r   r   s     r   _deprecatedr      sN    M	%4 	% 	%	% 	% 	%	     r   c                     t           j                                        rt          dd           t	          dd           t
                              |||           S )a  
    Broadcasts the tensor to the whole group.

    ``tensor`` must have the same number of elements in all processes
    participating in the collective.

    Arguments:
        tensor (Tensor): Data to be sent if ``src`` is the rank of current
            process.
        src (int): Source rank.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Received tensor from the broadcast op.

    	broadcastz3torch.distributed._functional_collectives.broadcastr   )torchcompileris_compilingr   r   
_Broadcastapply)tensorsrcr   s      r   r   r       sb    " ~""$$ 
$L	
 	
 	
 	
 RSSSC///r   c                     t           j                                        rt          d           t                              |||           S )aT  
    Gathers a list of tensors in a single process.

    Arguments:
        tensor (Tensor): Input tensor.
        dst (int, optional): Destination rank (default is 0).
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        tuple[Tensor]: List of appropriately-sized tensors with the gathered data.
    gather)r   r   r   r   _Gatherr   )r    dstr   s      r   r#   r#   :   s>     ~""$$ /$X...==eV,,,r   c                     t           j                                        rt          d           t	          j        ||g| R  S )a  
    Scatters a list of tensors to all processes in a group.

    Each process will receive exactly one tensor and store its data in the
    ``tensor`` argument.

    Arguments:
        tensors (list[Tensor]): List of tensors to scatter on the source rank.
            Receivers must pass ``None`.
        src (int, optional): Source rank (default is 0).
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output tensor from the scatter operation.

    scatter)r   r   r   r   _Scatterr   )tensorsr!   r   s      r   r'   r'   K   sB    " ~""$$ 0$Y///>#u/w////r   c                     t           j                                        rt          d           t                              ||||           S )a  
    Reduces the tensor data across all machines.

    Only the process with rank ``dst`` is going to receive the final result.

    Arguments:
        tensor (Tensor): Input of the collective.
        dst (int): Destination rank.
        op (optional): One of the values from
            ``torch.distributed.ReduceOp``
            enum.  Specifies an operation used for element-wise reductions.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output of the collective.

    reduce)r   r   r   r   _Reducer   )r    r%   opr   s       r   r+   r+   a   s@    $ ~""$$ /$X...==b%000r   c                     t           j                                        rt          dd           t	          dd           t          j        ||| g|R  S )a  
    Reduces, then scatters a list of tensors to all processes in a group.

    Arguments:
        output (Tensor): Output tensor.
        input_list (list[Tensor]): List of tensors to reduce and scatter.
        op (optional): One of the values from
            ``torch.distributed.ReduceOp``
            enum.  Specifies an operation used for element-wise reductions.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output of the collective.

    reduce_scatterz?torch.distributed._functional_collectives.reduce_scatter_tensorr   )r   r   r   r   r   _Reduce_Scatterr   )output
input_listr-   r   s       r   r/   r/   x   sp      ~""$$ 
$X	
 	
 	
 	
 I    UF@Z@@@@r   c                     t           j                                        rt          dd           t	          dd           t
                              ||           S )a  
    Gathers tensors from the whole group in a list.

    Arguments:
        tensor (Tensor): Tensor to be broadcast from current process.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        tuple([Tensor]): Output of the collective.

    
all_gatherz;torch.distributed._functional_collectives.all_gather_tensorr   )r   r   r   r   r   
_AllGatherr   )r    r   s     r   r4   r4      si     ~""$$ 
$T	
 	
 	
 	
 S   E6***r   c                     t           j                                        rt          d           t                              | ||          S )a  
    Single tensor all gather. Gathers a single tensor from all ranks, and puts them in a single output tensor.

    Args:
        output_tensor (Tensor): Output tensor. It should contain
            correctly-sized tensors to be used for output of the collective.
        input_tensor (Tensor): Tensor to be broadcast from current process.
        group (ProcessGroup, optional): The process group to work on. If None,
            the default process group will be used.

    Examples:
        >>> # All tensors below are of torch.int64 dtype.
        >>> # We have 2 process groups, 2 ranks.
        >>> # xdoctest: +SKIP("incorrect want text")
        >>> output_tensor = torch.zeros(2, dtype=torch.int64)
        >>> output_tensor
        [tensor([0, 0])] # Rank 0 and 1
        >>> tensor = torch.arange(1, dtype=torch.int64) + 1 + rank
        >>> tensor
        tensor([1]) # Rank 0
        tensor([2]) # Rank 1
        >>> dist.all_gather_base(output_tensor, tensor)
        >>> output_tensor
        tensor([1,2]) # Rank 0
        tensor([1,2]) # Rank 1

    .. warning::
        `_all_gather_base` is experimental and subject to change.
        It is the caller's responsibility to ensure the output_tensor
        is correctly sized.

    _all_gather_base)r   r   r   r   _AllGatherBaser   )output_tensorinput_tensorr   s      r   r7   r7      sB    B ~""$$ 9$%7888|UCCCr   c                     t           j                                        rt          d           t	          j        || g|R  S )a  
    Each process scatters list of input tensors to all processes in a group and return gathered list of tensors in output list.

    Arguments:
        output_tensor_list (list[Tensor]): list of tensors to gather one per rank.
        input_tensor_list (list[Tensor]): List of tensors to scatter one per rank.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        tuple([Tensor]): Output of the collective.

    
all_to_all)r   r   r   r   	_AlltoAllr   )output_tensor_listinput_tensor_listr   s      r   r<   r<      sD     ~""$$ 3$\222?5"4I7HIIIIr   c                     t           j                                        rt          dd           t	          dd           t
                              || |||          S )a  
    Each process splits input tensor and then scatters the split list to all processes in a group.

    Then concatenate the received tensors from all the processes in the group and return single output tensor.

    Arguments:
        output (Tensor): Gathered concatenated output tensor.
        input (Tensor): Input tensor to scatter.
        output_split_sizes: (list[Int], optional): Output split sizes for dim 0
            if specified None or empty, dim 0 of ``output`` tensor must divide
            equally by ``world_size``.
        input_split_sizes: (list[Int], optional): Input split sizes for dim 0
            if specified None or empty, dim 0 of ``input`` tensor must divide
            equally by ``world_size``.

    Returns:
        Tensor: Output of the collective.

    all_to_all_singlez;torch.distributed._functional_collectives.all_to_all_singler   )r   r   r   r   r   _AlltoAllSingler   )r1   inputoutput_split_sizesinput_split_sizesr   s        r   rA   rA      sx    4 ~""$$ 
$T	
 	
 	
 	
 E     v)+<e  r   c                     t           j                                        rt          dd           t	          dd           t
                              |||           S )a&  
    Reduces the tensor data across all machines in such a way that all get the final result.

    After the call the returned tensor is going to be bitwise
    identical in all processes.

    Arguments:
        tensor (Tensor): Input of the collective.
        op (optional): One of the values from
            ``torch.distributed.ReduceOp``
            enum.  Specifies an operation used for element-wise reductions.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output of the collective

    
all_reducez4torch.distributed._functional_collectives.all_reducer   )r   r   r   r   r   
_AllReducer   )r    r-   r   s      r   rG   rG     sb    $ ~""$$ 
$M	
 	
 	
 	
 TUUUBv...r   c                   :    e Zd Zed             Zed             ZdS )r   c                     || _         || _        t          j        |          | _        |                                }t          j        |||           |S Nr   )r!   r   distget_rankrankcloner   )ctxr!   r   r    s       r   forwardz_Broadcast.forward'  sQ     	=u--- vs%0000r   c                     t                               | j        t          j        | j        |          }| j        | j        k    r|                                 d d |fS N)r,   r   r!   r   SUMr   rO   zero_)rQ   grad_outputgxs      r   backwardz_Broadcast.backward3  sJ     ]]37HL#)[II7chHHJJJdBr   N__name__
__module____qualname__staticmethodrR   rY    r   r   r   r   &  sH          \     \     r   r   c                   :    e Zd Zed             Zed             ZdS )r$   c                 V   || _         || _        fdt          t          j        |                    D             }                                t          j        |          |k    rt          j        |||           nt          j        d ||           t          |          S )Nc                 8    g | ]}t          j                  S r_   )r   
zeros_like).0ir    s     r   
<listcomp>z#_Gather.forward.<locals>.<listcomp>F  s1     
 
 
)*EV$$
 
 
r   rL   )	r%   r   rangerM   get_world_size
contiguousrN   r#   tuple)rQ   r%   r   r    tensor_lists      ` r   rR   z_Gather.forward=  s     	

 
 
 
.3D4Ge4T4T4T.U.U
 
 
 ""$$=u%%%,,KS>>>>>Kc7777[!!!r   c                 D    dt          j        | j        | j        g|R  fz   S NNN)r(   r   r%   r   )rQ   grad_outputss     r   rY   z_Gather.backwardQ  s(    x~cgsyP<PPPRRRr   NrZ   r_   r   r   r$   r$   <  sM        " " \"$ S S \S S Sr   r$   c                   :    e Zd Zed             Zed             ZdS )r(   c                 J   || _         || _        t          fdD                       st          t	          j        d                   }t          j        |          |k    r&t          j        |t                    ||           nt          j        |d ||           |S )Nc              3   x   K   | ]4}|                                 d                                           k    V  5dS )r   Nsize)rd   tr)   s     r   	<genexpr>z#_Scatter.forward.<locals>.<genexpr>\  s>      BBQ16688wqz000BBBBBBr   r   rL   )
r!   r   allAssertionErrorr   rc   rM   rN   r'   list)rQ   r!   r   r)   r1   s      ` r   rR   z_Scatter.forwardW  s     	BBBB'BBBBB 	!  !'!*--=u%%%,,Lg5AAAAALs%8888r   c                 T    dt                               | j        | j        |          z   S rm   )r$   r   r!   r   rQ   rW   s     r   rY   z_Scatter.backwarde  s#     gmmCGSYLLLLr   NrZ   r_   r   r   r(   r(   V  sM        
 
 \
 M M \M M Mr   r(   c                   :    e Zd Zed             Zed             ZdS )r,   c                 z    || _         || _        |                                }t          j        ||||           |S )Nr-   r   )r!   r   rP   rM   r+   )rQ   r!   r-   r   r    s        r   rR   z_Reduce.forwardl  s=     	FCBe4444r   c                 V    dt                               | j        | j        |          fz   S N)NNN)r   r   r!   r   r{   s     r   rY   z_Reduce.backwardu  s(     "Z%5%5cgsy+%V%V$XXXr   NrZ   r_   r   r   r,   r,   k  sM          \ Y Y \Y Y Yr   r,   c                   :    e Zd Zed             Zed             ZdS )r0   c                     || _         |                                }t          d |D                       }t          j        |t          |          ||           |S )Nc              3   >   K   | ]}|                                 V  d S rT   ri   rd   ru   s     r   rv   z*_Reduce_Scatter.forward.<locals>.<genexpr>  s*      !L!LQ!,,..!L!L!L!L!L!Lr   r~   )r   ri   rj   rM   r/   ry   )rQ   r-   r   r    r?   s        r   rR   z_Reduce_Scatter.forward|  sb     	""$$!!L!L:K!L!L!LLLFD):$;$;%PPPPr   c                 H    dt                               | j        |          z   S r   )r5   r   r   r{   s     r   rY   z_Reduce_Scatter.backward  s!     "J$4$4SY$L$LLLr   NrZ   r_   r   r   r0   r0   {  sM          \ M M \M M Mr   r0   c                   :    e Zd Zed             Zed             ZdS )r5   c                                                      || _        fdt          t          j        |                    D             }t          j        ||           t          |          S )Nc                 8    g | ]}t          j                  S r_   r   
empty_like)rd   _r    s     r   rf   z&_AllGather.forward.<locals>.<listcomp>  s1     
 
 
)*EV$$
 
 
r   rL   )ri   r   rg   rM   rh   r4   rj   )rQ   r   r    out_tensor_lists     ` r   rR   z_AllGather.forward  s     ""$$	
 
 
 
.3D4Ge4T4T4T.U.U
 
 
 	u====_%%%r   c                    t          j        | j                  t           j        j        t           j        j        fv rXt          j        | j                  }t          j        ||                   }t          j
        t          j        | j        |g|R  }nLd |D             }t          j
        | j        |g|R  }t          j        t          j        |          d          }d |fS )NrL   c                 6    g | ]}t          j        |          S r_   r   )rd   r    s     r   rf   z'_AllGather.backward.<locals>.<listcomp>  s#    OOO5+F33OOOr   r   )dim)rM   get_backendr   BackendNCCLXCCLrN   r   r   r0   r   r   rU   r=   sumstack)rQ   ro   rO   rX   rk   gxss         r   rY   z_AllGather.backward  s    #),,,1BDLDU0VVV=sy111D!,t"455B &x|SYR\RRRBB PO,OOOK/#)[H<HHHC5;s++333Bbzr   NrZ   r_   r   r   r5   r5     sH        
& 
& \
&   \  r   r5   c                   :    e Zd Zed             Zed             ZdS )r8   c                 f    || _         t          j        ||                                |           |S rK   )r   rM   r7   ri   )rQ   r9   r:   r   s       r   rR   z_AllGatherBase.forward  s5     	m\-D-D-F-FeTTTTr   c                 <   t          j        | j                  t           j        j        t           j        j        fv rt          j        | j                  }t          |                                          }|d         |z  dk    rt          d| d|           |d         t          j        | j                  z  |d<   t          j        ||j        |j                  }t          j        ||t          j        | j                   nt          d          d |d fS )NrL   r   zTensor with dimensions: z8 does not have first dimension divisible by world_size: devicedtypezBackend not supported!)rM   r   r   r   r   r   rh   ry   rt   r   r   emptyr   r   _reduce_scatter_baser   rU   )rQ   rW   
world_sizeout_sizerX   s        r   rY   z_AllGatherBase.backward  s(    #),,,1BDLDU0VVV,39===JK,,..//H{Z'1,,"Vx V VISV V   #1+)<39)M)M)MMHQK!3;;L  B %b+x|SYOOOO7888b$r   NrZ   r_   r   r   r8   r8     sH          \
     \     r   r8   c                   :    e Zd Zed             Zed             ZdS )r=   c                 <   || _         fdt          t          j        |                    D             | _        t          j        |          }t          d D                       t          j        |          t          j        j	        u r[t          t          j        |                    D ]7}d }||k    rt                    }t          j        ||         |||           8n$t          j        |t                    |           t          |          S )Nc                 D    g | ]}|                                          S r_   rs   )rd   re   r)   s     r   rf   z%_AlltoAll.forward.<locals>.<listcomp>  s4     &
 &
 &
"#GAJOO&
 &
 &
r   rL   c              3   >   K   | ]}|                                 V  d S rT   r   r   s     r   rv   z$_AlltoAll.forward.<locals>.<genexpr>  s*      881888888r   )r   rg   rM   rh   input_tensor_size_listrN   rj   r   r   GLOOry   r'   r<   )rQ   r   r   r)   my_rankre   to_sends      `   r   rR   z_AlltoAll.forward  s<    	&
 &
 &
 &
',T-@u-M-M-M'N'N&
 &
 &
" -e,,,8888888%(((DL,===4.U;;;<< J J<<"7mmG_Q/!5IIIII	J OW   
 _%%%r   c                 `    fd| j         D             }dt          j        | j        |gR  z   S )Nc                 j    g | ]/}t          j        |d          j        d          j                  0S )r   r   )r   r   r   r   )rd   rt   ro   s     r   rf   z&_AlltoAll.backward.<locals>.<listcomp>  sQ     
 
 
  K\!_3<?;P  
 
 
r   rn   )r   r=   r   r   )rQ   ro   rk   s    ` r   rY   z_AlltoAll.backward  sS    
 
 
 
 2	
 
 
 iociT|TTTTTr   NrZ   r_   r   r   r=   r=     sM        & & \&, U U \U U Ur   r=   c                   :    e Zd Zed             Zed             ZdS )rB   c                     || _         |                                | _        || _        || _        t          j        |||||           |S )N)rD   rE   r   )r   rt   
input_sizerD   rE   rM   rA   )rQ   r   r1   rD   rE   rC   s         r   rR   z_AlltoAllSingle.forward  sZ     	!2 21/	
 	
 	
 	
 r   c           	          t          j        | j        |j        |j                  }dt
                              | j        || j        | j	        |
                                          fz   S )Nr   )NNNN)r   r   r   r   r   rB   r   r   rD   rE   ri   )rQ   rW   r    s      r   rY   z_AlltoAllSingle.backward  ss     N;#5[=N
 
 
 (!!	&%&&(( +
 
 	
r   NrZ   r_   r   r   rB   rB     sH          \ 
 
 \
 
 
r   rB   c                   :    e Zd Zed             Zed             ZdS )rH   c                     || _         || _        |                    t          j                  }t          j        |||           |S )N)memory_formatr~   )r   r-   rP   r   contiguous_formatrM   rG   )rQ   r-   r   r    s       r   rR   z_AllReduce.forward  sD     	E,CDD2U3333r   c                 V    dt                               | j        | j        |          fz   S rm   )rH   r   r-   r   r{   s     r   rY   z_AllReduce.backward  s(     z//	;OOQQQr   NrZ   r_   r   r   rH   rH     sM          \ R R \R R Rr   rH   )!r   r   torch.distributeddistributedrM   torch.autogradr   r   r   r   r   WORLDr   r#   r'   rU   r+   r/   r4   r7   r<   rA   rG   r   r$   r(   r,   r0   r5   r8   r=   rB   rH   r_   r   r   <module>r      s                 # # # # # #
 . - - - - - - - 6:        "' 0 0 0 04  - - - -" %+ 0 0 0 0, $<u{ 1 1 1 1. +3,ek A A A A8 #[ + + + +. 9> #D #D #D #DL =BK J J J J* 
+% % % %P #,ek / / / /6               ,S S S S Sh S S S4M M M M Mx M M M*Y Y Y Y Yh Y Y Y M M M M Mh M M M"       <         X      :!U !U !U !U !U !U !U !UH
 
 
 
 
h 
 
 
DR R R R R R R R R Rr   