
    Gjؠ                        d dl mZ d dlZd dlmZ d dlmZmZ d dlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZ g d
Z G d de          Z G d de          Z G d dee          Z G d de          Z G d dee          Z G d de          Z G d dee          Z G d de          Z G d dee          Z G d de          ZdS )    )AnyN)Tensor)
functionalinit)	ParameterUninitializedBufferUninitializedParameter   )SyncBatchNorm)LazyModuleMixin)Module)BatchNorm1dLazyBatchNorm1dBatchNorm2dLazyBatchNorm2dBatchNorm3dLazyBatchNorm3dr   c                        e Zd ZU dZdZg dZeed<   eed<   edz  ed<   e	ed<   e	ed	<   	 	 	 	 	 	 ddddedededz  de	d	e	de	ddf fdZ
ddZddZd Zd Z	 	 d fdZ xZS )	_NormBasez,Common base of _InstanceNorm and _BatchNorm.   )track_running_statsmomentumepsnum_featuresaffiner   r   Nr   r   r   h㈵>皙?Tbiasr   returnc          
         ||d}	t                                                       || _        || _        || _        || _        || _        | j        rbt          t          j	        |fi |	          | _
        |r%t          t          j	        |fi |	          | _        nC|                     dd            n,|                     dd            |                     dd            | j        r|                     dt          j        |fi |	           |                     dt          j        |fi |	           |  |  |                     dt          j        	 d
dt          j        id	 |	                                D                        |  nB|                     dd            |                     dd            |                     dd            |                                  d S )Ndevicedtyper   weightrunning_meanrunning_varnum_batches_trackedr   r$   c                 &    i | ]\  }}|d k    ||S r$    .0kvs      ]/var/www/html/Carbon-Document/venv/lib/python3.11/site-packages/torch/nn/modules/batchnorm.py
<dictcomp>z&_NormBase.__init__.<locals>.<dictcomp>Q   s#    OOO1!w,,q!,,,    r   )super__init__r   r   r   r   r   r   torchemptyr%   r   register_parameterregister_bufferzerosonestensorlongitemsreset_parametersselfr   r   r   r   r   r#   r$   r   factory_kwargs	__class__s             r0   r5   z_NormBase.__init__&   s(    %+U;;( #6 ; 	2#EK$O$O$O$OPPDK 6%ek,&Q&Q.&Q&QRR		''5555##Hd333##FD111# 	>  L K KN K K     uz,II.II   ,+  % * PO(<(<(>(>OOO	    33  666  555  !6===r2   c                     | j         rN| j                                         | j                            d           | j                                         d S d S )Nr
   )r   r&   zero_r'   fill_r(   rA   s    r0   reset_running_statsz_NormBase.reset_running_stats[   s`    # 	- ##%%%""1%%%$**,,,,,	- 	-r2   c                     |                                   | j        r;t          j        | j                   | j        t          j        | j                   d S d S d S N)rH   r   r   ones_r%   r   zeros_rG   s    r0   r?   z_NormBase.reset_parametersc   s`      """; 	'Jt{###y$DI&&&&&	' 	'$$r2   c                     t           rJ   )NotImplementedErrorrA   inputs     r0   _check_input_dimz_NormBase._check_input_dimj   s    !!r2   c                 <     dj         di | j        d| j        d uiS )Nz{{num_features}, eps={eps}, momentum={momentum}, affine={affine}, bias={use_bias}, track_running_stats={track_running_stats}use_biasr+   )format__dict__r   rG   s    r0   
extra_reprz_NormBase.extra_reprm   sH    P IIO - *.)4*?  	
r2   c           	      X   |                     dd           }||dk     rc| j        r\|dz   }	|	|vrS| j        )| j        j        t	          j        d          k    r| j        nt	          j        dt          j                  ||	<   t                                          |||||||           d S )Nversionr   r(   metar   r*   )	getr   r(   r#   r6   r<   r=   r4   _load_from_state_dict)rA   
state_dictprefixlocal_metadatastrictmissing_keysunexpected_keys
error_msgsrX   num_batches_tracked_keyrC   s             r0   r[   z_NormBase._load_from_state_dictu   s     !$$Y55Ow{{0H{ '-/D&D#&j88 /;075<;O;OOO ,, auz:::	 23 	%%	
 	
 	
 	
 	
r2   r   r   TTNNr    N)__name__
__module____qualname____doc___version__constants__int__annotations__floatboolr5   rH   r?   rQ   rV   r[   __classcell__rC   s   @r0   r   r      sn        66HXXXM	JJJdlLLL !$$(3  3  3  3 3  3  $,	3 
 3  "3  3  
3  3  3  3  3  3 j- - - -' ' ' '" " "
 
 
 
 
 
  
  
  
  
  
  
  
  
  
r2   r   c                   f     e Zd Z	 	 	 	 	 	 ddddedededz  d	ed
ededdf fdZdedefdZ xZ	S )
_BatchNormr   r   TNr   r   r   r   r   r   r   r    c                V    ||d}	 t                      j        |||||fi |	d|i d S Nr"   r   )r4   r5   r@   s             r0   r5   z_BatchNorm.__init__   si     %+U;;	
 	
 	
 	
 	
 	
 	
 	
 	
 	
r2   rP   c           
         |                      |           | j        d}n| j        }| j        rN| j        rG| j        @| j                            d           | j        dt          | j                  z  }n| j        }	 | j        rd}n| j        d u o| j        d u }	 t          j
        || j        r| j        r| j        nd | j        r| j        r| j        nd | j        | j        ||| j                  S )N        r
         ?T)rQ   r   trainingr   r(   add_rn   r&   r'   F
batch_normr%   r   r   )rA   rP   exponential_average_factorbn_trainings       r0   forwardz_BatchNorm.forward   s1   e$$$
 = ),&&)-&= 	?T5 	?'3(--a000=(14uT=U7V7V1V..15.	 = 	UKK,4T4;Kt;SK	
 | }(,(@!!$(MWT5MWDSWKI&H
 
 	
r2   rd   )
rf   rg   rh   rl   rn   ro   r5   r   r   rp   rq   s   @r0   rs   rs      s         !$$(
 
 
 

 
 $,	

 
 "
 
 

 
 
 
 
 
.0
V 0
 0
 0
 0
 0
 0
 0
 0
 0
r2   rs   c                   `     e Zd ZU eed<   eed<   	 	 	 	 	 	 ddd	 d fd	Zd fd
ZddZ xZS )_LazyNormBaser%   r   r   r   TNr   r    c                   ||d} t                      j        d||ddfi |ddi || _        || _        | j        r$t	          di || _        |rt	          di || _        | j        rct          di || _        t          di || _	        t          j        	 ddt          j        id |                                D             | _        d S d S )	Nr"   r   Fr   r$   c                 &    i | ]\  }}|d k    ||S r*   r+   r,   s      r0   r1   z*_LazyNormBase.__init__.<locals>.<dictcomp>  s#    KKKDAqa7ll1alllr2   r+   r3   )r4   r5   r   r   r	   r%   r   r   r&   r'   r6   r<   r=   r>   r(   )
rA   r   r   r   r   r#   r$   r   rB   rC   s
            r0   r5   z_LazyNormBase.__init__   sB    %+U;; 
	
 
	
 
	
 
	
 
	
 
	
 
	
 
	
 #6 ; 	E0BB>BBDK E2DD^DD	# 
	 3 E En E ED2DD^DDD',|( (j( LKN$8$8$:$:KKK	( (D$$$
	 
	r2   c                     |                                  s-| j        dk    r$t                                                       d S d S d S )Nr   )has_uninitialized_paramsr   r4   r?   )rA   rC   s    r0   r?   z_LazyNormBase.reset_parameters  sP    ,,.. 	'43D3I3IGG$$&&&&&	' 	'3I3Ir2   c                 N   |                                  r|j        d         | _        | j        rt	          | j        t                    st          d          | j                            | j        f           | j	        It	          | j	        t                    st          d          | j	                            | j        f           | j
        r@| j                            | j        f           | j                            | j        f           |                                  d S d S )Nr
   z-self.weight must be an UninitializedParameterz+self.bias must be an UninitializedParameter)r   shaper   r   
isinstancer%   r	   AssertionErrormaterializer   r   r&   r'   r?   rO   s     r0   initialize_parametersz#_LazyNormBase.initialize_parameters  s@   ((** 	$ %AD{ @!$+/EFF (G   ''):(<===9(%di1GHH ,I   I))4+<*>???' !--&(    ,,&(   !!#####+	$ 	$r2   rd   re   )	rf   rg   rh   r	   rm   r5   r?   r   rp   rq   s   @r0   r   r      s         """"
      * * * 
* * * * * *X' ' ' ' ' '
$ $ $ $ $ $ $ $r2   r   c                       e Zd ZdZddZdS )r   a  Applies Batch Normalization over a 2D or 3D input.

    Method described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the number of features or channels of the input). By default, the
    elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0.
    At train time in the forward pass, the variance is calculated via the biased estimator,
    equivalent to ``torch.var(input, correction=0)``. However, the value stored in the
    moving average of the variance is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, correction=1)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.

    Args:
        num_features: number of features or channels :math:`C` of the input
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
          :math:`C` is the number of features or channels, and :math:`L` is the sequence length
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm1d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm1d(100, affine=False)
        >>> input = torch.randn(20, 100)
        >>> output = m(input)
    r    Nc                     |                                 dk    r=|                                 dk    r't          d|                                  d          d S d S Nr      zexpected 2D or 3D input (got D input)dim
ValueErrorrO   s     r0   rQ   zBatchNorm1d._check_input_dim{  W    99;;!		q 0 0RUYY[[RRRSSS  0 0r2   re   rf   rg   rh   ri   rQ   r+   r2   r0   r   r   2  s;        F FPT T T T T Tr2   r   c                       e Zd ZdZeZddZdS )r   a  A :class:`torch.nn.BatchNorm1d` module with lazy initialization.

    Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``
    r    Nc                     |                                 dk    r=|                                 dk    r't          d|                                  d          d S d S r   r   rO   s     r0   rQ   z LazyBatchNorm1d._check_input_dim  r   r2   re   )rf   rg   rh   ri   r   cls_to_becomerQ   r+   r2   r0   r   r     s?         8  MT T T T T Tr2   r   c                       e Zd ZdZddZdS )r   a3  Applies Batch Normalization over a 4D input.

    4D is a mini-batch of 2D inputs
    with additional channel dimension. Method described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, correction=0)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, correction=1)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm2d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45)
        >>> output = m(input)
    r    Nc                     |                                 dk    r%t          d|                                  d          d S N   zexpected 4D input (got r   r   rO   s     r0   rQ   zBatchNorm2d._check_input_dim  ?    99;;!Luyy{{LLLMMM r2   re   r   r+   r2   r0   r   r     ;        G GRN N N N N Nr2   r   c                       e Zd ZdZeZddZdS )r   a  A :class:`torch.nn.BatchNorm2d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``
    r    Nc                     |                                 dk    r%t          d|                                  d          d S r   r   rO   s     r0   rQ   z LazyBatchNorm2d._check_input_dim  r   r2   re   )rf   rg   rh   ri   r   r   rQ   r+   r2   r0   r   r     ?         8  MN N N N N Nr2   r   c                       e Zd ZdZddZdS )r   ah  Applies Batch Normalization over a 5D input.

    5D is a mini-batch of 3D inputs with additional channel dimension as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, correction=0)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, correction=1)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization
    or Spatio-temporal Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, D, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``

    Shape:
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)
    r    Nc                     |                                 dk    r%t          d|                                  d          d S N   zexpected 5D input (got r   r   rO   s     r0   rQ   zBatchNorm3d._check_input_dima  r   r2   re   r   r+   r2   r0   r   r     r   r2   r   c                       e Zd ZdZeZddZdS )r   a  A :class:`torch.nn.BatchNorm3d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``
    r    Nc                     |                                 dk    r%t          d|                                  d          d S r   r   rO   s     r0   rQ   z LazyBatchNorm3d._check_input_dim  r   r2   re   )rf   rg   rh   ri   r   r   rQ   r+   r2   r0   r   r   f  r   r2   r   c                        e Zd ZdZ	 	 	 	 	 	 	 ddddeded	edz  d
edededz  deddf fdZddZ	ddZ
dedefdZedd            Z xZS )r   a  Applies Batch Normalization over a N-Dimensional input.

    The N-D input is a mini-batch of [N-2]D inputs with additional channel dimension) as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over all
    mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
    are learnable parameter vectors of size `C` (where `C` is the input size).
    By default, the elements of :math:`\gamma` are sampled from
    :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, correction=0)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
    statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
    Normalization or Spatio-temporal Batch Normalization.

    Currently :class:`SyncBatchNorm` only supports
    :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
    :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
    :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
    Network with DDP.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, +)`
        eps: a value added to the denominator for numerical stability.
            Default: ``1e-5``
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        process_group: synchronization of stats happen within each process group
            individually. Default behavior is synchronization across the whole
            world
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``

    Shape:
        - Input: :math:`(N, C, +)`
        - Output: :math:`(N, C, +)` (same shape as input)

    .. note::
        Synchronization of batchnorm statistics occurs only while training, i.e.
        synchronization is disabled when ``model.eval()`` is set or if
        ``self.training`` is otherwise ``False``.

    Examples::

        >>> # xdoctest: +SKIP
        >>> # With Learnable Parameters
        >>> m = nn.SyncBatchNorm(100)
        >>> # creating process group (optional)
        >>> # ranks is a list of int identifying rank ids.
        >>> ranks = list(range(8))
        >>> r1, r2 = ranks[:4], ranks[4:]
        >>> # Note: every rank calls into new_group for every
        >>> # process group created, even if that rank is not
        >>> # part of the group.
        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
        >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)

        >>> # network is nn.BatchNorm layer
        >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
        >>> # only single gpu per process is currently supported
        >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
        >>>                         sync_bn_network,
        >>>                         device_ids=[args.local_rank],
        >>>                         output_device=args.local_rank)
    r   r   TNr   r   r   r   r   r   process_groupr   r    c	                d    ||d}
 t                      j        |||||fi |
d|	i || _        d S ru   )r4   r5   r   )rA   r   r   r   r   r   r   r#   r$   r   rB   rC   s              r0   r5   zSyncBatchNorm.__init__  so     %+U;;	
 	
 	
 	
 	
 	
 	
 	
 +r2   c                     |                                 dk     r%t          d|                                  d          d S )Nr   z expected at least 2D input (got r   r   rO   s     r0   rQ   zSyncBatchNorm._check_input_dim  s<    99;;??U		UUUVVV ?r2   c                 V    |                     d          dk    rt          d          d S )Nr
   r   z9SyncBatchNorm number of input channels should be non-zero)sizer   rO   s     r0   _check_non_zero_input_channelsz,SyncBatchNorm._check_non_zero_input_channels  s4    ::a==AK   r2   rP   c                    |                      |           |                     |           | j        d}n| j        }| j        rb| j        r[| j        t          d          | j                            d           | j        d| j                                        z  }n| j        }	 | j        rd}n| j	        du o| j
        du }	 | j        r| j        r| j	        nd}| j        r| j        r| j
        nd}|oB| j        o;t          j                                        ot          j                                        }|r|j        j        ddd	t          j                                        fvr.t'          d
t          j                                                   t          j        j        j        }| j        r| j        }t          j                            |          }|dk    }|s*t1          j        |||| j        | j        ||| j                  S |st          d          t;          j        || j        | j        ||| j        |||	  	        S )z(
        Runs the forward pass.
        Nrw   z$num_batches_tracked must not be Noner
   rx   Tcudahpuxpuz;SyncBatchNorm expected input tensor to be on GPU or XPU or zbn_training must be True)rQ   r   r   ry   r   r(   r   rz   itemr&   r'   r6   distributedis_availableis_initializedr#   type_C_get_privateuse1_backend_namer   groupWORLDr   get_world_sizer{   r|   r%   r   r   sync_batch_normapply)	rA   rP   r}   r~   r&   r'   	need_syncr   
world_sizes	            r0   r   zSyncBatchNorm.forward  s    	e$$$++E222
 = ),&&)-&= 	;T5 	;'/$%KLLL$))!,,,}$-043K3P3P3R3R-R**-1]*	 = 	UKK,4T4;Kt;SK	 &*]Xd6NXDTX 	 %)MWT5MWDSW 	  33!..003 !0022	 	  	'| 6688	)   !Bx==??B B  
 "-39M! 3 $ 2*99-HHJ"QI  	<	*	 	 	  A$%?@@@"(	*
 
 
r2   c           
         |}t          |t          j        j        j        j                  rt          j                            |j        |j        |j	        |j
        |j        ||j        du          }|j
        rCt          j                    5  |j        |_        |j        |_        ddd           n# 1 swxY w Y   |j        |_        |j        |_        |j        |_        |j        |_        t'          |d          r|j        |_        |                                D ]/\  }}|                    ||                     ||                     0~|S )aa  Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers.

        Args:
            module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
            process_group (optional): process group to scope synchronization,
                default is the whole world

        Returns:
            The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
            layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
            a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
            instead.

        Example::

            >>> # Network with nn.BatchNorm layer
            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
            >>> module = torch.nn.Sequential(
            >>>            torch.nn.Linear(20, 100),
            >>>            torch.nn.BatchNorm1d(100),
            >>>          ).cuda()
            >>> # creating process group (optional)
            >>> # ranks is a list of int identifying rank ids.
            >>> ranks = list(range(8))
            >>> r1, r2 = ranks[:4], ranks[4:]
            >>> # Note: every rank calls into new_group for every
            >>> # process group created, even if that rank is not
            >>> # part of the group.
            >>> # xdoctest: +SKIP("distributed")
            >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
            >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
            >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

        Nr   qconfig)r   r6   nnmodules	batchnormrs   r   r   r   r   r   r   r   no_gradr%   r&   r'   r(   ry   hasattrr   named_children
add_moduleconvert_sync_batchnorm)clsmoduler   module_outputnamechilds         r0   r   z$SyncBatchNorm.convert_sync_batchnormy  s   H feh.8CDD 	7!H22#
*[, 3  M } 5]__ 5 5+1=M()/M&5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 *0)<M&(.(:M%060JM-%+_M"vy)) 7(.%!0022 	 	KD%$$c00FF    s   B11B58B5)r   r   TTNNNre   rJ   )rf   rg   rh   ri   rl   rn   ro   r   r5   rQ   r   r   r   classmethodr   rp   rq   s   @r0   r   r     s@       f fV !$$($(+ + + ++ + $,	+
 + "+ Tz+ + 
+ + + + + +2W W W W   aV a a a a aF = = = [= = = = =r2   r   )typingr   r6   r   torch.nnr   r{   r   torch.nn.parameterr   r   r	   
_functionsr   r   lazyr   r   r   __all__r   rs   r   r   r   r   r   r   r   r+   r2   r0   <module>r      s                * * * * * * * * U U U U U U U U U U 8 8 8 8 8 8 ! ! ! ! ! !        |
 |
 |
 |
 |
 |
 |
 |
~H
 H
 H
 H
 H
 H
 H
 H
VL$ L$ L$ L$ L$OY L$ L$ L$^KT KT KT KT KT* KT KT KT\!T !T !T !T !TmZ !T !T !THLN LN LN LN LN* LN LN LN^!N !N !N !N !NmZ !N !N !NHLN LN LN LN LN* LN LN LN^!N !N !N !N !NmZ !N !N !NHm m m m mJ m m m m mr2   