
    {-jD                        d dl mZ d dlZd dlmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZ d dlmZ erd dlmZ d d	lmZmZ d d
lmZ  G d dej        j                  Z G d dej        j                  ZdS )    )annotationsN)TYPE_CHECKINGLiteral)_C_ops)LayerHelper)in_dynamic_or_pir_modeno_grad)_BatchNormBase)Tensor)DataLayoutNDParamAttrLike)Layerc                  D     e Zd ZdZ	 	 	 	 	 	 	 dd fdZddZddZ xZS )	BatchNormaa  
    Applies Batch Normalization over a SparseCooTensor as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .

    When use_global_stats = False, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
    Calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\
        \ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \
        \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\

    When use_global_stats = True, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch.
    They are global or running statistics (moving_mean and moving_variance). It usually got from the
    pre-trained model. Calculated as follows:

    .. math::
        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\

    The normalization function formula is as follows:

    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
    - :math:`\gamma` : trainable proportional parameter
    - :math:`\beta` : trainable deviation parameter

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable.
            If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
            If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as bias_attr. If it is set to False, the weight is not learnable.
            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
        data_format(str, optional): Specify the input data format, may be "NDHWC" or "NHWC". Default "NDHWC".
        use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None.
        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..

    Shape:
        - x: A SparseCooTensor with layout = 'NDHWC' or 'NHWC'.
        - output: SparseCooTensor with same shape as input x.

    Returns:
        None.


    Examples:
        .. code-block:: pycon

            >>> import paddle
            >>> paddle.seed(123)
            >>> channels = 3
            >>> x_data = paddle.randn((1, 6, 6, 6, channels)).astype('float32')
            >>> dense_x = paddle.to_tensor(x_data)
            >>> sparse_x = dense_x.to_sparse_coo(4)
            >>> batch_norm = paddle.sparse.nn.BatchNorm(channels)
            >>> batch_norm_out = batch_norm(sparse_x)
            >>> print(batch_norm_out.shape)
            paddle.Size([1, 6, 6, 6, 3])
    ?h㈵>NNDHWCnum_featuresintmomentumfloatepsilonweight_attrParamAttrLike | None	bias_attrdata_formatLiteral['NDHWC', 'NHWC']use_global_statsbool | Nonename
str | NonereturnNonec	           
     Z    t                                          ||||||||           d S )N)r   r   r   r   r   r   r    super__init__)
selfr   r   r   r   r   r   r   r    	__class__s
            [/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/sparse/nn/layer/norm.pyr'   zBatchNorm.__init__m   sH     	##- 	 		
 		
 		
 		
 		
    inputc                ,    |dvrt          d          d S )N)r   NHWCz:sparse BatchNorm only support layout of "NDHWC" and "NHWC")
ValueError)r(   r,   s     r*   _check_data_formatzBatchNorm._check_data_format   s+    )))L   *)r+   r   c                   |                      | j                   | j        rt          j        d           | j        | j         | _        d}n| j         }| j        d         dk    rdnd}t                      rPt          j        || j	        | j
        | j        | j        | j         | j        | j        || j        |          \  }}}}}}|S || j        | j        | j	        | j
        d}| j        | j        || j         | j        |dd}d	}t          |          }	|j        }
|	                    |
d
          }|	                    |
d
          }|	                    |
d
          }|	                    |
d
          }|	                    |
d
          }|	                    |
          }||||||d}|	                    ||||           |S )Nz<When training, we now always track global mean and variance.F   CNCHWr.   )xscalebiasmeanvariance)r   r   data_layoutis_testr   trainable_statisticsfuse_with_relusparse_batch_normT)dtypestop_gradient)outmean_outvariance_out
saved_meansaved_variancereserve_space)typeinputsoutputsattrs)r0   _data_formattrainingwarningswarn_use_global_statsr   r   sparse_batch_norm__mean	_varianceweightr7   	_momentum_epsilonr   r?   "create_variable_for_type_inference)create_sparse_variable_for_type_inference	append_op)r(   r,   r<   r   batch_norm_out_rH   rJ   op_typehelperr?   rB   rC   rD   rE   rF   rA   rI   s                     r*   forwardzBatchNorm.forward   se    1222= 	MN   !))-%6D"#(  '+'=#=  $ 1! 4 ; ;ff!## >	,2,E
	M!&$- -)NAq!Q "! 	
 N F !N=*#},$($:(<"' E *G ))FKE@@4 A  H "DD4 E  L  BB4 C  J $FF4 G  N #EE4 F  M BB5IIC$ ,("0!. G VWE     Jr+   )r   r   NNr   NN)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   )r,   r   r"   r#   )r,   r   r"   r   )__name__
__module____qualname____doc__r'   r0   r]   __classcell__r)   s   @r*   r   r   #   s        G GX ,0*.07(,
 
 
 
 
 
 
,   N N N N N N N Nr+   r   c                  R     e Zd ZdZ	 	 	 	 	 	 dd fdZddZedd            Z xZS )SyncBatchNorma  
    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
    It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can
    be used as a normalizer function for other operations, such as conv2d and fully connected
    operations.
    The data is normalized by the mean and variance of the channel based on whole mini-batch
    , which including data in all gpus.
    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
    for more details.

    When model in training mode, the :math:`\\mu_{\\beta}`
    and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus.
    Calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\
        \ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \
        \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\

    - :math:`x` : whole mini-batch data in all gpus
    - :math:`m` : the size of the whole mini-batch data

    When model in evaluation mode, the :math:`\\mu_{\\beta}`
    and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance,
    which usually got from the pre-trained model). Global statistics calculated as follows:

    .. math::
        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\

    The formula of normalization is as follows:

    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
    - :math:`\gamma` : trainable scale parameter vector
    - :math:`\beta` : trainable shift parameter vector

    Note:
        If you want to use container to pack your model and has ``SyncBatchNorm`` in the
        evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of
        ``list`` to pack the model.

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
             of this layer. If it is set to None or one attribute of ParamAttr, this layer
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. If it is set to False,
             this layer will not have trainable scale parameter. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer.
             If it is set to None or one attribute of ParamAttr, this layer
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. If it is set to False, this layer will not
             have trainable bias parameter. Default: None.
        data_format(str, optional): Specify the input data format, may be "NCHW". Default "NCHW".
        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..

    Shapes:
        input: Tensor that the dimension from 2 to 5.

        output: Tensor with the same shape as input.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> import paddle.sparse.nn as nn
            >>> paddle.device.set_device('gpu')

            >>> x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]], dtype='float32')
            >>> x = x.to_sparse_coo(len(x.shape)-1)

            >>> if paddle.is_compiled_with_cuda():
            ...     sync_batch_norm = nn.SyncBatchNorm(2)
            ...     hidden1 = sync_batch_norm(x)
            ...     print(hidden1)
            Tensor(shape=[1, 2, 2, 2], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=False,
                   indices=[[0, 0, 0, 0],
                            [0, 0, 1, 1],
                            [0, 1, 0, 1]],
                   values=[[-0.40730840, -0.13725480],
                            [-0.40730840, -1.20299828],
                            [ 1.69877410, -0.23414057],
                            [-0.88415730,  1.57439375]])
    r   r   Nr4   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   c           	     V    t                                          |||||||           d S )Nr%   )	r(   r   r   r   r   r   r   r    r)   s	           r*   r'   zSyncBatchNorm.__init__<  s@     		
 	
 	
 	
 	
r+   r5   r   c                    |                                   t          j        || j        | j        | j        | j        | j         | j        | j	        | j
        dd          \  }}}}}}|S )NF)r0   r   sparse_sync_batch_norm_rQ   rR   rS   r7   rL   rT   rU   rK   )r(   r5   sync_batch_norm_outrZ   s       r*   r]   zSyncBatchNorm.forwardP  sq    !!!-3-KJNKINM.
 .
*Q1a #"r+   layerr   c           	        |}t          |t                    r|j        ?t          |j        t                    s%|j        j        |j        j        dz   |j        _        |j        ?t          |j        t                    s%|j        j        |j        j        dz   |j        _        t          |t                    r9t          |j        |j	        |j
        |j        |j        |j        |j                  }nHt          j                            |j        |j	        |j
        |j        |j        |j        |j                  }|j        durG|j        dur>t                      5  |j        |_        |j        |_        ddd           n# 1 swxY w Y   |j        |_        |j        |_        |                                D ].\  }}|                    ||                     |                     /~|S )a  
        Helper function to convert :class: `paddle.sparse.nn.BatchNorm` layers in the model to :class: `paddle.sparse.nn.SyncBatchNorm` layers.

        Parameters:
            layer(paddle.nn.Layer): model containing one or more `BatchNorm` layers.

        Returns:
            The original model with converted SyncBatchNorm layers. If BatchNorm layer in the model, use SyncBatchNorm layer instead.

        Examples:

            .. code-block:: python

                >>> import paddle
                >>> import paddle.sparse.nn as nn

                >>> model = paddle.nn.Sequential(nn.Conv3D(3, 5, 3), nn.BatchNorm(5))
                >>> sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        N_syncF)
isinstancer
   _weight_attrboolr    
_bias_attrr   re   _num_featuresrT   rU   rK   _namepaddlennr	   rS   r7   rQ   rR   named_childrenadd_sublayerconvert_sync_batchnorm)clsrj   layer_outputr    sublayers        r*   rw   z$SyncBatchNorm.convert_sync_batchnorma  s(   , e^,, -	5"."5#5t<< /&+7*/*<*AG*K"' ,"5#3T:: -$)5(-(8(=(G % %++ ,'ON&$&K     &y66'ON&$&K    "%//$E11YY 3 3*/,L'(-
L%3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 "'L%*_L"#2244 	 	ND(%%c00::    s   F  FF)r   r   NNr4   N)r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   )r5   r   r"   r   )rj   r   r"   r   )	r^   r_   r`   ra   r'   r]   classmethodrw   rb   rc   s   @r*   re   re      s        _ _H ,0*.$*
 
 
 
 
 
 
(# # # #" J J J [J J J J Jr+   re   )
__future__r   rM   typingr   r   rs   r   paddle.base.layer_helperr   paddle.frameworkr   r	   paddle.nn.layer.normr
   r   paddle._typingr   r   	paddle.nnr   rt   BatchNorm1Dr   re    r+   r*   <module>r      sb   # " " " " "  ) ) ) ) ) ) ) )        0 0 0 0 0 0 < < < < < < < < / / / / / /           t t t t t	% t t tnR R R R RFI+ R R R R Rr+   