
    x-jIN                        d dl mZ d dlZd dlmZ d dlZd dlmZmZ d dlm	Z	 erd dl
mZ ej        d:d	            Z	 d;d<dZd=dZ	 d>d?dZ	 d@dZ	 d@dAd Z	 	 	 dBdCd$Z	 	 	 	 	 	 dDdEd+Z	 	 	 	 	 	 	 	 	 dFdGd9ZdS )H    )annotationsN)TYPE_CHECKING)Tensor_C_ops)in_dynamic_or_pir_mode)Sequencereturnr   c                     t                      S )z&Get tensor with no entries and no data)r        a/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/incubate/nn/functional/fp8.py_empty_tensorr      s     88Or   TxSequence[Tensor]	transposebooltuple[Tensor, Tensor]c                v    t                      r*|rt          j        |           S t          j        |           S dS )a  
    Fused operation that performs stacking, optional transposition, and quantization
    on a list of bfloat16 tensors.

    Args:
        x (list[Tensor] or tuple[Tensor]): A list or tuple of bfloat16 tensors, where each tensor
            has shape `[M, K]`. All tensors should have the same shape and dtype.
        transpose (bool, optional): If True, applies a transpose before quantization.
            Default is True.

    Returns:
        tuple:
            - out (Tensor): The quantized output tensor with dtype `float8_e4m3fn`.
            - scale (Tensor): A float32 tensor representing the quantization scale.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('BF16 requires SM80 or higher env')
            >>> import paddle
            >>> import paddle.incubate.nn.functional as F
            >>> paddle.set_device('gpu')

            >>> x_vec = []
            >>> num_experts = 1
            >>> seq_len = 2048
            >>> hidden_size = 128
            >>> for _ in range(num_experts):
            ...     x = paddle.randn([seq_len, hidden_size], dtype='bfloat16')
            ...     x = paddle.clip(x, min=-50, max=50)
            ...     x_vec.append(x)

            >>> out, scale = F.fused_stack_transpose_quant(x_vec, transpose=True)
            >>> print(out.shape)
            paddle.Size([128, 2048])
            >>> print(scale.shape)
            paddle.Size([1, 16])
    N)r   r   fused_stack_transpose_quantfused_stack_quant)r   r   s     r   r   r   #   sG    R  / 	/5a888+A...	/ /r   x_scalec                L    t                      rt          j        | |          S dS )a  
    Applies fused activation and dequantization operation to convert float8 quantized data back to bfloat16.

    Args:
        x (Tensor): Input quantized tensor with dtype float8_e4m3fn and shape [M, N]. This tensor contains the quantized
            activations from previous layers.
        x_scale (Tensor): Dequantization scale tensor with dtype float32 and shape [M, (N + 127) // 128] or int32 and shape [M, (N + 511) // 512].
            Each scale value corresponds to a 128-column in the input tensor.

    Returns:
        Tensor. Dequantized output tensor with dtype bfloat16 and shape [M, N]. The values are
            computed as input * scale for each corresponding 128-column block.
    N)r   r   fused_act_dequant)r   r   s     r   r   r   S   s/    "  4'73334 4r   o1do2_sunzipped_probsname
str | Nonetuple[Tensor, Tensor, Tensor]c                N    t                      rt          j        | ||          S dS )u>  
    Computes gradients for fused weighted SwiGLU activation function in backward pass.

    Note:
        This function performs the backward propagation for the SwiGLU (Swish-Gated Linear Unit)
        activation with probability weighting. It computes gradients with respect to both the
        input activations and the probability weights, while also recomputing forward pass values
        for memory efficiency. The kernel automatically selects between vectorized and standard
        implementations based on input dimensions.

    Args:
        o1 (Tensor): Forward pass input tensor with dtype bfloat16 and shape
            [..., intermediate_size * 2]. The tensor is split into two halves:
            - Left half [0:intermediate_size]: x1 values (gate inputs)
            - Right half [intermediate_size:]: x2 values (activation inputs)
            This is the same input used in the forward SwiGLU computation.
        do2_s (Tensor): Upstream gradient tensor with dtype bfloat16 and shape
            [..., intermediate_size]. Contains gradients flowing back from
            the next layer, representing ∂L/∂output before probability weighting.
            Each element corresponds to the gradient of one output element.
        unzipped_probs (Tensor): Probability weighting tensor with dtype float32 and
            shape matching the batch dimensions of o1 and do2_s
            [...]. Each probability value was used to weight the
            corresponding row's output in the forward pass.

    Returns:
        tuple:
            - do1 (Tensor). Input gradients with dtype bfloat16 and shape
              [..., intermediate_size * 2]. Layout matches o1:
              - [0:intermediate_size]: ∂L/∂x1 (gradients w.r.t. gate inputs)
              - [intermediate_size:]: ∂L/∂x2 (gradients w.r.t. activation inputs)
            - probs_grad (Tensor). Probability gradients with dtype float32 and
              shape [...]. Each element is ∂L/∂prob for the corresponding batch item,
              computed as the sum of (∂L/∂output_i * SwiGLU_output_i) across the
              intermediate dimension.
            - o2_s (Tensor). Recomputed forward output with dtype bfloat16 and
              shape [..., intermediate_size]. Contains SwiGLU(x1, x2) * prob values.
              This avoids storing forward activations, trading computation for memory.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('BF16 requires SM80 or higher env')
            >>> import paddle
            >>> import paddle.incubate.nn.functional as F
            >>> paddle.set_device('gpu')

            >>> batch_size, seq_len = 32, 128
            >>> intermediate_size = 2048

            >>> o1 = paddle.randn(
            ...     [batch_size, seq_len, intermediate_size * 2],
            ...     dtype='bfloat16',
            ... )
            >>> do2_s = paddle.randn([batch_size, seq_len, intermediate_size], dtype='bfloat16')
            >>> expert_probs = paddle.rand([batch_size, seq_len, 1], dtype='float32')

            >>> do1, probs_grad, o2_s = F.fused_swiglu_weighted_bwd(o1, do2_s, expert_probs)
            >>> print(do1.shape)
            paddle.Size([32, 128, 4096])
            >>> print(probs_grad.shape)
            paddle.Size([32, 128, 1])
            >>> print(o2_s.shape)
            paddle.Size([32, 128, 2048])
    N)r   r   fused_swiglu_weighted_bwd)r   r   r   r   s       r   r!   r!   h   s5    N  K/E>JJJK Kr   Fc                    d |D             }| j         d         dk    s| j         d         dk    rg g fS t                      rt          j        | |||          S dS )aB	  
    Applies fused transpose, split, and quantization operation for Mixture of Experts (MoE) models.

    Note:
        This function performs three operations in a single optimized CUDA kernel:
        1. Quantizes input from bfloat16 to float8_e4m3fn format using column-wise scaling
        2. Transposes the matrix from [M, K] to [K, M] layout
        3. Splits the transposed data across multiple experts based on token distribution

    Args:
        x (Tensor): Input tensor of shape [M, K] with dtype bfloat16, where M is the total
            number of tokens and K is the feature dimension. M must be divisible by 128
            for optimal performance.
        tokens_per_expert (List[int]): List containing the number of tokens assigned to each expert.
            Each value should be a multiple of 128 for optimal performance.
            The sum should equal M (total tokens). Values can be 0 for
            unused experts.
        pow_2_scales (bool, optional): Whether to constrain quantization scales to powers of 2
            for better hardware efficiency. If True, scales will be
            rounded to the nearest power of 2. Default: False.

    Returns:
        tuple:
            - outs (List[Tensor]). List of quantized and transposed output tensors, one per expert.
              Each tensor has shape [K, tokens_per_expert[i]] and dtype float8_e4m3fn.
              Empty tensors are included for experts with 0 tokens.
            - scales (List[Tensor]). List of dequantization scale tensors, one per expert.
              Each tensor has shape [K // 128, tokens_per_expert[i] // 128]
              and dtype float32. These are the reciprocal of quantization scales.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('BF16 requires SM80 or higher env')
            >>> import paddle
            >>> import paddle.incubate.nn.functional as F
            >>> paddle.set_device('gpu')

            >>> x = paddle.randn([384, 512], dtype='bfloat16')
            >>> x = paddle.clip(x, min=-50, max=50)
            >>> tokens_per_expert = [128, 128, 128]
            >>> outs, scales = F.fused_transpose_split_quant(x, None, tokens_per_expert, pow_2_scales=True)
            >>> print(outs[0].shape)
            paddle.Size([512, 128])
            >>> print(scales[0].shape)
            paddle.Size([1, 512])
    c                ,    g | ]}t          |          S r   int.0ts     r   
<listcomp>z/fused_transpose_split_quant.<locals>.<listcomp>       ;;;AQ;;;r   r      N)shaper   r   fused_transpose_split_quant)r   input_scalestokens_per_expertpow_2_scaless       r   r-   r-      sw    d <;):;;;wqzQ!'!*//2v 
1|.
 
 	

 
r   r/   Sequence[int]r0   !tuple[list[Tensor], list[Tensor]]c                f    d |D             }t                      rt          j        | ||          S d S )Nc                ,    g | ]}t          |          S r   r$   r&   s     r   r)   z4fused_transpose_wlch_split_quant.<locals>.<listcomp>   r*   r   )r   r    fused_transpose_wlch_split_quant)r   r/   r0   s      r   r5   r5      sN     <;):;;; 
6 ,
 
 	

 
r   probTensor | Noneusing_pow2_scalingc                N    t                      rt          j        | ||          S dS )aE
  
    Applies fused weighted SwiGLU activation followed by quantization to float8_e4m3fn format.

    Note:
        This function combines four operations into a single optimized CUDA kernel:
        1. SwiGLU activation: SwiGLU(x1, x2) = SiLU(x1) * x2 = (x1 * sigmoid(x1)) * x2
        2. Probability weighting: multiply by optional probability factors
        3. Activation computation: compute final activation values in float32 precision
        4. Quantization: convert results to float8_e4m3fn with computed scaling factors

        The input tensor is split into two halves along the last dimension:
        - Left half [0, cols/2): first input to SwiGLU (gate values)
        - Right half [cols/2, cols): second input to SwiGLU (activation values)

    Args:
        x (Tensor): Input tensor with dtype bfloat16 and shape [..., cols], where cols
            must be even. The tensor is interpreted as two concatenated matrices:
            gate values [0:cols/2] and activation values [cols/2:cols].
            Typical shapes: [batch_size, sequence_length, hidden_dim] or
            [tokens, expert_dim] in MoE scenarios.
        prob (Tensor, optional): Probability weighting tensor with dtype float32 and
            shape matching x's batch dimensions [...]. Each value
            multiplies the corresponding row's activation output.
        using_pow2_scaling (bool, optional): Whether to use power-of-2 quantization
            scaling for hardware efficiency.

    Returns:
        tuple:
            - out (Tensor). Quantized activation output with dtype float8_e4m3fn
              and shape [..., cols/2]. Contains the quantized SwiGLU results.
            - scale (Tensor). Dequantization scales with dtype float32 and shape
              [..., (cols/2 + 127) // 128]. Each scale corresponds to a 128-element
              block in the output tensor. To dequantize: original_value = quantized_value / scale.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('BF16 requires SM80 or higher env')
            >>> import paddle
            >>> import paddle.incubate.nn.functional as F
            >>> paddle.set_device('gpu')

            >>> batch_size, seq_len, expert_dim = 32, 128, 2048
            >>> x = paddle.randn([batch_size, seq_len, expert_dim], dtype='bfloat16')
            >>> quantized_out, scales = F.fused_weighted_swiglu_act_quant(x)
            >>> print(x.shape)
            paddle.Size([32, 128, 2048])
            >>> print(quantized_out.shape)
            paddle.Size([4096, 1024])
            >>> print(scales.shape)
            paddle.Size([4096, 8])
    N)r   r   fused_weighted_swiglu_act_quant)r   r6   r8   r   s       r   r:   r:      s:    t  
5t'
 
 	

 
r   outbias
accumulateuse_split_accumulatoris_a_1d_scaledis_b_1d_scaledc                   |
J d            |t                      }n)|j        t          j        t          j        fv s
J d            | j        \  }}|j        \  }}|t          j        ||f|          }nB|j        ||gk    sJ d||f d|j                     |                                s
J d            t                      rt          j	        j
                                        j        dk    rdnd	}t          j        |gt          j                  }d
\  }}d}d}t          j        ||| |||t                      ||||||||
|	          \  }}}|S d S )NzBias is not supportedz*Only fp16 and bfloat16 bias are supported.)dtypezExpected shape z, got z Output tensor is not contiguous.	   i   i  @ )TFFp   )r   rB   paddlefloat16bfloat16r,   emptyis_contiguousr   devicecudaget_device_propertiesmajoruint8r   fp8_gemm_blockwise_)aa_decode_scalebb_decode_scale	out_dtyper;   r<   r=   r>   r?   r@   MKNK_bworkspace_size	workspacetransatransbgradmath_sm_countoutput_s                          r   fp8_gemm_blockwisera   ;  s    <<0<<<|zNO
 
 
 
 8
 
 

 7DAqWFAs
{lAq6333y
 
 
 
 7aV663966
 
 
   ""FF$FFF"  }!7799?1DD J 	
 L.!1FFF	$ 1OO!!
 
1$ ? r           1x128e4m3epsilonfloatinput_transposeoutput_scale_transposereturn_transpose_onlyusing_pow2_scaleusing_ue8m0_scalequant_methodstroutput_typec
                    |dk    rd}
n|dk    rd}
nt          d          |dk    rd}nt          d          t                      r3t          j        | ||
||||||	  	        \  }}}}|s||fS |r||fS ||||fS dS )	a  
    Applies blockwise FP8 quantization to input tensor with flexible configuration options.

    Note:
        This function performs blockwise quantization from higher precision formats (typically bfloat16)
        to FP8 format (float8_e4m3fn by default). The quantization is performed in blocks (128x128 or 1x128)
        for better numerical stability and hardware efficiency.

    Args:
        x (Tensor): Input tensor to be quantized. Typically has dtype bfloat16 and shape [M, N].
        epsilon (float, optional): Small constant added to avoid division by zero when computing scales.
            Default: 0.0.
        input_transpose (bool, optional): Whether to transpose the input before quantization.
            If True, input shape [M, N] becomes [N, M]. Default: False.
        output_scale_transpose (bool, optional): Whether to transpose the output scale tensor.
            Default: True.
        return_transpose_only (bool, optional): If True and input_transpose is True, returns only
            the transposed quantized output and scale. Default: False.
        using_pow2_scale (bool, optional): Whether to use power-of-2 quantization scaling for
            better hardware efficiency. Default: True.
        using_ue8m0_scale (bool, optional): Whether to use unsigned 8-bit with mantissa 0 scaling format.
            If True, the output scale tensor has dtype int32, where each element contains 4 packed ue8m0 scales.
            If False, the output scale tensor has dtype float32.
            Default: False.
        quant_method (str, optional): Quantization block size method. Options: "1x128" or "128x128".
            "1x128" uses 1x128 blocks, "128x128" uses 128x128 blocks. Default: "1x128".
        output_type (str, optional): Output FP8 format. Currently only "e4m3" (float8_e4m3fn) is supported.
            Default: "e4m3".
        name (str, optional): Name for the operation. Default: None.

    Returns:
        tuple: The return value depends on the configuration:
            - If not input_transpose: returns (quantized_output, scale)
            - If return_transpose_only and input_transpose: returns (transposed_quantized_output, transposed_scale)
            - Otherwise: returns (quantized_output, scale, transposed_quantized_output, transposed_scale)

        Where:
            - quantized_output (Tensor): Quantized output tensor with dtype float8_e4m3fn.
            - scale (Tensor): Dequantization scale tensor with dtype float32.
            - transposed_quantized_output (Tensor): Transposed quantized output (if input_transpose).
            - transposed_scale (Tensor): Transposed scale tensor (if input_transpose).

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('BF16 requires SM80 or higher env')
            >>> import paddle
            >>> import paddle.incubate.nn.functional as F
            >>> paddle.set_device('gpu')

            >>> x = paddle.randn([1024, 512], dtype='bfloat16')
            >>> x = paddle.clip(x, min=-50, max=50)

            # Basic quantization
            >>> quantized, scale = F.fp8_quant_blockwise(x)
            >>> print(quantized.shape)
            paddle.Size([1024, 512])
            >>> print(scale.shape)
            paddle.Size([1024, 4])

            # With transpose
            >>> quantized, scale, transposed_quantized, transposed_scale = F.fp8_quant_blockwise(
            ...     x, input_transpose=True, return_transpose_only=False
            ... )
            >>> print(transposed_quantized.shape)
            paddle.Size([512, 1024])
    rc   T128x128FzUnsupported quantization methodrd   zUnsupported output typeN)
ValueErrorr   r   fp8_quant_blockwise)r   re   rg   rh   ri   rj   rk   rl   rn   r   using_1x128
using_e5m2x_fp8scalex_fp8_tscale_ts                   r   rr   rr     s    ^ w		"	":;;;f

2333 2)/)C"!
*
 
*
&ugw  	2%<" 	2G##%'11%2 2r   )r	   r   )T)r   r   r   r   r	   r   )r   r   r   r   r	   r   )N)
r   r   r   r   r   r   r   r   r	   r   )F)r   r   r/   r1   r0   r   r	   r2   )NFN)
r   r   r6   r7   r8   r   r   r   r	   r   )NNFTTT)r;   r7   r<   r7   r=   r   r>   r   r?   r   r@   r   )	rb   FTFTFrc   rd   N)r   r   re   rf   rg   r   rh   r   ri   r   rj   r   rk   r   rl   rm   rn   rm   r   r   )
__future__r   	functoolstypingr   rE   r   r   paddle.frameworkr   collections.abcr   cacher   r   r   r!   r-   r5   r:   ra   rr   r   r   r   <module>r      s   # " " " " "                  ! ! ! ! ! ! ! ! 3 3 3 3 3 3 )((((((     ,0-/ -/ -/ -/ -/`4 4 4 42 	HK HK HK HK HKX 6;:
 :
 :
 :
| GL
 
 
 
 
 $	=
 =
 =
 =
 =
L "&B B B B BN !#'"'!#m2 m2 m2 m2 m2 m2 m2r   