
    ϑi?                       S SK Jr  S SKrS SKJr  S SKrS SKJrJr  S SKJ	r	  \(       a  S SK
Jr  \R                  SS j5       r S     SS jjr      SS	 jr S         SS
 jjr SS jr S       SS jjr   S         SS jjr      S           SS jjr        S                 SS jjrg)    )annotationsN)TYPE_CHECKING)Tensor_C_ops)in_dynamic_or_pir_mode)Sequencec                     [        5       $ )z&Get tensor with no entries and no data)r        a/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/incubate/nn/functional/fp8.py_empty_tensorr      s     8Or   c                    [        5       (       a3  U(       a  [        R                  " U 5      $ [        R                  " U 5      $ g)a  
Fused operation that performs stacking, optional transposition, and quantization
on a list of bfloat16 tensors.

Args:
    x (list[Tensor] or tuple[Tensor]): A list or tuple of bfloat16 tensors, where each tensor
        has shape `[M, K]`. All tensors should have the same shape and dtype.
    transpose (bool, optional): If True, applies a transpose before quantization.
        Default is True.

Returns:
    tuple:
        - out (Tensor): The quantized output tensor with dtype `float8_e4m3fn`.
        - scale (Tensor): A float32 tensor representing the quantization scale.

Examples:
    .. code-block:: pycon

        >>> # doctest: +SKIP('BF16 requires SM80 or higher env')
        >>> import paddle
        >>> import paddle.incubate.nn.functional as F
        >>> paddle.set_device('gpu')

        >>> x_vec = []
        >>> num_experts = 1
        >>> seq_len = 2048
        >>> hidden_size = 128
        >>> for _ in range(num_experts):
        ...     x = paddle.randn([seq_len, hidden_size], dtype='bfloat16')
        ...     x = paddle.clip(x, min=-50, max=50)
        ...     x_vec.append(x)

        >>> out, scale = F.fused_stack_transpose_quant(x_vec, transpose=True)
        >>> print(out.shape)
        paddle.Size([128, 2048])
        >>> print(scale.shape)
        paddle.Size([1, 16])
N)r   r   fused_stack_transpose_quantfused_stack_quant)x	transposes     r   r   r   #   s8    R 55a88++A..	  r   c                N    [        5       (       a  [        R                  " X5      $ g)av  
Applies fused activation and dequantization operation to convert float8 quantized data back to bfloat16.

Args:
    x (Tensor): Input quantized tensor with dtype float8_e4m3fn and shape [M, N]. This tensor contains the quantized
        activations from previous layers.
    x_scale (Tensor): Dequantization scale tensor with dtype float32 and shape [M, (N + 127) // 128].
        Each scale value corresponds to a 128-column block in the input tensor.

Returns:
    Tensor. Dequantized output tensor with dtype bfloat16 and shape [M, N]. The values are
        computed as input * scale for each corresponding 128-column block.
N)r   r   fused_act_dequant)r   x_scales     r   r   r   S   s#    " ''33  r   c                P    [        5       (       a  [        R                  " XU5      $ g)u^  
Computes gradients for fused weighted SwiGLU activation function in backward pass.

Note:
    This function performs the backward propagation for the SwiGLU (Swish-Gated Linear Unit)
    activation with probability weighting. It computes gradients with respect to both the
    input activations and the probability weights, while also recomputing forward pass values
    for memory efficiency. The kernel automatically selects between vectorized and standard
    implementations based on input dimensions.

Args:
    o1 (Tensor): Forward pass input tensor with dtype bfloat16 and shape
        [..., intermediate_size * 2]. The tensor is split into two halves:
        - Left half [0:intermediate_size]: x1 values (gate inputs)
        - Right half [intermediate_size:]: x2 values (activation inputs)
        This is the same input used in the forward SwiGLU computation.
    do2_s (Tensor): Upstream gradient tensor with dtype bfloat16 and shape
        [..., intermediate_size]. Contains gradients flowing back from
        the next layer, representing ∂L/∂output before probability weighting.
        Each element corresponds to the gradient of one output element.
    unzipped_probs (Tensor): Probability weighting tensor with dtype float32 and
        shape matching the batch dimensions of o1 and do2_s
        [...]. Each probability value was used to weight the
        corresponding row's output in the forward pass.

Returns:
    tuple:
        - do1 (Tensor). Input gradients with dtype bfloat16 and shape
          [..., intermediate_size * 2]. Layout matches o1:
          - [0:intermediate_size]: ∂L/∂x1 (gradients w.r.t. gate inputs)
          - [intermediate_size:]: ∂L/∂x2 (gradients w.r.t. activation inputs)
        - probs_grad (Tensor). Probability gradients with dtype float32 and
          shape [...]. Each element is ∂L/∂prob for the corresponding batch item,
          computed as the sum of (∂L/∂output_i * SwiGLU_output_i) across the
          intermediate dimension.
        - o2_s (Tensor). Recomputed forward output with dtype bfloat16 and
          shape [..., intermediate_size]. Contains SwiGLU(x1, x2) * prob values.
          This avoids storing forward activations, trading computation for memory.

Examples:
    .. code-block:: pycon

        >>> # doctest: +SKIP('BF16 requires SM80 or higher env')
        >>> import paddle
        >>> import paddle.incubate.nn.functional as F
        >>> paddle.set_device('gpu')

        >>> batch_size, seq_len = 32, 128
        >>> intermediate_size = 2048

        >>> o1 = paddle.randn(
        ...     [batch_size, seq_len, intermediate_size * 2],
        ...     dtype='bfloat16',
        ... )
        >>> do2_s = paddle.randn([batch_size, seq_len, intermediate_size], dtype='bfloat16')
        >>> expert_probs = paddle.rand([batch_size, seq_len, 1], dtype='float32')

        >>> do1, probs_grad, o2_s = F.fused_swiglu_weighted_bwd(o1, do2_s, expert_probs)
        >>> print(do1.shape)
        paddle.Size([32, 128, 4096])
        >>> print(probs_grad.shape)
        paddle.Size([32, 128, 1])
        >>> print(o2_s.shape)
        paddle.Size([32, 128, 2048])
N)r   r   fused_swiglu_weighted_bwd)o1do2_sunzipped_probsnames       r   r   r   h   s&    N //>JJ  r   c                    U Vs/ s H  n[        U5      PM     nnU R                  S   S:X  d  U R                  S   S:X  a  / / 4$ [        5       (       a  [        R                  " XX#5      $ gs  snf )a  
Applies fused transpose, split, and quantization operation for Mixture of Experts (MoE) models.

Note:
    This function performs three operations in a single optimized CUDA kernel:
    1. Quantizes input from bfloat16 to float8_e4m3fn format using column-wise scaling
    2. Transposes the matrix from [M, K] to [K, M] layout
    3. Splits the transposed data across multiple experts based on token distribution

Args:
    x (Tensor): Input tensor of shape [M, K] with dtype bfloat16, where M is the total
        number of tokens and K is the feature dimension. M must be divisible by 128
        for optimal performance.
    tokens_per_expert (List[int]): List containing the number of tokens assigned to each expert.
        Each value should be a multiple of 128 for optimal performance.
        The sum should equal M (total tokens). Values can be 0 for
        unused experts.
    pow_2_scales (bool, optional): Whether to constrain quantization scales to powers of 2
        for better hardware efficiency. If True, scales will be
        rounded to the nearest power of 2. Default: False.

Returns:
    tuple:
        - outs (List[Tensor]). List of quantized and transposed output tensors, one per expert.
          Each tensor has shape [K, tokens_per_expert[i]] and dtype float8_e4m3fn.
          Empty tensors are included for experts with 0 tokens.
        - scales (List[Tensor]). List of dequantization scale tensors, one per expert.
          Each tensor has shape [K // 128, tokens_per_expert[i] // 128]
          and dtype float32. These are the reciprocal of quantization scales.

Examples:
    .. code-block:: pycon

        >>> # doctest: +SKIP('BF16 requires SM80 or higher env')
        >>> import paddle
        >>> import paddle.incubate.nn.functional as F
        >>> paddle.set_device('gpu')

        >>> x = paddle.randn([384, 512], dtype='bfloat16')
        >>> x = paddle.clip(x, min=-50, max=50)
        >>> tokens_per_expert = [128, 128, 128]
        >>> outs, scales = F.fused_transpose_split_quant(x, None, tokens_per_expert, pow_2_scales=True)
        >>> print(outs[0].shape)
        paddle.Size([512, 128])
        >>> print(scales[0].shape)
        paddle.Size([1, 512])
r      N)intshaper   r   fused_transpose_split_quant)r   input_scalestokens_per_expertpow_2_scalests        r   r    r       sr    d *;;):AQ):;wwqzQ!''!*/2v11.
 	
   <s   A-c                    U Vs/ s H  n[        U5      PM     nn[        5       (       a  [        R                  " XU5      $ g s  snf N)r   r   r    fused_transpose_wlch_split_quant)r   r"   r#   r$   s       r   r'   r'      sJ     *;;):AQ):;66,
 	
   <s   Ac                P    [        5       (       a  [        R                  " XU5      $ g)a	  
Applies fused weighted SwiGLU activation followed by quantization to float8_e4m3fn format.

Note:
    This function combines four operations into a single optimized CUDA kernel:
    1. SwiGLU activation: SwiGLU(x1, x2) = SiLU(x1) * x2 = (x1 * sigmoid(x1)) * x2
    2. Probability weighting: multiply by optional probability factors
    3. Activation computation: compute final activation values in float32 precision
    4. Quantization: convert results to float8_e4m3fn with computed scaling factors

    The input tensor is split into two halves along the last dimension:
    - Left half [0, cols/2): first input to SwiGLU (gate values)
    - Right half [cols/2, cols): second input to SwiGLU (activation values)

Args:
    x (Tensor): Input tensor with dtype bfloat16 and shape [..., cols], where cols
        must be even. The tensor is interpreted as two concatenated matrices:
        gate values [0:cols/2] and activation values [cols/2:cols].
        Typical shapes: [batch_size, sequence_length, hidden_dim] or
        [tokens, expert_dim] in MoE scenarios.
    prob (Tensor, optional): Probability weighting tensor with dtype float32 and
        shape matching x's batch dimensions [...]. Each value
        multiplies the corresponding row's activation output.
    using_pow2_scaling (bool, optional): Whether to use power-of-2 quantization
        scaling for hardware efficiency.

Returns:
    tuple:
        - out (Tensor). Quantized activation output with dtype float8_e4m3fn
          and shape [..., cols/2]. Contains the quantized SwiGLU results.
        - scale (Tensor). Dequantization scales with dtype float32 and shape
          [..., (cols/2 + 127) // 128]. Each scale corresponds to a 128-element
          block in the output tensor. To dequantize: original_value = quantized_value / scale.

Examples:
    .. code-block:: pycon

        >>> # doctest: +SKIP('BF16 requires SM80 or higher env')
        >>> import paddle
        >>> import paddle.incubate.nn.functional as F
        >>> paddle.set_device('gpu')

        >>> batch_size, seq_len, expert_dim = 32, 128, 2048
        >>> x = paddle.randn([batch_size, seq_len, expert_dim], dtype='bfloat16')
        >>> quantized_out, scales = F.fused_weighted_swiglu_act_quant(x)
        >>> print(x.shape)
        paddle.Size([32, 128, 2048])
        >>> print(quantized_out.shape)
        paddle.Size([4096, 1024])
        >>> print(scales.shape)
        paddle.Size([4096, 8])
N)r   r   fused_weighted_swiglu_act_quant)r   probusing_pow2_scalingr   s       r   r)   r)      s,    t 55'
 	
  r   c                   Ub   S5       eUc  [        5       nO5UR                  [        R                  [        R                  4;   d   S5       eU R
                  u  pUR
                  u  pUc  [        R                  " X4US9nOFUR
                  UU/:X  d   SX4 SUR
                   35       eUR                  5       (       d   S5       e[        5       (       a  [        R                  R                  R                  5       R                  S:  a  SOS	n[        R                  " U/[        R                  S9nS
u  nnSnSn[        R                  " UUU UUU[        5       UUUUUUUU
U	5      u  n  nU$ g )NzBias is not supportedz*Only fp16 and bfloat16 bias are supported.)dtypezExpected shape z, got z Output tensor is not contiguous.	   i   i  @ )TFFp   )r   r-   paddlefloat16bfloat16r   emptyis_contiguousr   devicecudaget_device_propertiesmajoruint8r   fp8_gemm_blockwise_)aa_decode_scalebb_decode_scale	out_dtypeoutbias
accumulateuse_split_accumulatoris_a_1d_scaledis_b_1d_scaledMKNK_bworkspace_size	workspacetransatransbgradmath_sm_countoutput_s                          r   fp8_gemm_blockwiserR   ;  s    <000<|zzNNOO
 
 	8 8	8 

 77DAWWFA
{llA63yy
 
 	7 aVHF399+6	7 
   ""F$FF" }}!!779??1D  	
 LL.!1F	$ 11O!!
1$ ?  r   c	           
         US:X  a  Sn	OUS:X  a  Sn	O[        S5      eUS:X  a  Sn
O[        S5      e[        5       (       a8  [        R                  " U UU	UUUU
U5      u  ppU(       d  X4$ U(       a  X4$ XX4$ g )N1x128T128x128FzUnsupported quantization methode4m3zUnsupported output type)
ValueErrorr   r   fp8_quant_blockwise)r   epsiloninput_transposeoutput_scale_transposereturn_transpose_onlyusing_pow2_scalequant_methodoutput_typer   using_1x128
using_e5m2x_fp8scalex_fp8_tscale_ts                  r   rX   rX     s     w		":;;f
233)/)C)C"!	*
&g <"##11#  r   )returnr   )T)r   zSequence[Tensor]r   boolrf   tuple[Tensor, Tensor])r   r   r   r   rf   r   r&   )
r   r   r   r   r   r   r   
str | Nonerf   ztuple[Tensor, Tensor, Tensor])F)r   r   r"   zSequence[int]r#   rg   rf   z!tuple[list[Tensor], list[Tensor]])NFN)
r   r   r*   Tensor | Noner+   rg   r   ri   rf   rh   )NNFTTT)r@   rj   rA   rj   rB   rg   rC   rg   rD   rg   rE   rg   )g        FTFTrT   rV   N)r   r   rY   floatrZ   rg   r[   rg   r\   rg   r]   rg   r^   strr_   rl   r   ri   )
__future__r   	functoolstypingr   r0   r   r   paddle.frameworkr   collections.abcr   cacher   r   r   r   r    r'   r)   rR   rX   r
   r   r   <module>rs      s   #     ! 3(   ,0-/-/$(-/-/`444 42 	HKHKHK HK 	HK
 #HKX 6;:
| GL

"/
?C
&
 $	=
=

=
 =
 	=

 =
L "&B 
B B B  B B BN !#'"'!(2(2(2 (2 !	(2
  (2 (2 (2 (2 (2r   