
    ёi8                       % S SK Jr  S SKJrJr  S SKrS SKJr  S SKJr  S SK	J
r
  S SKJr  S SKJrJr  \(       a&  S S	KJr  S S
KJr  S SKJr  \S   rS\S'   \S   rS\S'   S r   S         SS jjr   S           SS jjr     S               SS jjr   S           SS jjrSS jrg)     )annotations)TYPE_CHECKINGLiteralN)_C_ops)check_dtype)is_compiled_with_cuda)get_device_capability)LayerHelperin_dynamic_or_pir_mode)	TypeAlias)Tensor)	DTypeLike)weight_only_int8weight_only_int4zllm.int8r   _Algo)@      
_GroupSizec                     [        5       (       ak  [        R                  R                  5       n U b  U S:w  d  [        R                  " 5       (       a  [        5       u  p[        US-  U-   5      nU$ [        S5      eg)NFalse
   zmPaddle is not compiled with CUDA, we cannot get SMVersion from device, please try to compile Paddle with CUDAr   )r   paddleversioncudais_compiled_with_rocmr	   int
ValueError)cuda_versionmajorminorarchs       `/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/nn/quant/quantized_linear.py_get_arch_infor$   +   sp    ~~**,$)@))++02LEurzE)*DK 
     c           	        Uc
  [        5       n[        5       (       a;  US:X  d5  US:X  d/  US:X  d)  US:X  d#  US:X  d  US:X  d  US:X  d  US:X  d   S	U S
35       eUS:X  d  US:X  d  US:X  d   SU S
35       e[        5       (       a  [        R                  " XX#5      $ Sn[        U40 [        5       D6nUR                  S5      nUR                  S5      nUR                  USU 0XgS.XUS.S9  Xg4$ )a"  
Quantization function for weight_only and llm.int8's weight.

Args:
    x (Tensor): The input Tensor to be quantized, the data type is float16 or bfloat16.
    algo (str): The algo that is x will be apply, must be one of 'weight_only_int8',
        'weight_only_int4', 'llm.int8', 'w4a8' and 'w4afp8, default: 'weight_only_int8'.
    arch (int): The compute arch for target device. For example, A100 is 80, v100 is 70, if you do not assign arch, we will get arch from your device, default: None.
    group_size (int): The group size for weight quantization. -1 stands for default per-channel mode. Currently only support 64 or 128.

Returns:
    out (Tensor): The Tensor which is the quantitative results, the data type is int8, the shape is transposition of x.
    scale (Tensor): The scale Tensor which is the scale of pre-channel, the data type is float32.
Examples:
    .. code-block:: pycon

        >>> # doctest: +SKIP('No testing required')
        >>> import paddle
        >>> from paddle.nn.quant import weight_quantize

        >>> paddle.seed(2023)
        >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
        >>> out, scale = weight_quantize(x, algo='weight_only_int8')
        >>> print(out.shape)
        paddle.Size([32, 64])
        >>> print(scale.shape)
        paddle.Size([32])
F   K   P   V   Y   Z   \   d   KCurrently weight_quantize only support SM70/75/80/86/89/90/92/100. but got  r   r   r   5Currently group_size only support -1/64/128. but got weight_quantizeint8floatx)outscale)algor"   
group_sizetypeinputsoutputsattrs)	r$   r   r   r   r2   r
   locals"create_variable_for_type_inference	append_op)r5   r8   r"   r9   r;   helperr6   r7   s           r#   r2   r2   >   s.   D |BJrzrzrzrzrzrzs{	
 ZZ^Y__`a	
 zR/:3D 
?
|1MD %%at@@ T.VX.77?99'B80ZH	 	 	
 |r%   c                $   US:X  d  US:X  d  US:X  d   SU S35       e[        5       (       a  [        R                  " XX$5      $ Sn[        U40 [	        5       D6nUR
                  nUR                  U5      nUR                  UXS.SU0UUS	.S
9  U$ )aV  
Dequantization function for weight_only and llm.int8's weight.

Args:
    x (Tensor): The input Tensor to be dequantized, the data type is int8.
    scale (Tensor): The scale Tensor which is the output of weight_quantize, the data type is float32.
    algo (str): The algo that is x will be apply, must be one of 'weight_only_int8',
        'weight_only_int4' and 'llm.int8', default: 'weight_only_int8'.
    out_dtype (str|np.dtype): [Deprecated][Not used] The output Tensor's data type, must be one of 'float16' and 'bfloat16', default: 'float16'.

Returns:
    out (Tensor): The Tensor which is the dequantitative results, the data type is float16 or bfloat16, the shape is transposition of x.

Examples:
    .. code-block:: python

        >>> # doctest: +SKIP('No testing required')
        >>> import paddle
        >>> from paddle.nn.quant import weight_quantize, weight_dequantize

        >>> paddle.seed(2023)
        >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
        >>> out, scale = weight_quantize(x, algo='weight_only_int8')
        >>> x_dequant = weight_dequantize(out, scale)
r   r   r   r1   r0   weight_dequantize)r5   r7   r6   )r8   r9   r:   )r   r   rD   r
   r?   dtyper@   rA   )r5   r7   r8   	out_dtyper9   r;   rB   r6   s           r#   rD   rD      s    @ zR/:3D 
?
|1MD ''$CC"T.VX.KK	77	B+CL(	 	 	
 
r%   c           	         Uc
  [        5       n[        5       (       a;  US:X  d5  US:X  d/  US:X  d)  US:X  d#  US:X  d  US:X  d  US:X  d  US:X  d   S	U S
35       eUS:X  d  US:X  d  US:X  d   SU S
35       e[        5       (       a  [        R                  " XX#XEU5      nU$ [        USSS/S5        Sn[        U40 [        5       D6n	U R                  n
U /U/U/S.nUb  U/US'   UUUS.nU	R                  U
5      nU	R                  UUSU0US9  U$ )ab  
Applies matrix multiplication of two tensors and then bias addition if provided.
This method requires CUDA version >= 11.2.

Args:
    x (Tensor): The first input Tensor to be multiplied, the data type is float16 or bfloat16.
    weight (Tensor): The second input Tensor to be multiplied. Its rank must be 2.
    bias (Tensor|None): The input bias Tensor. If it is None, no bias addition would
        be performed. Otherwise, The bias is added to the matrix multiplication result.
    weight_scale (Tensor|None): The input scale Tensor Provided to weight for dequantization. Its rank must be 1.
    weight_dtype(str): The dtype of  weight Tensor, must be one of 'int8', 'int4', Defaulted to 'int8'.
    arch (int): The compute arch for target device. For example, A100 is 80, v100 is 70, if you do not assign arch, we will get arch from your device, default: None.
    group_size (int): The group size for weight quantization. -1 stands for default per-channel mode. Currently only support 64 or 128.
Returns:
    Tensor: the output Tensor, the data type is the same as that of x.

Examples:
    .. code-block:: pycon

        >>> # doctest: +SKIP('No testing required')
        >>> import paddle
        >>> from paddle.nn.quant import weight_only_linear

        >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
        >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
        >>> scale = paddle.randn([32], dtype='float32')
        >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
        >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
        ...     out = weight_only_linear(
        ...         x,
        ...         weight,
        ...         bias=bias,
        ...         weight_scale=scale,
        ...         weight_dtype='int8',
        ...     )
        ...     print(out.shape)
        paddle.Size([1, 2, 32])
r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r   r   r   zLCurrently weight_quantize only support group size of -1, 64 or 128. but got weight_dtyper3   int4weight_only_linearr5   weightweight_scalebias)rH   r"   r9   r6   r:   )r$   r   r   r   rJ   r   r
   r?   rE   r@   rA   )r5   rL   rN   rM   rH   r"   r9   r6   r;   rB   rE   r<   r>   s                r#   rJ   rJ      s   ^ |BJrzrzrzrzrzrzs{	
 ZZ^Y__`a	
 zR/:3D 
VWaVbbcdD ''t<z
 
.66*:<P	
 $T.VX. h)N

 "VF6N($
 77>CL	 	 	
 
r%   c                   [        5       (       a  [        R                  " XX#U5      nU$ Sn[        U40 [	        5       D6nU R
                  nU /U/U/S.n	U(       a  U/U	S'   SU0n
UR                  U5      nUR                  UU	SU0U
S9  U$ )a  
Applies matrix multiplication of two tensors and then bias addition if provided.
This method requires CUDA version >= 11.2.

Args:
    x (Tensor): the first input Tensor to be multiplied, the data type is float16 or bfloat16.
    weight (Tensor): the second input Tensor to be multiplied. Its rank must be 2.
    bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
        be performed. Otherwise, the bias is added to the matrix multiplication result.
    weight_scale (Tensor|None): the input scale Tensor Provided to weight for dequantization. Its rank must be 1.
    threshold(float): The min value of outlier in activation, outlier's channel will be apply multiply with x.dtype.

Returns:
    Tensor: the output Tensor, the data type is the same as that of x.

Examples:
    .. code-block:: pycon

        >>> # doctest: +SKIP('No testing required')
        >>> import paddle
        >>> from paddle.nn.quant import llm_int8_linear

        >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
        >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
        >>> scale = paddle.randn([32], dtype='float32')
        >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
        >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
        ...     out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
        ...     print(out.shape)
        paddle.Size([1, 2, 32])
llm_int8_linearrK   rN   	thresholdr6   r:   )r   r   rP   r
   r?   rE   r@   rA   )r5   rL   rN   rM   rQ   r6   r;   rB   rE   r<   r>   s              r#   rP   rP   %  s    L $$QIN
 T.VX. h)N

 "VF6Ni(77>CL	 	 	
 
r%   c                    [        5       (       a  [        R                  " X5      $ Sn[        U40 [	        5       D6nUR                  U R                  5      nUR                  UU /U/S.SU0S9  U$ )a  
Apply pre-quant per channel scale on activations

Args:
    x (Tensor): Input tensor representing the activations, the data type can be float16 or bfloat16.
    scales(Tensor): Per-channel scale factors for pre-quantization. Data type should be compatible with x.

Returns:
    out (Tensor): The Tensor which is the pre-quant results, the data type is compatible with x.

Examples:
    .. code-block:: python

        >>> # doctest: +SKIP('No testing required')
        >>> import paddle
        >>> from paddle.nn.quant import apply_per_channel_scale

        >>> paddle.seed(2023)
        >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
        >>> scales = paddle.rand(shape=[32], dtype=paddle.float16)
        >>> out = apply_per_channel_scale(x, scales)
apply_per_channel_scale)r5   scalesr6   )r;   r<   r=   )r   r   rS   r
   r?   r@   rE   rA   )r5   rT   r;   rB   r6   s        r#   rS   rS   g  sx    0 --a88(T.VX.77@1CL 	 	

 
r%   )r   Nr   )
r5   r   r8   r   r"   
int | Noner9   r   returnztuple[Tensor, Tensor])r   float16r   )r5   r   r7   r   r8   r   rF   r   r9   r   rV   r   )NNr3   Nr   )r5   r   rL   r   rN   Tensor | NonerM   rX   rH   r   r"   rU   r9   r   rV   r   )NNg      @)r5   r   rL   r   rN   rX   rM   rX   rQ   r4   rV   r   )r5   r   rT   r   rV   r   )
__future__r   typingr   r   r   r   paddle.base.data_feederr   paddle.devicer   paddle.device.cudar	   paddle.frameworkr
   r   typing_extensionsr   r   paddle._typingr   r   __annotations__r   r$   r2   rD   rJ   rP   rS    r%   r#   <module>rc      s   # )   / 5
 +(:E9  $K0J	0* %	DD
D D 	D
 DT %$555 5 	5
 5 5v "&$eee e  	e
 e e e eV "&??? ?  	?
 ? ?D$r%   