
    {-j8                       U d dl mZ d dlmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ er,d d	lmZ d d
lmZ d dlmZ ed         Zded<   ed         Zded<   d Z	 	 	 d/d0dZ	 	 	 d1d2d!Z	 	 	 	 	 d3d4d(Z	 	 	 d5d6d,Zd7d.ZdS )8    )annotations)TYPE_CHECKINGLiteralN)_C_ops)check_dtype)is_compiled_with_cuda)get_device_capability)LayerHelperin_dynamic_or_pir_mode)	TypeAlias)Tensor)	DTypeLike)weight_only_int8weight_only_int4zllm.int8r   _Algo)@      
_GroupSizec                    t                      rpt          j                                        } | | dk    st          j                    r(t                      \  }}t          |dz  |z             }|S t          d          dS )NFalse
   zmPaddle is not compiled with CUDA, we cannot get SMVersion from device, please try to compile Paddle with CUDAr   )r   paddleversioncudais_compiled_with_rocmr	   int
ValueError)cuda_versionmajorminorarchs       `/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/nn/quant/quantized_linear.py_get_arch_infor$   +   s     ~**,,$)@)@)++ *A022LE5urzE)**DK  
 q    r   r   xr   algor"   
int | None
group_sizereturntuple[Tensor, Tensor]c           	        |t                      }t                      r>|dk    s8|dk    s2|dk    s,|dk    s&|dk    s |dk    s|dk    s|d	k    sJ d
| d            |dk    s|dk    s|dk    sJ d| d            t                      rt          j        | |||          S d}t          |fi t                      }|                    d          }|                    d          }|                    |d| i||d|||d           ||fS )a~  
    Quantization function for weight_only and llm.int8's weight.

    Args:
        x (Tensor): The input Tensor to be quantized, the data type is float16 or bfloat16.
        algo (str): The algo that is x will be apply, must be one of 'weight_only_int8',
            'weight_only_int4', 'llm.int8', 'w4a8' and 'w4afp8, default: 'weight_only_int8'.
        arch (int): The compute arch for target device. For example, A100 is 80, v100 is 70, if you do not assign arch, we will get arch from your device, default: None.
        group_size (int): The group size for weight quantization. -1 stands for default per-channel mode. Currently only support 64 or 128.

    Returns:
        out (Tensor): The Tensor which is the quantitative results, the data type is int8, the shape is transposition of x.
        scale (Tensor): The scale Tensor which is the scale of pre-channel, the data type is float32.
    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_quantize

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
            >>> print(out.shape)
            paddle.Size([32, 64])
            >>> print(scale.shape)
            paddle.Size([32])
    NF   K   P   V   Y   Z   \   d   KCurrently weight_quantize only support SM70/75/80/86/89/90/92/100. but got  r   r   r   5Currently group_size only support -1/64/128. but got weight_quantizeint8floatr&   )outscale)r'   r"   r)   typeinputsoutputsattrs)	r$   r   r   r   r8   r
   locals"create_variable_for_type_inference	append_op)r&   r'   r"   r)   r>   helperr;   r<   s           r#   r8   r8   >   s|   D | 
BJJrzzrzzrzzrzzrzzrzzs{{{aZ^aaa { zR//:3D3D3DM
MMM 4E3DD  %atZ@@@ T..VXX..77??99'BB8%00ZHH	 	 	
 	
 	
 U|r%   float16r<   	out_dtyper   c                @   |dk    s|dk    s|dk    sJ d| d            t                      rt          j        | |||          S d}t          |fi t	                      }|j        }|                    |          }|                    || |dd|i||d	
           |S )a  
    Dequantization function for weight_only and llm.int8's weight.

    Args:
        x (Tensor): The input Tensor to be dequantized, the data type is int8.
        scale (Tensor): The scale Tensor which is the output of weight_quantize, the data type is float32.
        algo (str): The algo that is x will be apply, must be one of 'weight_only_int8',
            'weight_only_int4' and 'llm.int8', default: 'weight_only_int8'.
        out_dtype (str|np.dtype): [Deprecated][Not used] The output Tensor's data type, must be one of 'float16' and 'bfloat16', default: 'float16'.

    Returns:
        out (Tensor): The Tensor which is the dequantitative results, the data type is float16 or bfloat16, the shape is transposition of x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_quantize, weight_dequantize

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
            >>> x_dequant = weight_dequantize(out, scale)
    r   r   r   r7   r6   weight_dequantize)r&   r<   r;   )r'   r)   r=   )r   r   rI   r
   rB   dtyperC   rD   )r&   r<   r'   rG   r)   r>   rE   r;   s           r#   rI   rI      s    @ zR//:3D3D3DM
MMM 4E3DD  '5$
CCC"T..VXX..K	77	BBU++CL( 	 	 	
 	
 	
 
r%   r9   weightbiasTensor | Noneweight_scaleweight_dtypec           	     L   |t                      }t                      r>|dk    s8|dk    s2|dk    s,|dk    s&|dk    s |dk    s|dk    s|d	k    sJ d
| d            |dk    s|dk    s|dk    sJ d| d            t                      rt          j        | ||||||          }|S t          |dddgd           d}t          |fi t                      }	| j        }
| g|g|gd}||g|d<   |||d}|		                    |
          }|	
                    ||d|i|           |S )a  
    Applies matrix multiplication of two tensors and then bias addition if provided.
    This method requires CUDA version >= 11.2.

    Args:
        x (Tensor): The first input Tensor to be multiplied, the data type is float16 or bfloat16.
        weight (Tensor): The second input Tensor to be multiplied. Its rank must be 2.
        bias (Tensor|None): The input bias Tensor. If it is None, no bias addition would
            be performed. Otherwise, The bias is added to the matrix multiplication result.
        weight_scale (Tensor|None): The input scale Tensor Provided to weight for dequantization. Its rank must be 1.
        weight_dtype(str): The dtype of  weight Tensor, must be one of 'int8', 'int4', Defaulted to 'int8'.
        arch (int): The compute arch for target device. For example, A100 is 80, v100 is 70, if you do not assign arch, we will get arch from your device, default: None.
        group_size (int): The group size for weight quantization. -1 stands for default per-channel mode. Currently only support 64 or 128.
    Returns:
        Tensor: the output Tensor, the data type is the same as that of x.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_only_linear

            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
            >>> scale = paddle.randn([32], dtype='float32')
            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
            ...     out = weight_only_linear(
            ...         x,
            ...         weight,
            ...         bias=bias,
            ...         weight_scale=scale,
            ...         weight_dtype='int8',
            ...     )
            ...     print(out.shape)
            paddle.Size([1, 2, 32])
    Nr-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r   r   r   zLCurrently weight_quantize only support group size of -1, 64 or 128. but got rO   r9   int4weight_only_linearr&   rK   rN   rL   )rO   r"   r)   r;   r=   )r$   r   r   r   rR   r   r
   rB   rJ   rC   rD   )r&   rK   rL   rN   rO   r"   r)   r;   r>   rE   rJ   r?   rA   s                r#   rR   rR      s   ^ | 
BJJrzzrzzrzzrzzrzzrzzs{{{aZ^aaa { zR//:3D3D3DdWaddd 4E3DD  "'vt\<z
 
 
.66*:<P	
 	
 	
 $T..VXX.. h)N
 

 "VF6N($
 
 77>>CL	 	 	
 	
 	
 
r%         @	thresholdr:   c                $   t                      rt          j        | ||||          }|S d}t          |fi t	                      }| j        }| g|g|gd}	|r|g|	d<   d|i}
|                    |          }|                    ||	d|i|
           |S )a#  
    Applies matrix multiplication of two tensors and then bias addition if provided.
    This method requires CUDA version >= 11.2.

    Args:
        x (Tensor): the first input Tensor to be multiplied, the data type is float16 or bfloat16.
        weight (Tensor): the second input Tensor to be multiplied. Its rank must be 2.
        bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
            be performed. Otherwise, the bias is added to the matrix multiplication result.
        weight_scale (Tensor|None): the input scale Tensor Provided to weight for dequantization. Its rank must be 1.
        threshold(float): The min value of outlier in activation, outlier's channel will be apply multiply with x.dtype.

    Returns:
        Tensor: the output Tensor, the data type is the same as that of x.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import llm_int8_linear

            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
            >>> scale = paddle.randn([32], dtype='float32')
            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
            ...     out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
            ...     print(out.shape)
            paddle.Size([1, 2, 32])
    llm_int8_linearrS   rL   rU   r;   r=   )r   r   rW   r
   rB   rJ   rC   rD   )r&   rK   rL   rN   rU   r;   r>   rE   rJ   r?   rA   s              r#   rW   rW   %  s    L  $QlINN
 T..VXX.. h)N
 

  	$"VF6Ni(77>>CL	 	 	
 	
 	
 
r%   scalesc                    t                      rt          j        | |          S d}t          |fi t	                      }|                    | j                  }|                    || g|gdd|i           |S )aB  
    Apply pre-quant per channel scale on activations

    Args:
        x (Tensor): Input tensor representing the activations, the data type can be float16 or bfloat16.
        scales(Tensor): Per-channel scale factors for pre-quantization. Data type should be compatible with x.

    Returns:
        out (Tensor): The Tensor which is the pre-quant results, the data type is compatible with x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import apply_per_channel_scale

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> scales = paddle.rand(shape=[32], dtype=paddle.float16)
            >>> out = apply_per_channel_scale(x, scales)
    apply_per_channel_scale)r&   rX   r;   )r>   r?   r@   )r   r   rZ   r
   rB   rC   rJ   rD   )r&   rX   r>   rE   r;   s        r#   rZ   rZ   g  s    0  -a888(T..VXX..77@@11CL 	 	
 	
 	

 
r%   )r   Nr   )
r&   r   r'   r   r"   r(   r)   r   r*   r+   )r   rF   r   )r&   r   r<   r   r'   r   rG   r   r)   r   r*   r   )NNr9   Nr   )r&   r   rK   r   rL   rM   rN   rM   rO   r   r"   r(   r)   r   r*   r   )NNrT   )r&   r   rK   r   rL   rM   rN   rM   rU   r:   r*   r   )r&   r   rX   r   r*   r   )
__future__r   typingr   r   r   r   paddle.base.data_feederr   paddle.devicer   paddle.device.cudar	   paddle.frameworkr
   r   typing_extensionsr   r   paddle._typingr   r   __annotations__r   r$   r8   rI   rR   rW   rZ    r%   r#   <module>re      s   # " " " " " " ) ) ) ) ) ) ) )        / / / / / /      5 4 4 4 4 4       
  	1++++++((((((:E     $K0J0000  * %	D D D D DT %$5 5 5 5 5v "&$e e e e eV "&? ? ? ? ?D$ $ $ $ $ $r%   