§
    {-j8  ã                  ó  — U d dl mZ d dlmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ er,d d	lmZ d d
lmZ d dlmZ ed         Zded<   ed         Zded<   d„ Z	 	 	 d/d0d„Z	 	 	 d1d2d!„Z	 	 	 	 	 d3d4d(„Z	 	 	 d5d6d,„Zd7d.„ZdS )8é    )Úannotations)ÚTYPE_CHECKINGÚLiteralN)Ú_C_ops)Úcheck_dtype)Úis_compiled_with_cuda)Úget_device_capability)ÚLayerHelperÚin_dynamic_or_pir_mode)Ú	TypeAlias)ÚTensor)Ú	DTypeLike)Úweight_only_int8Úweight_only_int4zllm.int8r   Ú_Algo)éÿÿÿÿé@   é€   Ú
_GroupSizec                 ó  — t          ¦   «         rpt          j                             ¦   «         } | | dk    st          j        ¦   «         r(t          ¦   «         \  }}t          |dz  |z   ¦  «        }|S t          d¦  «        ‚dS )NÚFalseé
   zmPaddle is not compiled with CUDA, we cannot get SMVersion from device, please try to compile Paddle with CUDAr   )r   ÚpaddleÚversionÚcudaÚis_compiled_with_rocmr	   ÚintÚ
ValueError)Úcuda_versionÚmajorÚminorÚarchs       ú`/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/nn/quant/quantized_linear.pyÚ_get_arch_infor$   +   s‹   € åÑÔð Ý”~×*Ò*Ñ,Ô,ˆàÐ$¨¸Ò)@Ð)@ÝÔ)Ñ+Ô+ð *Aå0Ñ2Ô2‰LˆE5Ýu˜r‘z EÑ)Ñ*Ô*ˆDØˆKåØñô ð ð
 ˆqó    r   r   Úxr   Úalgor"   ú
int | NoneÚ
group_sizeÚreturnútuple[Tensor, Tensor]c           	     ó  — |€t          ¦   «         }t          ¦   «         r>|dk    s8|dk    s2|dk    s,|dk    s&|dk    s |dk    s|dk    s|d	k    sJ d
|› d¦   «         ‚|dk    s|dk    s|dk    sJ d|› d¦   «         ‚t          ¦   «         rt          j        | |||¦  «        S d}t          |fi t          ¦   «         ¤Ž}|                     d¦  «        }|                     d¦  «        }|                     |d| i||dœ|||dœ¬¦  «         ||fS )a~  
    Quantization function for weight_only and llm.int8's weight.

    Args:
        x (Tensor): The input Tensor to be quantized, the data type is float16 or bfloat16.
        algo (str): The algo that is x will be apply, must be one of 'weight_only_int8',
            'weight_only_int4', 'llm.int8', 'w4a8' and 'w4afp8, default: 'weight_only_int8'.
        arch (int): The compute arch for target device. For example, A100 is 80, v100 is 70, if you do not assign arch, we will get arch from your device, default: None.
        group_size (int): The group size for weight quantization. -1 stands for default per-channel mode. Currently only support 64 or 128.

    Returns:
        out (Tensor): The Tensor which is the quantitative results, the data type is int8, the shape is transposition of x.
        scale (Tensor): The scale Tensor which is the scale of pre-channel, the data type is float32.
    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_quantize

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
            >>> print(out.shape)
            paddle.Size([32, 64])
            >>> print(scale.shape)
            paddle.Size([32])
    NéF   éK   éP   éV   éY   éZ   é\   éd   úKCurrently weight_quantize only support SM70/75/80/86/89/90/92/100. but got Ú r   r   r   ú5Currently group_size only support -1/64/128. but got Úweight_quantizeÚint8Úfloatr&   )ÚoutÚscale)r'   r"   r)   ©ÚtypeÚinputsÚoutputsÚattrs)	r$   r   r   r   r8   r
   ÚlocalsÚ"create_variable_for_type_inferenceÚ	append_op)r&   r'   r"   r)   r>   Úhelperr;   r<   s           r#   r8   r8   >   s|  € ðD €|ÝÑÔˆåÑÔð 
àBŠJˆJØrŠzˆzØrŠzˆzØrŠzˆzØrŠzˆzØrŠzˆzØrŠzˆzØsŠ{ˆ{ˆ{àaÐZ^ÐaÐaÐañ Œ{ðð ˜ÒÐ˜z¨RÒ/Ð/°:ÀÒ3DÐ3DÐ3DØMÀ
ÐMÐMÐMñ 4EÔ3DÐDõ ÑÔð ÝÔ% a¨¨t°ZÑ@Ô@Ð@à ˆÝ˜TÐ.Ð.¥V¡X¤XÐ.Ð.ˆØ×7Ò7¸Ñ?Ô?ˆØ×9Ò9¸'ÑBÔBˆà×ÒØØ˜8Ø¨%Ð0Ð0Ø¨¸ZÐHÐHð	 	ñ 	
ô 	
ð 	
ð Uˆ|Ðr%   Úfloat16r<   Ú	out_dtyper   c                ó@  — |dk    s|dk    s|dk    sJ d|› d¦   «         ‚t          ¦   «         rt          j        | |||¦  «        S d}t          |fi t	          ¦   «         ¤Ž}|j        }|                     |¦  «        }|                     || |dœd|i||d	œ¬
¦  «         |S )a¢  
    Dequantization function for weight_only and llm.int8's weight.

    Args:
        x (Tensor): The input Tensor to be dequantized, the data type is int8.
        scale (Tensor): The scale Tensor which is the output of weight_quantize, the data type is float32.
        algo (str): The algo that is x will be apply, must be one of 'weight_only_int8',
            'weight_only_int4' and 'llm.int8', default: 'weight_only_int8'.
        out_dtype (str|np.dtype): [Deprecated][Not used] The output Tensor's data type, must be one of 'float16' and 'bfloat16', default: 'float16'.

    Returns:
        out (Tensor): The Tensor which is the dequantitative results, the data type is float16 or bfloat16, the shape is transposition of x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_quantize, weight_dequantize

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
            >>> x_dequant = weight_dequantize(out, scale)
    r   r   r   r7   r6   Úweight_dequantize)r&   r<   r;   )r'   r)   r=   )r   r   rI   r
   rB   ÚdtyperC   rD   )r&   r<   r'   rG   r)   r>   rE   r;   s           r#   rI   rI   …   sé   € ð@ ˜ÒÐ˜z¨RÒ/Ð/°:ÀÒ3DÐ3DÐ3DØMÀ
ÐMÐMÐMñ 4EÔ3DÐDõ ÑÔð ÝÔ'¨¨5°$¸
ÑCÔCÐCà"ˆÝ˜TÐ.Ð.¥V¡X¤XÐ.Ð.ˆØ”Kˆ	Ø×7Ò7¸	ÑBÔBˆà×ÒØØ UÐ+Ð+Ø˜CLàØ(ðð ð	 	ñ 	
ô 	
ð 	
ð ˆ
r%   r9   ÚweightÚbiasúTensor | NoneÚweight_scaleÚweight_dtypec           	     óL  — |€t          ¦   «         }t          ¦   «         r>|dk    s8|dk    s2|dk    s,|dk    s&|dk    s |dk    s|dk    s|d	k    sJ d
|› d¦   «         ‚|dk    s|dk    s|dk    sJ d|› d¦   «         ‚t          ¦   «         rt          j        | ||||||¦  «        }|S t          |dddgd¦  «         d}t          |fi t          ¦   «         ¤Ž}	| j        }
| g|g|gdœ}||g|d<   |||dœ}|	 	                    |
¦  «        }|	 
                    ||d|i|¬¦  «         |S )aæ  
    Applies matrix multiplication of two tensors and then bias addition if provided.
    This method requires CUDA version >= 11.2.

    Args:
        x (Tensor): The first input Tensor to be multiplied, the data type is float16 or bfloat16.
        weight (Tensor): The second input Tensor to be multiplied. Its rank must be 2.
        bias (Tensor|None): The input bias Tensor. If it is None, no bias addition would
            be performed. Otherwise, The bias is added to the matrix multiplication result.
        weight_scale (Tensor|None): The input scale Tensor Provided to weight for dequantization. Its rank must be 1.
        weight_dtype(str): The dtype of  weight Tensor, must be one of 'int8', 'int4', Defaulted to 'int8'.
        arch (int): The compute arch for target device. For example, A100 is 80, v100 is 70, if you do not assign arch, we will get arch from your device, default: None.
        group_size (int): The group size for weight quantization. -1 stands for default per-channel mode. Currently only support 64 or 128.
    Returns:
        Tensor: the output Tensor, the data type is the same as that of x.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_only_linear

            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
            >>> scale = paddle.randn([32], dtype='float32')
            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
            ...     out = weight_only_linear(
            ...         x,
            ...         weight,
            ...         bias=bias,
            ...         weight_scale=scale,
            ...         weight_dtype='int8',
            ...     )
            ...     print(out.shape)
            paddle.Size([1, 2, 32])
    Nr-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r   r   r   zLCurrently weight_quantize only support group size of -1, 64 or 128. but got rO   r9   Úint4Úweight_only_linear©r&   rK   rN   rL   )rO   r"   r)   r;   r=   )r$   r   r   r   rR   r   r
   rB   rJ   rC   rD   )r&   rK   rL   rN   rO   r"   r)   r;   r>   rE   rJ   r?   rA   s                r#   rR   rR   ½   sÕ  € ð^ €|ÝÑÔˆåÑÔð 
àBŠJˆJØrŠzˆzØrŠzˆzØrŠzˆzØrŠzˆzØrŠzˆzØrŠzˆzØsŠ{ˆ{ˆ{àaÐZ^ÐaÐaÐañ Œ{ðð ˜ÒÐ˜z¨RÒ/Ð/°:ÀÒ3DÐ3DÐ3DØdÐWaÐdÐdÐdñ 4EÔ3DÐDõ ÑÔð "ÝÔ'Øˆvt˜\¨<¸¸zñ
ô 
ˆð ˆ
åØ˜.¨6°6Ð*:Ð<Pñ	
ô 	
ð 	
ð $ˆÝ˜TÐ.Ð.¥V¡X¤XÐ.Ð.ˆØ”ˆð ØhØ)˜Nð
ð 
ˆð
 ÐØ"˜VˆF6‰Nà(ØØ$ð
ð 
ˆð ×7Ò7¸Ñ>Ô>ˆà×ÒØØØ˜CLØð	 	ñ 	
ô 	
ð 	
ð ˆ
r%   ç      @Ú	thresholdr:   c                ó$  — t          ¦   «         rt          j        | ||||¦  «        }|S d}t          |fi t	          ¦   «         ¤Ž}| j        }| g|g|gdœ}	|r|g|	d<   d|i}
|                     |¦  «        }|                     ||	d|i|
¬¦  «         |S )a#  
    Applies matrix multiplication of two tensors and then bias addition if provided.
    This method requires CUDA version >= 11.2.

    Args:
        x (Tensor): the first input Tensor to be multiplied, the data type is float16 or bfloat16.
        weight (Tensor): the second input Tensor to be multiplied. Its rank must be 2.
        bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
            be performed. Otherwise, the bias is added to the matrix multiplication result.
        weight_scale (Tensor|None): the input scale Tensor Provided to weight for dequantization. Its rank must be 1.
        threshold(float): The min value of outlier in activation, outlier's channel will be apply multiply with x.dtype.

    Returns:
        Tensor: the output Tensor, the data type is the same as that of x.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import llm_int8_linear

            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
            >>> scale = paddle.randn([32], dtype='float32')
            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
            ...     out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
            ...     print(out.shape)
            paddle.Size([1, 2, 32])
    Úllm_int8_linearrS   rL   rU   r;   r=   )r   r   rW   r
   rB   rJ   rC   rD   )r&   rK   rL   rN   rU   r;   r>   rE   rJ   r?   rA   s              r#   rW   rW   %  sØ   € õL ÑÔð ÝÔ$ Q¨°°lÀIÑNÔNˆØˆ
à ˆÝ˜TÐ.Ð.¥V¡X¤XÐ.Ð.ˆØ”ˆð ØhØ)˜Nð
ð 
ˆð
 ð 	$Ø"˜VˆF6‰NØ˜iÐ(ˆà×7Ò7¸Ñ>Ô>ˆà×ÒØØØ˜CLØð	 	ñ 	
ô 	
ð 	
ð ˆ
r%   Úscalesc                óô   — t          ¦   «         rt          j        | |¦  «        S d}t          |fi t	          ¦   «         ¤Ž}|                     | j        ¦  «        }|                     || g|gdœd|i¬¦  «         |S )aB  
    Apply pre-quant per channel scale on activations

    Args:
        x (Tensor): Input tensor representing the activations, the data type can be float16 or bfloat16.
        scales(Tensor): Per-channel scale factors for pre-quantization. Data type should be compatible with x.

    Returns:
        out (Tensor): The Tensor which is the pre-quant results, the data type is compatible with x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import apply_per_channel_scale

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> scales = paddle.rand(shape=[32], dtype=paddle.float16)
            >>> out = apply_per_channel_scale(x, scales)
    Úapply_per_channel_scale)r&   rX   r;   )r>   r?   r@   )r   r   rZ   r
   rB   rC   rJ   rD   )r&   rX   r>   rE   r;   s        r#   rZ   rZ   g  s•   € õ0 ÑÔð ÝÔ-¨a°Ñ8Ô8Ð8à(ˆÝ˜TÐ.Ð.¥V¡X¤XÐ.Ð.ˆØ×7Ò7¸¼Ñ@Ô@ˆà×ÒØØ˜¨¨Ð1Ð1Ø˜CLð 	ñ 	
ô 	
ð 	
ð
 ˆ
r%   )r   Nr   )
r&   r   r'   r   r"   r(   r)   r   r*   r+   )r   rF   r   )r&   r   r<   r   r'   r   rG   r   r)   r   r*   r   )NNr9   Nr   )r&   r   rK   r   rL   rM   rN   rM   rO   r   r"   r(   r)   r   r*   r   )NNrT   )r&   r   rK   r   rL   rM   rN   rM   rU   r:   r*   r   )r&   r   rX   r   r*   r   )Ú
__future__r   Útypingr   r   r   r   Úpaddle.base.data_feederr   Úpaddle.devicer   Úpaddle.device.cudar	   Úpaddle.frameworkr
   r   Útyping_extensionsr   r   Úpaddle._typingr   r   Ú__annotations__r   r$   r8   rI   rR   rW   rZ   © r%   r#   ú<module>re      s  ðð #Ð "Ð "Ð "Ð "Ð "Ð "à )Ð )Ð )Ð )Ð )Ð )Ð )Ð )à €€€Ø Ð Ð Ð Ð Ð Ø /Ð /Ð /Ð /Ð /Ð /ðð ð ð ð ð ð 5Ð 4Ð 4Ð 4Ð 4Ð 4ðð ð ð ð ð ð ð ð
 ð 	1Ø+Ð+Ð+Ð+Ð+Ð+àÐÐÐÐÐØ(Ð(Ð(Ð(Ð(Ð(àØ:ô€Eð ð ð ñ ð $ KÔ0€JÐ0Ð0Ð0Ñ0ðð ð ð* %ØØð	Dð Dð Dð Dð DðT %Ø$Øð5ð 5ð 5ð 5ð 5ðv Ø"&Ø$ØØðeð eð eð eð eðV Ø"&Øð?ð ?ð ?ð ?ð ?ðD$ð $ð $ð $ð $ð $r%   