
    }-ja&                         d dl mZ ddlmZ erddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZ  e            rd d	lZ ej        e          Z G d
 de          Zd	S )    )TYPE_CHECKING   )HfQuantizer   )PreTrainedModel)FbgemmFp8Config)is_accelerate_availableis_fbgemm_gpu_availableis_kernels_availableis_torch_availableis_torch_cuda_availableis_torch_xpu_availablelogging)get_module_from_nameNc                        e Zd ZU dZdZded<    fdZd Zdd
Zddde	d	e
fdZddde	ddd	ef fdZ	 	 ddZd Zd Zd Zed	e
fd            Zd Z xZS )FbgemmFp8HfQuantizerz/
    FP8 quantization using fbgemm kernels
    Fr   quantization_configc                 <     t                      j        |fi | d S )N)super__init__)selfr   kwargs	__class__s      l/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.pyr   zFbgemmFp8HfQuantizer.__init__1   s)    ,7777777    c                    t                      st                      st          d          t                      rt                      st          d          t                      rt	                      st          d          t                      st          d          t                      r8t          j                                        }|\  }}|dk     rt          d          |
                    d          }|t                              d           d S t          |t                    rB| j        s=d	|                                v sd
|                                v rt          d          d S d S d S )Nz3Using fbgemm fp8 quantization requires a GPU or XPUz@Using FP8 fbgemm on XPU requires kernels (`pip install kernels`)zLoading an FP8 fbgemm quantized model on CUDA requires fbgemm-gpu libraryPlease install the latest version of fbgemm-gpu library by following : https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-librarieszWLoading an FP8 quantized model requires accelerate (`pip install --upgrade accelerate`)	   zXFP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)
device_mapzYou have loaded an FP8 model on CPU and have a CUDA/XPU device available, make sure to set your model on a GPU/XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu' or 'auto'. cpudiskzYou are attempting to load an FP8 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r   r   ImportErrorr   r
   r	   torchcudaget_device_capability
ValueErrorgetloggerwarning_once
isinstancedictpre_quantizedvalues)r   argsr   compute_capabilitymajor_r   s          r   validate_environmentz)FbgemmFp8HfQuantizer.validate_environment4   s   &(( 	U1G1I1I 	USTTT!## 	b,@,B,B 	b`aaa"$$ 	-D-F-F 	F   '(( 	i   #$$ 	!&!A!A!C!C)HE1qyy n   ZZ--
S     
D)) 	% 5J4E4E4G4G+G+G6U_UfUfUhUhKhKh n  	 	 KhKhr   dtypetorch.dtypereturnc                 z    |t           j        k    r*t                              d| d           t           j        }|S )NzSetting dtype to zP, but only bfloat16 is supported right now. Overwriting torch_dtype to bfloat16.)r"   bfloat16r'   r(   )r   r2   s     r   update_dtypez!FbgemmFp8HfQuantizer.update_dtypeX   sC    EN""{E{{{   NEr   modelr   
param_namec                     ddl m}m} t          ||          \  }}t	          ||          r| j        s|dk    rdS dS t	          ||          r| j        s|dk    rdS dS dS )Nr   FbgemmFp8LinearFbgemmFp8Llama4TextExpertsbiasFT)integrationsr<   r=   r   r)   r+   )r   r8   r9   r   r<   r=   moduletensor_names           r   param_needs_quantizationz-FbgemmFp8HfQuantizer.param_needs_quantization`   s    NNNNNNNN25*EEfo.. 	! [F%:%:utf899 	! [F%:%:utur   paramztorch.Tensorc                 z    |                      ||          rdS t                                          |||          S )z4Return the element size (in bytes) for `param_name`.r   )rB   r   param_element_size)r   r8   r9   rC   r   s       r   rE   z'FbgemmFp8HfQuantizer.param_element_sizeq   s<    ((
;; 	1ww))%UCCCr   c                     ddl m} |                     || j        j        |j                  | _         ||| j        | j        | j        |j                  }d S )Nr   )replace_with_fbgemm_fp8_linear)modules_to_not_convertr   r+   tp_plan)r?   rG   get_modules_to_not_convertr   rH   _keep_in_fp32_modulesr+   _tp_plan)r   r8   r   rG   s       r   $_process_model_before_weight_loadingz9FbgemmFp8HfQuantizer._process_model_before_weight_loadingx   sv    
 	BAAAAA&*&E&E4+BED_'
 '
# /.#'#> $ 8,N
 
 
r   c                     ddl m}m} |                                D ]H}t	          |||f          r4t          |d          r$|j                            | j        j	                   I|S )z
        Force update the input scale upper bound after weight loading and device dispatch are complete.
        This resolves issues where persistent buffers are zeroed out or overwritten during the loading process.
        r   r;   input_scale_ub)
integrations.fbgemm_fp8r<   r=   modulesr)   hasattrrO   fill_r   activation_scale_ub)r   r8   r   r<   r=   ms         r   #_process_model_after_weight_loadingz8FbgemmFp8HfQuantizer._process_model_after_weight_loading   s    
 	ZYYYYYYY 	Y 	YA!o/IJKK Y1.// Y$**4+C+WXXXr   c                    d|j         j        v rui ddddddddddddd	d
dddddddddddddddd
ddddddd
dddd}|                                ||                                _        n||_        |S |S )NLlama4z layers.*.self_attn.q_proj.weightcolwisez&layers.*.self_attn.q_proj.weight_scalez layers.*.self_attn.k_proj.weightz&layers.*.self_attn.k_proj.weight_scalez layers.*.self_attn.v_proj.weightz&layers.*.self_attn.v_proj.weight_scalez layers.*.self_attn.o_proj.weightrowwisezlayers.*.input_layernorm.weightsequence_parallelz(layers.*.post_attention_layernorm.weightznorm.weightz4layers.*.feed_forward.shared_expert.gate_proj.weightz:layers.*.feed_forward.shared_expert.gate_proj.weight_scalez2layers.*.feed_forward.shared_expert.up_proj.weightz8layers.*.feed_forward.shared_expert.up_proj.weight_scalez4layers.*.feed_forward.shared_expert.down_proj.weightz0layers.*.feed_forward.experts.*.gate_proj.weightz6layers.*.feed_forward.experts.*.gate_proj.weight_scalepacked_rowwise)z.layers.*.feed_forward.experts.*.up_proj.weightz4layers.*.feed_forward.experts.*.up_proj.weight_scalez0layers.*.feed_forward.experts.*.down_proj.weightz*layers.*.feed_forward.experts.gate_up_projz0layers.*.feed_forward.experts.gate_up_proj_scalez'layers.*.feed_forward.experts.down_proj)r   __name__get_text_configbase_model_tp_plan)r   config	text_plans      r   update_tp_planz#FbgemmFp8HfQuantizer.update_tp_plan   sW   v'000! 3I	!
 9)! 3I! 9)! 3I! 9)! 3I! 23F! ;<O! 2!$ G	%!& Mi'!( Ei)!* KI+!, G	-!. CI/!0 I)1!2 CLHQDM ?ODT;DA! ! !ID %%''3>G&&((;;,5)Mr   c                     dS )NT r   s    r   is_serializablez$FbgemmFp8HfQuantizer.is_serializable   s    tr   c                     dS )NFrd   re   s    r   is_trainablez!FbgemmFp8HfQuantizer.is_trainable   s    ur   c                 $    ddl m}  ||           S )Nr   )FbgemmFp8Quantize)rP   rj   )r   rj   s     r   get_quantize_opsz%FbgemmFp8HfQuantizer.get_quantize_ops   s%    ??????  &&&r   )r2   r3   r4   r3   )r8   r   )r]   
__module____qualname____doc__requires_calibration__annotations__r   r1   r7   strboolrB   floatrE   rM   rV   rb   rf   propertyrh   rk   __classcell__)r   s   @r   r   r   )   sn          !****8 8 8 8 8" " "H   .? S _c    "D(9 Ds DSa Dfk D D D D D D
 
 
 
 
&  * * *X   d    X' ' ' ' ' ' 'r   r   )typingr   baser   modeling_utilsr   utils.quantization_configr   utilsr	   r
   r   r   r   r   r   quantizers_utilsr   r"   
get_loggerr]   r'   r   rd   r   r   <module>r}      s3   !                  <000000;;;;;;                  3 2 2 2 2 2  LLL		H	%	%f' f' f' f' f'; f' f' f' f' f'r   