
    }-j3                         d dl mZ ddlmZ erddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZ ddlmZ  e            r
d d	lZdd
lmZ  ej        e          Zd	Z G d de          Zd	S )    )TYPE_CHECKING   )HfQuantizer   )PreTrainedModel)Mxfp4Config)is_accelerate_availableis_kernels_availableis_torch_availableis_triton_availablelogging)get_module_from_nameN)WeightConverterc                        e Zd ZU dZdZded<    fdZd Zd Zdd	d
e	de
fdZddZ	 ddd	de
fdZd Zd Zd Zd Zede
fd            Zd Zd Z xZS )Mxfp4HfQuantizerz/
    FP4 quantization using fbgemm kernels
    Fr   quantization_configc                 J     t                      j        |fi | d | _        d S N)super__init__triton_kernels_hub)selfr   kwargs	__class__s      g/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_mxfp4.pyr   zMxfp4HfQuantizer.__init__2   s1    ,77777"&    c                     | j         5	 ddlm}  |d          | _         n# t          $ r t          d          w xY w| j         S )z3Lazy import and initialize kernels only when neededNr   )
get_kernelz(kernels-community/gpt-oss-triton-kernelsz2kernels package is required for MXFP4 quantization)r   integrations.hub_kernelsr   ImportError)r   r   s     r   _lazy_import_kernelsz%Mxfp4HfQuantizer._lazy_import_kernels6   sp    "*XAAAAAA*4*5_*`*`'' X X X!"VWWWX&&s     :c                    t                      st          d          | j        j        rd S t	                      st          d          t
          j                                        pt          j        d          }|j	        dvrF| j
        r,t                              d| d           d| j        _        d S t          d| d	          t
          j                                        r d}t!          d
          }t#                      }nt
          j                                        rBt
          j                                        }|dk    }t!          d          }t#                      }n1|j	        dk    r d}t!          d
          }t#                      }nd}d}d}| j
        r|s(t                              d           d| j        _        d S |s(t                              d           d| j        _        d S |s(t                              d           d| j        _        d S n3|st)          d          |st)          d          |st)          d          | j
        s|                                  |                    d          }|At/          |t0                    r.| j
        s)d|                                v rt)          d          d S d S d S d S )NzqUsing mxfp4 quantization requires torchPlease install the latest version of torch ( pip install --upgrade torch )z9Using mxfp4 requires Accelerate: `pip install accelerate`cpu)cudaxpur#   zGUsing MXFP4 quantized models requires model on cuda/xpu/cpu, but found zj, we will default to dequantizing the model to bf16. To use mxfp4, please disable the current accelerator.TzIQuantizing a model using MXFP4 requires model on cuda/xpu/cpu, but found z7. To use mxfp4, please disable the current accelerator.z3.5.0)      z3.4.0Fu   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series). We will default to dequantizing the model to bf16.zMXFP4 quantization requires Triton: CUDA requires Triton >= 3.4.0, XPU/CPU requires Triton >= 3.5.0. Please install triton: `pip install triton`. We will default to dequantizing the model to bf16.zMXFP4 quantization requires the `kernels` package: `pip install kernels>=0.12.0`. We will default to dequantizing the model to bf16.u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) or CPUzMXFP4 quantization requires Triton: CUDA requires Triton >= 3.4.0, XPU/CPU requires Triton >= 3.5.0. Please install triton: `pip install triton`zPMXFP4 quantization requires the `kernels` package: `pip install kernels>=0.12.0`
device_mapdiskzYou are attempting to load an FP4 model with a device_map that contains a disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the disk device from the device_map.)r   r    r   
dequantizer	   torchacceleratorcurrent_acceleratordevicetypepre_quantizedloggerwarning_onceRuntimeErrorr%   is_availabler   r
   r$   get_device_capability
ValueErrorr!   get
isinstancedictvalues)	r   argsr   r.   is_device_supported_mxfp4triton_availablekernels_installedcompute_capabilityr(   s	            r   validate_environmentz%Mxfp4HfQuantizer.validate_environmentA   s   !## 	]  
 #. 	F&(( 	[YZZZ"6688OEL<O<O;444! 	## Q^d  Q  Q  Q   7;(3" ``f  `  `  `   9!!## 	&(,%27;; 4 6 6Z$$&& 	&!&!A!A!C!C(:f(D%27;; 4 6 6[E!!(,%27;; 4 6 6(-%$ % &	q, ##I  
 7;(3# ##I  
 7;(3$ ##I  
 7;(3 + 	ql   " 	q`   # 	qoppp! 	(%%'''ZZ--
!jT&B&B!% &J4E4E4G4G*G*G g   "!!! *G*Gr   modelr   
param_namereturnc                 h    ddl m} t          ||          \  }}t          ||          r|dv rdS dS dS )Nr   Mxfp4GptOssExperts)down_proj_biasgate_up_proj_biasFT)integrationsrF   r   r8   )r   rA   rB   r   rF   moduletensor_names          r   param_needs_quantizationz)Mxfp4HfQuantizer.param_needs_quantization   sV    55555525*EEf011 	EEEu4ur   c                     t           j                                        r t           j                                         d S t           j                                        r t           j                                         d S d S r   )r+   r$   r4   empty_cacher%   )r   rA   r   s      r   #_process_model_after_weight_loadingz4Mxfp4HfQuantizer._process_model_after_weight_loading   sl    :""$$ 	$J""$$$$$Y##%% 	$I!!#####	$ 	$r   use_kernelsc                    ddl m} t          j                                        pt          j        d          }|r/|j        dvr&t                              d           d| j	        _
        |s/|j        dv r&t                              d           d| j	        _
        |                     || j	        j        |j                  | _         ||| j        | j	                  }d S )	Nr   )replace_with_mxfp4_linearr#   )r#   zYou are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=FalseTzMXFP4 inference on CPU requires use_kernels=True, but use_kernels is disabled. We will dequantize the model to bf16. To run MXFP4 natively on CPU, please set use_kernels=True.)modules_to_not_convertr   )rI   rR   r+   r,   r-   r.   r/   r1   r2   r   r*   get_modules_to_not_convertrS   _keep_in_fp32_modules)r   rA   rP   r   rR   r.   s         r   $_process_model_before_weight_loadingz5Mxfp4HfQuantizer._process_model_before_weight_loading   s    	=<<<<< "6688OEL<O<O 	76;g55e   37D$/ 	7v{g55s   37D$/&*&E&E4+BED_'
 '
# *)$*E[_[s
 
 
r   c                     d|j         j        v r0t          |dd           |j                            ddddd           |S )NGptOssConfigbase_model_tp_plangrouped_gemmz(layers.*.mlp.experts.gate_up_proj_blocksz(layers.*.mlp.experts.gate_up_proj_scalesz%layers.*.mlp.experts.down_proj_blocksz%layers.*.mlp.experts.down_proj_scales)r   __name__getattrrY   updater   configs     r   update_tp_planzMxfp4HfQuantizer.update_tp_plan   ^    V-666v3T::F)00DRDRAOAO	    r   c                     d|j         j        v r0t          |dd           |j                            ddddd           |S )NrX   base_model_ep_planrZ   r[   )r   r\   r]   rd   r^   r_   s     r   update_ep_planzMxfp4HfQuantizer.update_ep_plan   rb   r   c                 &   ddl m} |                                }t          |j        dd          }t          |j        dd          }|                                D ]0\  }}t          ||          r t          |d          rt          |d          s7d	D ]}t          ||          }	t          || d
          }
|	j        j	        
                    |	j        j                                      dd          }|dk    r|                    |ddd          }n|                    ||dd          }|
j        j        j	        
                    |
j        j        j                                      dd          }||| d| d<   ||| d| d<   2i }||fS )Nr   rE   num_local_experts    hidden_sizei@  gate_up_proj	down_proj)rj   rk   _precision_configZ      ._blocks_scales)rI   rF   
state_dictr]   r`   named_modulesr8   hasattrstoragelayoutunswizzle_datadata	transposereshapeweight_scale)r   rA   rF   rt   rg   ri   namerJ   projtriton_tensorprecision_configblocksscalesmetadatas                 r   get_state_dict_and_metadataz,Mxfp4HfQuantizer.get_state_dict_and_metadata   s   555555%%''
#EL2ErJJelM4@@!//11 	= 	=LD&6#566FN33 FK00
 5 = = ' 5 5#*6d3M3M3M#N#N &.5DD]EZE_``jjkmoqrr>))#^^,=r2rJJFF#^^,={BPRSSF)6>ETT$19> )B##  7=
d22T22236<
d22T22233=" 8##r   c                     dS )NT r   s    r   is_serializablez Mxfp4HfQuantizer.is_serializable  s    tr   c                 :    t                               d           dS )NzMXFP4 quantization don't support training, please consider dequantizing the model first by passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()F)r1   r2   r   s    r   is_trainablezMxfp4HfQuantizer.is_trainable  s'     x	
 	
 	
 ur   c                 $    ddl m}  ||           S )Nr   )Mxfp4Quantize)integrations.mxfp4r   )r   r   s     r   get_quantize_opsz!Mxfp4HfQuantizer.get_quantize_ops  s$    666666}T"""r   c                 *   ddl m}m} | j        rI| j        j        r=t          ddgd ||           g          t          ddgd	g ||           g          gS t          ddgd	 ||           g          t          ddgd ||           g          gS )
Nr   )Mxfp4DequantizeMxfp4Deserializedown_proj_blocksdown_proj_scalesz
down_proj$)source_patternstarget_patterns
operationsgate_up_proj_blocksgate_up_proj_scaleszgate_up_proj$)r   r   r   r0   r   r*   r   )r   r   r   s      r   get_weight_conversionsz'Mxfp4HfQuantizer.get_weight_conversions  s	   JJJJJJJJ 	$":"E 	%79K$L$1 / 5 56  
  %:<Q$R%4$5 / 5 56    !68M N 0,,T223  
 !35G H -,,T223  
 	
r   )rA   r   )F)r\   
__module____qualname____doc__requires_calibration__annotations__r   r!   r@   strboolrL   rO   rV   ra   re   r   r   propertyr   r   r   __classcell__)r   s   @r   r   r   *   sl          !&&&&' ' ' ' '	' 	' 	'^ ^ ^@.? S _c    $ $ $ $ "
 
 
 
 
 
 
B    !$ !$ !$F   d    X# # #

 
 
 
 
 
 
r   r   )typingr   baser   modeling_utilsr   utils.quantization_configr   utilsr	   r
   r   r   r   quantizers_utilsr   r+   core_model_loadingr   
get_loggerr\   r1   r   r   r   r   r   <module>r      s7   !                  8000000777777              3 2 2 2 2 2  5LLL444444		H	%	% Q
 Q
 Q
 Q
 Q
{ Q
 Q
 Q
 Q
 Q
r   