
    }-jf)                         d dl mZ ddlmZmZmZmZ ddlmZ ddl	m
Z
  e            rd dlZerddlmZ dd	lmZ  ej        e          Z G d
 de          ZdS )    )TYPE_CHECKING   )is_accelerate_availableis_torch_availableis_torch_xpu_availablelogging   )HfQuantizer)get_module_from_nameN)PreTrainedModel)FineGrainedFP8Configc                        e Zd ZU dZdZded<    fdZd Zddd	ed
e	fdZ
ddd	eddd
ef fdZ	 	 ddZd Zd Zed
e	fd            Zed
e	fd            Zd Zd Zd Z xZS )FineGrainedFP8HfQuantizerz
    FP8 quantization implementation supporting both standard and MoE models.
    Supports both e4m3fn formats based on platform.
    Fr   quantization_configc                 <     t                      j        |fi | d S )N)super__init__)selfr   kwargs	__class__s      q/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_finegrained_fp8.pyr   z"FineGrainedFP8HfQuantizer.__init__   s)    ,7777777    c                 r   t                      st          d          | j        j        rd S t          j                                        sLt                      s>| j        r(t          
                    d           d| j        _        d S t          d          t          j                                        rdt          j                                        }|\  }}|dk     s|dk    r5|dk     r/t          
                    d| d| d	           d| j        _        d S |                    d
          }|t          
                    d           d S t          |t                    rU| j        s)t!          |          dk    rd|                                v sd|                                v rt%          d          d S d S )NzMLoading an FP8 quantized model requires accelerate (`pip install accelerate`)zUsing FP8 quantized models requires a GPU or XPU, we will default to dequantizing the model to bf16 since no GPU or XPU is availableTzANo GPU or XPU found. A GPU or XPU is needed for FP8 quantization.   	   ziFP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `.z`. We will default to dequantizing the model to bf16. Feel free to use a different quantization method like bitsandbytes or torchao
device_mapzYou have loaded an FP8 model on CPU and have a CUDA or XPU device available, make sure to set your model on a GPU or XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu'. r	   cpudiskzYou are attempting to load an FP8 model with a device_map that contains a cpu/disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the cpu/disk device from the device_map.)r   ImportErrorr   
dequantizetorchcudais_availabler   pre_quantizedloggerwarning_onceRuntimeErrorget_device_capabilityget
isinstancedictlenvalues
ValueError)r   argsr   compute_capabilitymajorminorr   s          r   validate_environmentz.FineGrainedFP8HfQuantizer.validate_environment   s
   &(( 	omnnn#. 	Fz&&(( 	h1G1I1I 	h! h## [   7;(3"#fggg:""$$ 
	!&!A!A!C!C-LE5		uzzeaii##[#([ [+0[ [ [  
 7;(3ZZ--
6    
 
D)) 	&

OOa''Z..0000Z..0000 k  	 	
 10r   modelr   
param_namereturnc                     ddl m}m} t          ||          \  }}t	          |||f          r| j        s|dk    rdS dS dS )Nr   )
FP8Experts	FP8LinearbiasFT)integrations.finegrained_fp8r9   r:   r   r+   r%   )r   r5   r6   r   r9   r:   moduletensor_names           r   param_needs_quantizationz2FineGrainedFP8HfQuantizer.param_needs_quantizationO   si    HHHHHHHH25*EEfy*566 	! [F%:%:utur   paramztorch.Tensorc                 z    |                      ||          rdS t                                          |||          S )z4Return the element size (in bytes) for `param_name`.r	   )r?   r   param_element_size)r   r5   r6   r@   r   s       r   rB   z,FineGrainedFP8HfQuantizer.param_element_sizeZ   s<    ((
;; 	1ww))%UCCCr   c                     ddl m} |                     || j        j        |j                  | _         ||| j        | j        | j                  }d S )Nr   )replace_with_fp8_linear)modules_to_not_convertr   r%   )r<   rD   get_modules_to_not_convertr   rE   _keep_in_fp32_modulesr%   )r   r5   r   rD   s       r   $_process_model_before_weight_loadingz>FineGrainedFP8HfQuantizer._process_model_before_weight_loadinga   sq    
 	KJJJJJ&*&E&E4+BED_'
 '
# ('#'#> $ 8,	
 
 
r   c                 R    d|j         j        v rddddddddddddddd}||_        |S )NQwen3colwiserowwise)z layers.*.self_attn.q_proj.weightz*layers.*.self_attn.q_proj.weight_scale_invz layers.*.self_attn.k_proj.weightz*layers.*.self_attn.k_proj.weight_scale_invz layers.*.self_attn.v_proj.weightz*layers.*.self_attn.v_proj.weight_scale_invz layers.*.self_attn.o_proj.weightz*layers.*.self_attn.o_proj.weight_scale_invzlayers.*.mlp.gate_proj.weightz'layers.*.mlp.gate_proj.weight_scale_invzlayers.*.mlp.up_proj.weightz%layers.*.mlp.up_proj.weight_scale_invzlayers.*.mlp.down_proj.weightz'layers.*.mlp.down_proj.weight_scale_inv)r   __name__base_model_tp_plan)r   config	text_plans      r   update_tp_planz(FineGrainedFP8HfQuantizer.update_tp_plans   sX    f&///4=>G4=>G4=>G4=>G1:;D/89B1:;D I" )2F%r   c                     dS NT r   s    r   is_serializablez)FineGrainedFP8HfQuantizer.is_serializable   s    tr   c                     dS )NFrT   rU   s    r   is_trainablez&FineGrainedFP8HfQuantizer.is_trainable   s    ur   c                     dS rS   rT   rU   s    r   is_compileablez(FineGrainedFP8HfQuantizer.is_compileable   s    tr   c                 $    ddl m}  ||           S )Nr   )Fp8Quantize)r<   r\   )r   r\   s     r   get_quantize_opsz*FineGrainedFP8HfQuantizer.get_quantize_ops   s$    >>>>>>{4   r   c                 z    ddl m} ddlm} | j        r'| j        j        r |g dd ||           g          gS g S )Nr   )WeightConverterFp8Dequantize)zweight$weight_scale_invactivation_scaleweightsource_patternstarget_patterns
operations)core_model_loadingr_   r<   ra   r%   r   r!   )r   r_   ra   s      r   get_weight_conversionsz0FineGrainedFP8HfQuantizer.get_weight_conversions   s    888888@@@@@@ 		$":"E 		  $W$W$W$, -d 3 34    	r   c                 x   | j         r| j        j        s||                                 z   S ddlm}m} ddlm}  |dd          }|gt          |          z   }g }|D ]}t          ||          s|                    |           (d |j        D             }|red |D             }	d	 |D             }
d
 |j        D             }|	|
z   |z   } ||           gt          |j                  z   } |||j        |          }|                    |           |                    |                                            |S )u  When loading with ``dequantize=True``, attach an :class:`Fp8Dequantize` op to
        every existing :class:`WeightConverter` so that per-block scales are folded into
        the weight *before* any later merge/concat ops collapse the per-expert structure.

        For each model-supplied converter that has a ``.weight`` source, we:
          1. anchor the existing weight patterns with ``$`` so they don't accidentally
             also match the ``.weight_scale_inv`` keys (the regex is searched, so the
             unanchored prefix would match both, sending scales to the wrong bucket);
          2. add anchored ``*.weight_scale_inv`` sources next to each weight pattern so
             the loader collects scale tensors alongside the weight tensors into the
             *same* converter bucket (both keys rewrite to the same target);
          3. prepend a fresh :class:`Fp8Dequantize` op so dequant runs first, before
             any merge/concat collapses the per-expert structure.

        The generic ``weight$ + weight_scale_inv → weight`` converter from
        :meth:`get_weight_conversions` is still appended at the end as a fallback for
        plain ``nn.Linear`` weights with no model-specific converter.
        r   )r_   WeightRenamingr`   z^(.+)\.scale$z\1.weight_scale_inv)rf   rg   c                 <    g | ]}|                     d           |S .weightendswith.0ps     r   
<listcomp>zGFineGrainedFP8HfQuantizer.update_weight_conversions.<locals>.<listcomp>   s)    WWWAIAVAVWaWWWr   c                     g | ]}|d z   S )$rT   rr   s     r   ru   zGFineGrainedFP8HfQuantizer.update_weight_conversions.<locals>.<listcomp>   s    "C"C"Cq1s7"C"C"Cr   c                 D    g | ]}|d t          d                    dz   S )Nro   z.weight_scale_inv$)r-   rr   s     r   ru   zGFineGrainedFP8HfQuantizer.update_weight_conversions.<locals>.<listcomp>   s2     e e eQR#4c)nn_#4!58L!L e e er   c                 <    g | ]}|                     d           |S rn   rp   rr   s     r   ru   zGFineGrainedFP8HfQuantizer.update_weight_conversions.<locals>.<listcomp>   s)    VVVq

9@U@UVVVVr   re   )r%   r   r!   rj   ri   r_   rl   r<   ra   listr+   appendrf   rh   _original_target_patternsextend)r   weight_conversionsr_   rl   ra   scale_renameupdatedconvweight_sourcesanchored_weightscale_sourcesothernew_sourcesnew_opss                 r   update_weight_conversionsz3FineGrainedFP8HfQuantizer.update_weight_conversions   s   & " 	Ft'?'J 	F%(C(C(E(EEEHHHHHHHH@@@@@@ &~6FXnooo*^d3E.F.FF& 	! 	!D dO44 t$$$WW)=WWWN 
"C"CN"C"C"C e eVd e e eVVD$8VVV-=E(=../$t2G2GG&$/$($B&  
 NN4    t2244555r   )r5   r   )rM   
__module____qualname____doc__requires_calibration__annotations__r   r4   strboolr?   floatrB   rH   rQ   rV   propertyrX   rZ   r]   rj   r   __classcell__)r   s   @r   r   r      s         
 !////8 8 8 8 8/ / /b	.? 	S 	_c 	 	 	 	D(9 Ds DSa Dfk D D D D D D
 
 
 
 
$  .   d    X     X! ! !
   8 8 8 8 8 8 8r   r   )typingr   utilsr   r   r   r   baser
   quantizers_utilsr   r"   modeling_utilsr   utils.quantization_configr   
get_loggerrM   r&   r   rT   r   r   <module>r      s                ` ` ` ` ` ` ` ` ` ` ` `       2 2 2 2 2 2  LLL A000000@@@@@@		H	%	%P P P P P P P P P Pr   