
    |-j                       d dl mZ d dlZd dlmZ d dlmZ d dlZd dlm	Z	 d dlm
Z ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZmZ  ej        e          Zej         Z! ej"        e!          j#        Z$ ej"        e!          j%        Z&dZ'd Z( ed           G d d                      Z)ej*        dLd            Z+ ed           G d d                      Z,ej*        dMd            Z-dNdZ.ej/        fdOd'Z0 G d( d)e	j1                  Z2dPd/Z3dPd0Z4dQd7Z5dRd<Z6dSd>Z7dPd?Z8 G d@ dAe	j9                  Z: G dB dCe          Z; e;            Z<	 dTdUdGZ= G dH dIe          Z> G dJ dKe          Z?dS )V    )annotationsN)Callable)	dataclass)
functional   )ACT2FN)ConversionOps)should_convert_module)logging)get_cuda_runtime_versionis_kernels_availableresolve_internal_import   )lazy_load_kernel)ExpertsInterfaceuse_experts_implementation   c                    |D ]$}t          | |          rt          | |          c S %t          t          |           j         d|           )Nz has none of: )hasattrgetattrAttributeErrortype__name__)objnamesnames      i/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/integrations/finegrained_fp8.py_first_attrr   /   sc     & &3 	&3%%%%%	&
DII.EEeEE
F
FF    T)frozenc                  <    e Zd ZU dZded<   ded<   ded<   ded<   dS )FineGrainedFP8zNEntry points exposed by the `kernels-community/finegrained-fp8` Triton kernel.r   
fp8_matmulfp8_act_quantbatched_fp8_matmulgrouped_fp8_matmulNr   
__module____qualname____doc____annotations__ r   r   r"   r"   6   sH         XX          r   r"   returnc                    t                      st          d          t          d          } | t          d          t          | dd          }t          | dd          }t          | dd          }t          | dd          }d	 d|fd|fd|fd|ffD             }|r&t          d
d                    |           d          t          ||||          S )z
    Load the finegrained-fp8 Triton kernel once and return its entry points.

    Raises `ImportError` if the `kernels` package is missing, or the kernel or required
    symbols cannot be found.
    z`finegrained-fp8 kernel requires the `kernels` package. Install it with `pip install -U kernels`.zfinegrained-fp8Nu   Failed to load the finegrained-fp8 kernel — check that `kernels-community/finegrained-fp8` has a build matching the current torch/CUDA.w8a8_fp8_matmulr$   w8a8_fp8_matmul_batchedw8a8_fp8_matmul_groupedc                    g | ]	\  }}||
S Nr,   .0r   attrs      r   
<listcomp>z0_load_finegrained_fp8_kernel.<locals>.<listcomp>Y   s-     	 	 	D$ < 	 <<r   z4finegrained-fp8 kernel is missing required symbols: , A. Please update the `kernels` package (`pip install -U kernels`).)r#   r$   r%   r&   )r   ImportErrorr   r   joinr"   )kernelr#   r$   r%   r&   missings         r   _load_finegrained_fp8_kernelr>   @   sI     !! 
n
 
 	
 /00F~;
 
 	

 !2D99JFOT::M )BDII )BDII	 	 
+m,&(:;&(:;	
	 	 	G  
N499WCUCU N N N
 
 	

 #--	   r   c                  2    e Zd ZU dZded<   ded<   ded<   dS )DeepGEMMzAEntry points exposed by the `kernels-community/deep-gemm` kernel.r   r#   r&   per_token_cast_to_fp8Nr'   r,   r   r   r@   r@   q   s<         KK    ######r   r@   c                    t                      st          d          t          j                                        st          d          t          j                                        d         } | dk     rt          d|  d          t                      \  }}|dk     s|dk    r|dk     rt          d	| d
| d          t          d          }|t          d          t          |dd          }t          |dd          }t          |d          }d d|fd|fd|ffD             }|r&t          dd
                    |           d          t          |||          S )z
    Load DeepGEMM once and return its entry points.

    Raises `ImportError` if CUDA/hardware requirements are not met, or the kernel or
    required symbols are not found.
    zYDeepGEMM kernel requires the `kernels` package. Install it with `pip install -U kernels`.zcDeepGEMM kernel requires CUDA, but CUDA is not available. Use a different `experts_implementation`.r   	   z_DeepGEMM requires a Hopper (SM90+) or newer GPU, but the current device has compute capability z-.x. Use a different `experts_implementation`.      z0DeepGEMM requires CUDA runtime 12.3+, but found .zO. Please upgrade your CUDA toolkit or use a different `experts_implementation`.z	deep-gemmNu|   Failed to load the DeepGEMM kernel — check that `kernels-community/deep-gemm` has a build matching the current torch/CUDA.fp8_gemm_nt m_grouped_fp8_gemm_nt_contiguouszutils.per_token_cast_to_fp8)chained_pathc                    g | ]	\  }}||
S r3   r,   r4   s      r   r7   z)_load_deepgemm_kernel.<locals>.<listcomp>   s-       D$
 < 	 <<r   z-DeepGEMM kernel is missing required symbols: r8   r9   )r#   r&   rA   )r   r:   torchcudais_availableget_device_capabilityr   r   r   r   r;   r@   )major
cuda_major
cuda_minorr<   r#   r&   rA   r=   s           r   _load_deepgemm_kernelrR   z   s	     !! wuvvv:""$$ 
q
 
 	

 J,,..q1Eqyy[&+[ [ [
 
 	
 677J
B:++
Q\z \ \J \ \ \
 
 	

 k**F~;
 
 	

 55J )KTRR3FIfggg  J'/1CD*,AB
  G  
NDIIg<N<N N N N
 
 	

 -3   r   aintbc                    | |z   dz
  |z  S )zCeiling division.r   r,   )rS   rU   s     r   _cdivrW      s    EAI!r   Atorch.TensorBAsBs
block_size	list[int]output_dtypetorch.dtypec                   |<|d         |d         cxk    rdk    r"n n	 t                      }|                     d| j        d                   }|                    d|j        d                   }t          j        |j        d         |j        d         | j        |          }	|                    ||                                f||                                f|	           |	                    | j        dd         |j        d         fz             S # t          $ r t          
                    d           Y nw xY wt                      }
|
                    | |||||          S )u  FP8 matmul: C = dequant(A, As) @ dequant(B, Bs)^T.

    Supports both per-tensor and block-wise quantization:
      - block_size=None or block_size=[N, K]: per-tensor mode (As is scalar/per-row, Bs is scalar)
      - block_size=[block_n, block_k]: block-wise mode (As and Bs are per-block scale grids)

    Dispatch order:
      1. DeepGEMM (Hopper+, block_size 128x128) if available
      2. Triton finegrained-fp8 kernel (universal fallback)

    Args:
        A:  (M, K) float8_e4m3fn — quantized activations
        B:  (N, K) float8_e4m3fn — quantized weights
        As: block-wise: (M, K//block_k) float32; per-tensor: (M,) per-row scales
        Bs: block-wise: (N//block_n, K//block_k) float32; per-tensor: scalar or (1,) single weight scale
        block_size: [block_n, block_k] for block-wise quantization, or None/[N, K] for per-tensor
        output_dtype: desired output dtype
    Nr   r   r   devicedtypea  DeepGEMM kernel is not available or compatible, falling back to Triton finegrained-fp8 kernel. To use DeepGEMM FP8 matmul, ensure you have a Hopper (SM90+) or newer GPU with CUDA runtime 12.3+, and that the `kernels` package is installed and up to date (`pip install -U kernels`).)rR   viewshaperK   emptyrd   r#   floatr:   loggerwarning_oncer>   )rX   rZ   r[   r\   r]   r_   deepgemmA_2dAs_2doutputfinegrained_fp8s              r   r/   r/      sh   4 *Q-:a="G"G"G"GC"G"G"G"G"G	=,..H 66"agbk**DGGB--E[A
18S_```Fu{{}} 5288::OOO;;qwss|qwqzm;<<<  	 	 	i    	 344O%%aBJMMMs   D $D=<D=c                  2     e Zd Zdddefd fdZddZ xZS )	FP8LinearNdynamicFin_featuresrT   out_featuresr]   tuple[int, int] | Noneactivation_schemestrhas_biasboolc                   t                                          ||           || _        || _        || _        t
          j                            t          j        |||                    | _	        | j        8t          j        t          j
        dt
          j                            | _        nz|| j        d         z   dz
  | j        d         z  }|| j        d         z   dz
  | j        d         z  }t          j        t          j        ||t
          j                            | _        | j        dk    r8t          j        t          j
        dt
          j                            | _        n|                     dd            | j        r2t          j        t          j        | j                            | _        d S |                     dd            d S )Nre         ?r   r   staticactivation_scalebias)super__init__ry   r]   rw   rK   nn	Parameterrh   weighttensorfloat32weight_scale_invr   register_parameterru   r   )
selfrt   ru   r]   rw   ry   re   scale_out_featuresscale_in_features	__class__s
            r   r   zFP8Linear.__init__   s    	l333 $!2h((\;V[)\)\)\]]?"$&Lc1W1W1W$X$XD!!".1C"Ca"GDO\]L^!^!,tq/A!AA!E$/Z[J\ \$&L.0AWWW% %D! !X--$&Lc1W1W1W$X$XD!!##$6==== 	2U[1B%C%CDDDIII##FD11111r   inputrY   r-   c                   | j                                         dk    r t          j        || j         | j                  S | j         }| j        }t          |t          j        j	        j
                  r(|                                }|                                }| j        dk    rGt                      }|                    || j        | j        d         n|j        d                   \  }}n| j        dk    ra| j                            t          j                  }||z                      t*          t,                                        t.                    }nt1          d| j                   t3          ||||| j        |j                  }| j        |                    | j                   |                    |j                  S )	Nr   rs   rb   r~   minmaxzUnsupported activation scheme: r_   r|   )r   element_sizeFlinearr   r   
isinstancerK   distributedr   DTensorto_localrw   r>   r$   r]   rg   r   tor   clamp_FP8_MIN_FP8_MAX
_FP8_DTYPENotImplementedErrorr/   re   add_)r   r   r   	scale_invrp   qinputscalero   s           r   forwardzFP8Linear.forward  s   ;##%%))8E4;	:::)	fe/6>?? 	-__&&F!**,,I!Y..:<<O+99T_-Htq))ekZ\o MFEE #x//),,U];;Eem**xX*FFII*UUFF%&`H^&`&`aaa O
 
 
 9 KK	"""yyu{y+++r   )
rt   rT   ru   rT   r]   rv   rw   rx   ry   rz   )r   rY   r-   rY   )r   r(   r)   r   r   r   __classcell__r   s   @r   rr   rr      se        
 .2!*"2 "2 "2 "2 "2 "2 "2H!, !, !, !, !, !, !, !,r   rr   r   torch.nn.Modulehidden_statestop_k_indextop_k_weightsc                   | j         dk    rt          d          t                      }|                    d          }|                    d          }|                    d          }|                    |d          }|                    d          }	|                    d          }
|
                    d| j        dz
             |                    || j	        r| j
        n| j        | j	        r| j        n| j        | j        |
          }| j	        r|                     |          }n|                     |          }|                    || j        | j        | j        |
          }||	                    |j                                      d          z  }|                    |||                              d          }|                    |j                  S )Nr~   zbatched_mm experts dispatch does not support activation_scheme='static'. Use the default eager dispatch or switch to activation_scheme='dynamic'.rb   r   dimr   )r]   
expert_ids)rw   r   r>   sizerepeat_interleavereshapeclamp_num_expertsr%   has_gategate_up_projup_projgate_up_proj_scale_invup_proj_scale_invr]   _apply_gateact_fn	down_projdown_proj_scale_invr   re   	unsqueezerf   sum)r   r   r   r   rp   	num_top_k
num_tokens
hidden_dimselected_hidden_statessample_weightsr   proj_outweighted_outfinal_hidden_statess                 r   fp8_batched_mm_experts_forwardr   8  s    ))!W
 
 	

 344O  $$I##A&&J##B''J +<<YA<NN"**2..N$$R((J
 a)A-... 11!]<'+}P##$:P? 2  H } )##H-- ;;x(( 11 ? 2  H n//??II"MMML '++J	:NNRRWXRYY!!-"5666r   c                   | j         dk    rt          d          t                      }|j        }|                    d          }|                    d          }|                    d          }|                    d          }	|                    d          }
t          j        |
          \  }}|||z           }|	|         }|j        dk    r|	                                n|
                                }t          j        || j        d| j        dz
            }t          j        |dt          j                  }|| j        k                        d          }| j        r| j        n| j        }| j        r| j        n| j        }| j        }| j        }t/          |t          j        j        j                  rP|                                }|                                }|                                }|                                }|                    ||||| j        |	          }| j        r|                     |          }n|                     |          }|                    ||||| j        |	          }||                     |j!                                      d          z  }|"                    |d
           t          j#        |          }t          j$        |                    d          |          ||<   ||         }|%                    |||          &                    d          }|                     |j!                  S )Nr~   zgrouped_mm experts dispatch does not support activation_scheme='static'. Use the default eager dispatch or switch to activation_scheme='dynamic'.rb   r   cpur   binsr   r   )r   re   )tokens_per_expertr]   offsets        rd   r   )'rw   r   r>   rd   r   r   rK   sortr   ri   rT   histcr   cumsumint32r   r   r   r   r   r   r   r   r   r   r   r   r   r&   r]   r   r   r   re   masked_fill_
empty_likearangerf   r   )r   r   r   r   rp   rd   r   r   r   r   r   expert_ids_gpermselected_hidden_states_gsample_weights_ghistc_inputr   r   sentinel_maskw_upws_upw_downws_downr   r   inv_permr   s                              r   fp8_grouped_mm_experts_forwardr   y  sJ    ))!W
 
 	

 344O!F  $$I##A&&J##B''J #**2..N$$R((J J//L$,TY->?%d+
 +1+*>*>,$$&&&LDTDTDVDVKKd6FASWScfgSghhhl,!5;GGGG "T%55@@DDM !%?44<D+/=TD''d>TE^F&G$)0899 %}}  """"$$ 11 +? 2  H } )##H-- ;;x(( 11+? 2  H .11(.AAKKBOOOL mS111 %%H\$))A,,v>>>HTN)L '++J	:NNRRWXRYY!!-"5666r   expert_ids_sortedr   	alignmentuse_psum_layoutrz   tuplec                   | j         }|                     d          }t          j        |                                 |d|dz
                                            }||z   dz
  |z  |z  }|t          ||          |dz
  z  z   }||z
  }	t          j        j        	                    |	
                    d          d          }
t          j        ||          |
|          z   }|r(|
                    d                                          }nRt          j        |fd|t          j                  }t          j        | |k     |                                 d          ||<   |||fS )a7  Build the TMA-aligned layout DeepGEMM's grouped GEMM expects.

    Returns `(sorted_to_padded, grouped_layout, total_padded_rows)`. `grouped_layout` encodes
    expert boundaries as a cumsum of aligned counts on Blackwell (`use_psum_layout=True`) or
    per-row expert ids with -1 for padding on Hopper.

    Accepts EP sentinels: values in `expert_ids_sorted` equal to `num_experts` (unclamped sentinels)
    are routed past the last aligned expert block and marked `-1` in the Hopper layout (and
    excluded from the Blackwell cumsum), so DeepGEMM skips them.
    r   r   r   )r   r   r   rb   rc   )rd   r   rK   r   rT   longr   r   r   padr   r   fullr   where)r   r   r   r   rd   r   r   aligned_tokens_per_experttotal_padded_rowspadding_per_expertcumulative_paddingsorted_to_paddedgrouped_layouts                r   !_build_deepgemm_contiguous_layoutr     su    %F"''**J$5$9$9$;$;+STZehiZijjjooqq"3i"?!"C	!QU^ ^"S[%A%AYQR]%SS 35FF,001C1J1J11M1MvVV|Jv>>>ASTeAff 	u
 399!<<@@BB %6$8"VSXS^___+0;7H;7VXiXmXmXoXoqs+t+t'(^->>>r   scalesr   r   !tuple[torch.Tensor, torch.Tensor]c                    t          j        || j        d         | j        | j                  }| ||<   t          j        ||j        d         | j        t           j                  }|||<   ||fS )zKPad sorted hidden states and scales into the TMA-aligned contiguous layout.r   rc   )rK   zerosrg   rd   re   r   )r   r   r   r   hidden_paddedscales_paddeds         r   "_pad_to_deepgemm_contiguous_layoutr   	  s{     K=.q1-:NVcVi  M '4M"#K 16<?=K_glgtuuuM&,M"#-''r   hidden_states_paddedc                    | |         S )z;Remove padding rows from the TMA-aligned contiguous layout.r,   )r   r   s     r   &_unpad_from_deepgemm_contiguous_layoutr     s       011r   c                f   | j         dk    rt          d          | j        t          d          | j        d         dk    s| j        d         dk    rt          d| j                   t	                      }|j        }|                    d          }|                    d          }|                    d          }|                    d          }	|                    d          }
t          j	        |
          \  }}|||z           }|	|         }t          j
                            |          d         d	k    }t          || j        t          |
          \  }}}|| j        k                        d          }| j        r| j        n| j        }| j        r| j        n| j        }| j        }| j        }t/          |t          j        j        j                  rP|                                }|                                }|                                }|                                }|                    |d          \  }}t;          ||||          \  }}t          j        ||j        d         |t          j                   }|!                    ||f||"                                f|||           | j        r| #                    |          }n| $                    |          }|                    |d          \  }}t          j        |||t          j                   }|!                    ||f||"                                f|||           tK          ||          }||&                    |j'                                      d          z  }|(                    |d           t          j)        |          }t          j*        |                    d          |          ||<   ||         }|+                    |||          ,                    d          }|&                    |j'                  S )Nr~   zDeepGEMM experts dispatch does not support activation_scheme='static'. Use the default eager dispatch or switch to activation_scheme='dynamic'.zuDeepGEMM requires block-wise quantization (block_size=[128, 128]), but got per-tensor quantization (block_size=None).r   r   r   z-DeepGEMM requires block_size=(128, 128), got rb   
   )r   r   F)	use_ue8m0rc   )r   r   r   r   )-rw   r   r]   
ValueErrorrR   rd   r   r   rK   r   rL   rN   r   r   _DEEPGEMM_M_ALIGNMENTr   r   r   r   r   r   r   r   r   r   r   r   r   rA   r   rh   rg   bfloat16r&   ri   r   r   r   r   re   r   r   r   rf   r   ) r   r   r   r   rl   rd   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   act_fp8
act_scalesr   proj_fp8proj_scalesr   r   r   s                                    r   fp8_deepgemm_experts_forwardr     sD    ))!W
 
 	
 A
 
 	
 qS  DOA$6#$=$=ZZZ[[[$&&H!F  $$I##A&&J##B''J #**2..N$$R((J J//L$,TY->?%d+j66v>>qARGO:[d&2GYh; ; ;7n&7 "T%55@@DDM !%?44<D+/=TD''d>TE^F&G$)0899 %}}  """"$$ #889Q]b8ccGZ<WjRbduvvGZ{,djmFRWR`aaaH	*ekkmm4h`o     
 } )##H--;;x(( %::8u:UUHk{,ju~^^^H	;	!'      6h@PQQH .11(.AAKKBOOOL mS111 %%H\$))A,,v>>>HTN)L '++J	:NNRRWXRYY!!-"5666r   c                  H     e Zd Zddddefd fdZddZddZ	 dddZ xZS ) 
FP8ExpertsNrs   FTr]   rv   rw   rx   ry   rz   r   c                   t                                                       |du s
J d            || _        || _        || _        || _        |j        | _        || _        t          |dd          | _
        t          |dd          | _        t          t          |dd                   | _        | j        rd	| j        z  | j        }}t          j        t!          j        | j
        |||
                    | _        | j        t'          || j        d                   nd}	| j        t'          || j        d                   nd}
t          j        t!          j        | j
        |	|
t           j        
                    | _        |                     dd            n| j        | j        }}t          j        t!          j        | j
        |||
                    | _        | j        t'          || j        d                   nd}| j        t'          || j        d                   nd}t          j        t!          j        | j
        ||t           j        
                    | _        |                     dd            | j        | j        }}t          j        t!          j        | j
        |||
                    | _        | j        t'          || j        d                   nd}| j        t'          || j        d                   nd}t          j        t!          j        | j
        ||t           j        
                    | _        |                     dd            | j        dk    rzt          j        t!          j        | j
        t           j        
                    | _        t          j        t!          j        | j
        t           j        
                    | _        d S d S )NFzWFP8Experts does not support bias for now, please open an issue if you want this featurenum_local_expertsr   moe_intermediate_sizeintermediate_sizehidden_activation
hidden_actr   r|   r   r   gate_up_proj_biasup_proj_biasdown_proj_biasr~   )r   r   configry   r   r]   hidden_sizer   rw   r   r   intermediate_dimr   r   r   r   rK   rh   r   rW   r   r   r   r   r   r   r   onesgate_up_proj_activation_scaledown_proj_activation_scale)r   r  r]   rw   ry   r   re   gu_proj_out
gu_proj_ingu_scale_outgu_scale_in
u_proj_out	u_proj_inu_scale_out
u_scale_in
d_proj_out	d_proj_ind_scale_out
d_scale_inr   s                      r   r   zFP8Experts.__init__  sx    	5   e !     $ ,!2&v/BMRR +F4KM` a a[1DlSST= 	:&'$*?&?K "U[9I;Xbjo-p-p-p q qDEI_E`5doa.@AAAfgLCG?C^%
DOA,>???deK*,,D,lKu}]]]+ +D' ##$7>>>>$($94?	J<D4DjR[ch(i(i(ijjDLCG?C^%
DOA,>???deKAEA\y$/!*<===bcJ%'\D,k:U][[[& &D" ##ND999 $1FI
ek$2BJPYaf&g&g&ghh?C?ZeJ(:;;;`a=A_=XU9doa&8999^_
#%<K(+zWWW$
 $
  	 0$777!X--13ejIYafan>o>o>o1p1pD..0l5:dFV^c^k;l;l;l.m.mD+++ .-r   gate_uprY   r-   c                f    |                     dd          \  }}|                     |          |z  S )Nr   rb   r   )chunkr   )r   r&  gateups       r   r   zFP8Experts._apply_gate  s2    ===++b{{4  2%%r   r   r   r   c                   t          j        |t           j                  }t          j                    5  t           j        j                            || j                  }|                    ddd          }t          j	        |
                    d          d                              d	                              d
          }d d d            n# 1 swxY w Y   |D ]v}|| j        k    rt          j        ||                   \  }}	||	         }
| j        dk    r| j        |         nd }|                     |
| j        r| j        |         n| j        |         | j        r| j        |         n| j        |         |          }| j        r|                     |          n|                     |          }| j        dk    r| j        |         nd }|                     || j        |         | j        |         |          }||	|d f         }||                    |j                  z  }|                    d|	|                    |j                             x|                    |j                  S )Nr|   )num_classesr   r   r   )rb   r   F)as_tuplerb   r~   )r   )rK   
zeros_liker   no_gradr   r   one_hotr   permutegreaterr   nonzerorf   r   rw   r  r   r   r   r   r   r   r   r   r  r   r   r   re   
index_add_)r   r   r   r   r   expert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategate_up_act_scaler   down_act_scalerouting_weightsr   s                   r   r   zFP8Experts.forward  s   
 $.}EMRRR]__ 	j 	j(-55ktO_5``K%--aA66K{8'D'DaHHPPZ_P``eefhiiJ	j 	j 	j 	j 	j 	j 	j 	j 	j 	j 	j 	j 	j 	j 	j
 % 	e 	eJT---#(;{:/F#G#G Iy))4MBFBX\dBdBd2:>>jn  {{15\!*--DLQ[D\;?=p+J77dNdeoNp!2	 #  H 6:]]t''111T\H]H]H?C?UYa?a?a/
;;gk  {{z*(4!/	 #  H ,Iy$,FGO#o&8&8&H&HHL**1iI\Ib9c9cdddd"%%m&9:::s   BCCCr   r   r   r   torch.Tensor | Nonec                0   |                                 dk    rt          j        ||d           S | j        dk    r^|\|                    t
          j                  }||z                      t          t                                        t                    }nFt                      }|                    || j        | j        d         n|j        d                   \  }}t          ||||| j        |j                  }|                    |j                  S )Nr   r~   r   rb   r   r|   )r   r   r   rw   r   rK   r   r   r   r   r   r>   r$   r]   rg   r/   re   )	r   r   r   r   r   r   r   rp   ro   s	            r   r   zFP8Experts.linear  s      1$$8E64000!X--2B2N$''66Eem**xX*FFII*UUFF:<<O+99T_-Htq))ekZ\o MFE !O
 
 
 yyu{y+++r   )r]   rv   rw   rx   ry   rz   r   rz   )r&  rY   r-   rY   )r   rY   r   rY   r   rY   r-   rY   r3   )
r   rY   r   rY   r   rY   r   r?  r-   rY   )	r   r(   r)   r   r   r   r   r   r   r   s   @r   r
  r
    s         .2!*7n 7n 7n 7n 7n 7n 7nr& & & &(; (; (; (;^ 15, , , , , , , , ,r   r
  c                      e Zd ZdZeeedZdS )FP8ExpertsInterfacez?Interface for registering custom FP8 experts forward functions.)
batched_mm
grouped_mmrl   N)r   r(   r)   r*   r   r   r  _global_mappingr,   r   r   rB  rB    s+        II 540 OOOr   rB  Fmodules_to_not_convertlist[str] | Nonec                   |j         r| S d}|                                 D ]C\  }}t          ||          s|ri nddi}d}t          j        d          5  |                    d          rt          |dd          }	t          |dd          }
t          |d	| j                                                  }t          t          t          |
|	
          } |d||j        |j        |
|	d|}nGt          |t          j                  r-t#          d|j        |j        |j        |j        |j        dud|}||                     ||           d}ddd           n# 1 swxY w Y   E|st,                              d           | S )a  
    A helper function to replace all `torch.nn.Linear` modules by `FP8Linear` modules.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`):
            Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons.
        quantization_config (`FbgemmFp8Config`):
            The quantization config object that contains the quantization parameters.
        pre_quantized (`book`, defaults to `False`):
            Whether the model is pre-quantized or not
    Fre   Nmetaz.expertsr   Try   r  )experts_classexperts_interfacery   r   )r  r]   rw   ry   r   )rt   ru   r]   rw   ry   zYou are loading your model using fp8 but no linear modules were found in your model. Please double check your model architecture.r,   )
dequantizenamed_modulesr
   rK   rd   endswithr   r  get_text_configr   r
  ALL_FP8_EXPERTS_FUNCTIONSweight_block_sizerw   r   r   Linearrr   rt   ru   r   set_submodulerj   warning)modelrF  quantization_configpre_quantizedhas_been_replacedmodule_namemodulemodule_kwargs
new_moduler   ry   r  	new_classs                r   replace_with_fp8_linearr^    s0   " % $2244 %) %)V$[2HII 	 ,@'4
\&!! 	) 	)##J// "6:t<<"6:u== 5<3O3O3Q3QRR6",&?%%	  	 'Y !2D&9&K%%  $ 

 FBI.. &  & 2!'!42D&9&K#[4  $ 
 %##K<<<$(!=	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	)@  
<	
 	
 	
 Ls   C7EE	E	c                  H    e Zd ZdZd ZddZddZddZedd            Z	dS )Fp8Quantizez^
    A quantization operation that creates two tensors, weight and scale out of a weight.
    c                    || _         d S r3   hf_quantizerr   rc  s     r   r   zFp8Quantize.__init__f      (r   valuerY   r-   tuple[int, int]c                (   d }| j         j        Zt          | j         j        t                    r | j         j                            d          }nt          | j         j        dd           }||j        d         |j        d         f}t          |          S )NrQ  r-  rb   )rc  rV  r   dictgetr   rg   r   )r   rf  r]   s      r   _resolve_block_sizezFp8Quantize._resolve_block_sizei  s    
0<$+?FF g!.BFFGZ[[

$T%6%JL_aeff
+b/5;r?;JZ   r   keyrx   dict[str, torch.Tensor]c                $   |j         dk     r||iS |                     |          \  }}|j        d         |j        d         }}||z  dk    s	||z  dk    r||iS |j        d d         }||z  }||z  }	|j        }
|                    t          j                  } |j        g ||||	|R  }|                                                    d          }t	          j	        |dk    |t	          j
        |                    }t          |z  }t	          j	        |dk    |t	          j
        |                    }|                    d                              d          }||z  }t	          j        |t          t                                        t                    }|                    |
          }d	|z                      t          j                  }|                    d
          r|                    dd          d         dz   n|dz   }||||iS )Nr   r-  rb   r   )rb   r   ro  r   r}   r   rF   r   .weight_scale_inv
_scale_inv)ndimrk  rg   r   rK   r   r   absamaxr   	ones_liker   r   r   r   r   rN  rsplit)r   rl  rf  block_mblock_nrowscolsleading_shape
rows_tiles
cols_tilesoriginal_shape
value_fp32reshapedmax_abssafe_max_absr   scales_broadcastscaled	quantized
inv_scales	scale_keys                        r   _quantize_onezFp8Quantize._quantize_onet  s
    :>><33E::[_ek"od'>Q$.A"5"5< CRC(W_
W_
XXem,,
%:%_}_j_'_:_W^___,,..%%(%33{7Q;9Q9QRRL(Wq[&%/&2I2IJJ!++B//99"==,,KH(CCCFFzRR	%%n55	Fl&&u}55
CF<<PXCYCYqCJJsA&&q),???_beq_q	Y	:66r   
input_dictc                    i }|                                 D ]M\  }}t          |t                    r|d         n|}|                    |                     ||                     N|S )Nr   )itemsr   listupdater  )r   r  kwargsresultrl  rf  r   s          r   convertzFp8Quantize.convert  sn     +-$**,, 	; 	;JC!+E4!8!8CU1XXeFMM$,,S&99::::r   r	   c                *    t          | j                  S r3   )Fp8Dequantizerc  r   s    r   
reverse_opzFp8Quantize.reverse_op  s    T.///r   N)rf  rY   r-   rg  )rl  rx   rf  rY   r-   rm  )r  rY   r-   rm  r-   r	   )
r   r(   r)   r*   r   rk  r  r  propertyr  r,   r   r   r`  r`  a  s         ) ) )	! 	! 	! 	! 7  7  7  7D    0 0 0 X0 0 0r   r`  c                  X    e Zd ZdZd ZddZdZdd
ZddZ	 dddZ	e
dd            ZdS )r  u  Dequantize FP8 weights using their per-block ``weight_scale_inv``.

    Designed to run as the *first* op in any :class:`WeightConverter` chain when
    loading with ``dequantize=True`` — :meth:`update_weight_conversions` on the
    FP8 quantizer attaches it to each existing model-specific converter so that
    per-expert (weight, scale) pairs are folded into full-precision tensors before
    the chain's merge / concat ops collapse the per-expert structure.

    Pattern semantics
        Input ``input_dict`` carries one entry per source pattern; each value is a
        list of tensors (one per ``*`` match). For every weight pattern that has a
        sibling ``*.weight_scale_inv`` pattern in the dict, this op pairs them up by
        index, dequantizes per-pair, and emits the dequantized list under the
        original *weight* key. Scale entries are dropped from the output so the
        remaining ops only see weights.
    c                    || _         d S r3   rb  rd  s     r   r   zFp8Dequantize.__init__  re  r   weight_patternrx   r-   c                    |                     d          }|r
|d d         n|}|                     d          r|d t          d                    dz   }n|dk    rd}n|dz   }|r|dz   n|S )N$rb   z.weightrp  r   r   rq  )rN  len)r   r  anchoredbaser   s        r   _scale_pattern_forz Fp8Dequantize._scale_pattern_for  s    !**3//&.B~crc""N==## 	(*C	NN?*+.AAEEX&EE<'E&1us{{E1r   )r   g      ?r}   g      ?g       @g      @g      @g      @g       g      g      g      g       g      g      g      packedrY   c                   t          j        | j        t           j        |j                  }|                                                    t           j                  }|dz                                  }|dz	  dz                                  }t          j	        ||         ||         gd          } |j
        g |j        dd         d|j        d         z  R  S )uR   Two ``e2m1`` FP4 values per byte → float32 tensor twice as wide on the last dim.)re   rd         rb   r   Nr   )rK   r   _FP4_E2M1_LUTr   rd   
contiguousrf   uint8r   stackr   rg   )r   r  lutu8lowhighunpackeds          r   _unpack_fp4zFp8Dequantize._unpack_fp4  s    l4-U]6=YYY  %%ek22CxooqC%%'';C#d)4"===xIcrc!2IAR8H4HIIIIr   r  r   c                <   t          t          dd           }|j        t          j        k    s|!|j        |k    r|                     |          }n|                    t          j                  }|j        dd          \  }}|j        dd          \  }}||z  s||z  rt          d| d| d| d| d	          ||z  }	||z  }
|j        j	        r|
                                dk    r|j        nt          j        }|j        }|                    d||	||
          }|                    t          j                                      d||                              d                              d          }||z                      |                              |          S )	Nfloat4_e2m1fn_x2r-  zWeight shape (r8   z) not divisible by scale grid (z).r   rb   )r   rK   re   int8r  r   r   rg   r  is_floating_pointr   r  r   r   )r   r  r   	fp4_dtypequantized_fp32ry  rz  
scale_rows
scale_colsrw  rx  	out_dtyper~  qss                  r   _dequantize_onezFp8Dequantize._dequantize_one  s    E#5t<<	?ej((Y-ByZcGcGc!--i88NN&\\%-88N#)"##.
d "(bcc!2
J* 	z 1 	jjjjjjjj\fjjj   *$*$ %+L$BuvGZGZG\G\`aGaGaFLLglgu	'-""2z7JPPIIem$$,,RZHHRRSUVV``abccAzz)$$,,^<<<r   Nr  ,dict[str, list[torch.Tensor] | torch.Tensor]full_layer_name
str | Nonec                    d|v rn|d         }t          |t                    r|d         n|}d|v r?|d         }t          |t                    r|d         n|}|                     ||          iS ||iS i }|                                D ]\  }}d|v sd|v r                     |          }	|	|vr|||<   -t          |t                    r|n|g}
||	         }t          |t                    r|n|g}t          |
          t          |          k    r3t          d| dt          |
           dt          |           d           fd	t          |
|          D             ||<   |S )
Nzweight$r   r   r   z/Fp8Dequantize: weight/scale count mismatch for z (z weights vs z	 scales).c                B    g | ]\  }}                     ||          S r,   )r  )r5   wr  r   s      r   r7   z)Fp8Dequantize.convert.<locals>.<listcomp>  s-    WWW$!Q4//155WWWr   )r   r  r  r  r  r  r  zip)r   r  r  r  r  r   r  rl  rf  r  weightss   `          r   r  zFp8Dequantize.convert  s    
"""9-I(29d(C(CR	!I!Z//#$67&0&>&>JF')=)=i)P)PQQ#Y// @B$**,, 	X 	XJC!S((,>#,E,E//44I
**#s)%66CeeUGG	*F)&$77EVVfXF7||s6{{** Ic I IGI I25f++I I I   XWWW#gvBVBVWWWF3KKr   r	   c                *    t          | j                  S r3   )r`  rc  r  s    r   r  zFp8Dequantize.reverse_op  s    
 4,---r   )r  rx   r-   rx   )r  rY   r-   rY   )r  rY   r   rY   r-   rY   r3   )r  r  r  r  r-   r  r  )r   r(   r)   r*   r   r  r  r  r  r  r  r  r,   r   r   r  r    s         ") ) )
2 
2 
2 
2 mMJ J J J= = = => '+& & & & &P . . . X. . .r   r  )r-   r"   )r-   r@   )rS   rT   rU   rT   r-   rT   )rX   rY   rZ   rY   r[   rY   r\   rY   r]   r^   r_   r`   r-   rY   )
r   r   r   rY   r   rY   r   rY   r-   rY   )
r   rY   r   rT   r   rT   r   rz   r-   r   )
r   rY   r   rY   r   rY   r   rT   r-   r   )r   rY   r   rY   r-   rY   )NNF)rF  rG  )@
__future__r   	functoolscollections.abcr   dataclassesr   rK   torch.nnr   r   r   activationsr   core_model_loadingr	   quantizers.quantizers_utilsr
   utilsr   utils.import_utilsr   r   r   hub_kernelsr   moer   r   
get_loggerr   rj   float8_e4m3fnr   finfor   r   r   r   r  r   r"   cacher>   r@   rR   rW   r   r/   rR  rr   r   r   r   r   r   r  Moduler
  rB  rP  r^  r`  r  r,   r   r   <module>r     s+   # " " " " "     $ $ $ $ $ $ ! ! ! ! ! !        $ $ $ $ $ $             . . . . . . ? ? ? ? ? ?       h h h h h h h h h h ) ) ) ) ) ) = = = = = = = = 
	H	%	%  
5;z""&5;z""&  G G G $! ! ! ! ! ! ! ! - - - -` $$ $ $ $ $ $ $ $ = = = =@    !&,N ,N ,N ,N ,N^F, F, F, F, F,	 F, F, F,R>7 >7 >7 >7Bb7 b7 b7 b7J(? (? (? (?V( ( ( ( 2 2 2 2g7 g7 g7 g7TC, C, C, C, C, C, C, C,L    *    0/11  ejA A A A AHA0 A0 A0 A0 A0- A0 A0 A0Hx. x. x. x. x.M x. x. x. x. x.r   