
    |-j-                       d Z ddlmZ ddlZddlmZ ddlmZ ddlZddl	m
Z
 dd	lmZ  e
j        e          Zd
dddZ ed           G d d                      Zej        d+d            Zej        j        d,d%            Zd-d*ZdS ).zSonicMoE integration: fused MoE using CuteDSL kernels from `kernels-community/sonic-moe`.

Provides `sonicmoe_experts_forward` registered as "sonicmoe" in the ExpertsInterface.
Requirements: CUDA, `kernels`, `nvidia-cutlass-dsl`, has_gate=True.
    )annotationsN)Callable)	dataclass   )logging   )lazy_load_kernelswiglugeglureglu)silugelureluT)frozenc                  (    e Zd ZU dZded<   ded<   dS )SonicMoEzAEntry points exposed by the `kernels-community/sonic-moe` kernel.typeactivation_type_enumr   moe_general_routing_inputsN)__name__
__module____qualname____doc____annotations__     b/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/integrations/sonicmoe.pyr   r   '   s0         KK((((((r   r   returnc                    t           j                                        st          d          t           j                                        d         } | dk     rt          d|  d          t          d          }|t          d          t          t          |d	d          d
d          }t          |dd          }d d|fd|ffD             }|r&t          dd                    |           d          t          ||          S )z
    Load sonic-moe once and return its entry points.

    Raises `ImportError` if CUDA/hardware requirements are not met, or if the kernel or
    required symbols are not found.
    zdsonic-moe kernel requires CUDA, but CUDA is not available. Use a different `experts_implementation`.r   	   z`sonic-moe requires a Hopper (SM90+) or newer GPU, but the current device has compute capability z-.x. Use a different `experts_implementation`.z	sonic-moeNu}   Failed to load the sonic-moe kernel — check that `kernels-community/sonic-moe` has a build matching the current torch/CUDA.enumsActivationTyper   c                    g | ]	\  }}||
S )Nr   ).0nameattrs      r   
<listcomp>z)_load_sonicmoe_kernel.<locals>.<listcomp>O   s-       D$ < 	
 <<r   zenums.ActivationTypez.sonic-moe kernel is missing required symbols: z, zN. Make sure you have the `kernels` package and `nvidia-cutlass-dsl` installed.)r   r   )	torchcudais_availableImportErrorget_device_capabilityr	   getattrjoinr   )majorkernelr   r   missings        r   _load_sonicmoe_kernelr2   /   st    :""$$ 
r
 
 	

 J,,..q1Eqyy[&+[ [ [
 
 	

 k**F~;
 
 	

 #767D#A#ACSUYZZ!(1Mt!T!T  $%9:)+EF
  G  
[TYYw=O=O [ [ [
 
 	

 1#=   r   hidden_statestorch.Tensorrouter_scores
expert_ids	token_idxw1b1torch.Tensor | Nonew2b2act_namestrnum_expertsintconcat_layoutboolis_inference_mode_enabledc                    t                      }|j        }t          |t                              |d                                          |j                  }|                    | ||||||||	|||
d          \  }}|S )us  Module-level shim around `moe_general_routing_inputs` so `allow_in_graph` can wrap it.

    sonicmoe asserts `not torch.compiler.is_compiling()` internally because it dispatches
    CuteDSL kernels, which Dynamo can't trace. `allow_in_graph` keeps the call in the FX
    graph as a single opaque node (no tracing into the body, no graph break) while still
    running the real Python at runtime — autograd through `_UpProjection` / `_DownProjection`
    flows normally. The decorator must be applied at module load time, not inside the compiled
    function — hence this shim plus the `allow_in_graph` decorator above.
    r
   N)Eactivation_typerC   rA   	stream_id)r2   r   r-   ACT_MAPgetupperSWIGLUr   )r3   r5   r6   r7   r8   r9   r;   r<   r=   r?   rA   rC   sonicmoer   rF   output_s                    r   _sonicmoe_wrapperrO   c   s    0 %&&H#8gkk(H==CCEEG[Gb O 33




'";# 4  IFA Mr   selftorch.nn.Moduletop_k_indextop_k_weightsc                   | j         st          d          |j        j        dk    rt          d          |j        }|                    d          }|                    d          }t          j        ||                              d                              d|          	                    d          
                                }|	                    d                              |j                  }|	                    d          
                                }	| j        }
| j        }| j        r| j        nd }| j        r| j        nd }t%          |
t
          j        j        j                  rX|
                                }
|                                }||                                nd }||                                nd }t/          | j        dd	                                          }| j        rd
nd} |
j        | }
 |j        | }t9          |||	||
||||| j        | j        t          j                               S )Nz/sonicmoe requires gated experts (has_gate=True)r)   zsonicmoe requires CUDA devicer   )devicer   
hidden_actr   )r   r   r   )r   r   r   )r3   r5   r6   r7   r8   r9   r;   r<   r=   r?   rA   rC   ) has_gate
ValueErrorrV   r   sizer(   arange	unsqueezeexpandreshaper@   todtypegate_up_proj	down_projhas_biasgate_up_proj_biasdown_proj_bias
isinstancedistributedtensorDTensorto_localr-   configloweris_transposedpermuterO   r?   is_concatenatedis_grad_enabled)rP   r3   rR   rS   rV   	num_top_k
num_tokensr7   r5   r6   r8   r;   r9   r<   r=   perms                   r   sonicmoe_experts_forwardrt      s4    = LJKKK F**8999!F  $$I##A&&J Z777AA!DDKKBPYZZbbceffjjllI!))"--001DEEM$$R((,,..J 
	B	B#'=	:		dB $	7		4B"e'.677 7[[]][[]] nR[[]]]$ nR[[]]]$ t{L&99??AAH *999	D	T	B	T	B##$*&+&;&=&="=   r   )r   r   )r3   r4   r5   r4   r6   r4   r7   r4   r8   r4   r9   r:   r;   r4   r<   r:   r=   r>   r?   r@   rA   rB   rC   rB   r   r4   )
rP   rQ   r3   r4   rR   r4   rS   r4   r   r4   )r   
__future__r   	functoolscollections.abcr   dataclassesr   r(   utilsr   hub_kernelsr	   
get_loggerr   loggerrH   r   cacher2   _dynamoallow_in_graphrO   rt   r   r   r   <module>r      sO    # " " " " "     $ $ $ $ $ $ ! ! ! ! ! !        ) ) ) ) ) ) 
	H	%	% Wg
>
> $) ) ) ) ) ) ) ) 0 0 0 0f + + + +\= = = = = =r   