
    |-ji                    t   d dl mZ d dlmZ d dlmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZ dd	lmZ  e            r8d d
lZej                            e          Zej                            e          Z ej        e          Z	 	 d7d8dZd9dZd:dZd:dZd Zd Z e            rVej                            d ed!d"#           ej                            d e           ej                             d ee$           d;d%Z!d:d&Z"	 	 d7d<d'Z#d9d(Z$ G d) d*e	          Z% e%            Z&d=d,Z'	 d>e&d-ddd-d.d?d6Z(d
S )@    )annotations)Callable)wraps   )logging)GeneralInterface)is_torch_availableis_torch_greater_or_equalis_torch_less_or_equalis_torchdynamo_compiling   )sonicmoe_experts_forwardNFinputtorch.Tensorweightbiastorch.Tensor | Noneis_transposedboolreturnc                &   |r<t          j        |                     d          |                              d          }n;t          j        ||                     d                                        d          }||                    |           |S )a  Batched linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (batch_size, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (batch_size, output_dim, input_dim) if transposed is `False`,
            else of shape (batch_size, input_dim, output_dim).
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (batch_size, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (batch_size, output_dim).
    r   )torchbmm	unsqueezesqueezeadd_)r   r   r   r   outs        ]/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/integrations/moe.py_batched_linearr    S   s    *  Ai**F33;;A>> i 3 344<<R@@J    selftorch.nn.Modulehidden_statestop_k_indextop_k_weightsc                   |                     d          }|                     d          }|                     d          }|                    |d          }|                    d          }|                    d          }	|	                    d| j        dz
             | j        r$| j        |	         }
| j        r| j        |	         nd }n#| j	        |	         }
| j        r| j
        |	         nd }t          ||
|| j                  }| j        r|                     |          }n|                     |          }| j        |	         }
| j        r| j        |	         nd }t          ||
|| j                  }||                    d          z  }|                    |||                              d          }|                    |j                  S )Nr   r   dimr   r   r   )sizerepeat_interleavereshapeclamp_num_expertshas_gategate_up_projhas_biasgate_up_proj_biasup_projup_proj_biasr    r   _apply_gateact_fn	down_projdown_proj_biasr   viewsumtodtype)r"   r$   r%   r&   	num_top_k
num_tokens
hidden_dimselected_hidden_statessample_weights
expert_idsselected_weightsselected_biasesproj_outweighted_outfinal_hidden_statess                  r   batched_mm_experts_forwardrI   u   s      $$I##A&&J##B''J +<<YA<NN"**2..N$$R((J
 a)A-... } S,Z8@DW$0<<SW<
3;?=R$+J77d  0VZVh  H
 } )##H-- ;;x(( ~j19=Pd)*55DO "HZ  H
 n66r:::L '++J	:NNRRWXRYY!!-"5666r!   offsc                T   t          j        |                     d          |                    d          | j        | j                  }d}t          |                                          D ];\  }}||k    rt          j        | ||         ||         |||                    |}<|S )a(  
    Fallback grouped matrix multiplication used when `torch.nn.functional.grouped_mm` and `torch._grouped_mm`
    are unavailable or incompatible with `torch.compile` (e.g. non-bfloat16 weights).

    Args:
        input (`torch.Tensor`): Input of shape (S, input_dim), sorted by expert id.
        weight (`torch.Tensor`): Expert weights of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`): Cumulative token counts per expert of shape (num_experts,).
    Returns:
        `torch.Tensor`: Output of shape (S, output_dim).
    r   r   devicer=   r   )r   zerosr+   rM   r=   	enumeratetolistmm)r   r   rJ   outputstartiends          r   _grouped_mm_fallbackrW      s     [AAu|SXS^___FE DKKMM**  3C<<uSy!6!9&s2CDDDDMr!   c                   |                                  dk    sJ dt          | j                               |                                 dk    sJ dt          |j                               |                                 dk    sJ dt          |j                               |                    d          |                    d          k    s6J d|                    d           d	|                    d                       |                     d          |                    d          k    s6J d
|                     d           d|                    d                       |j        t
          j        t
          j        fv sJ d|j                     t          j        |                     d          |                    d          | j	        | j                  S )zRShape/dtype inference stub for `_grouped_mm_fallback` required by `torch.compile`.r   z+input must be 2D (S, input_dim), got shape    zBweight must be 3D (num_experts, input_dim, output_dim), got shape r   z*offs must be 1D (num_experts,), got shape r   zoffs length z must match number of experts zinput_dim mismatch: input has z, weight has z$offs must be an integer tensor, got rL   )
r)   tupleshaper+   r=   r   int32int64emptyrM   r   r   rJ   s      r   _grouped_mm_fallback_faker`      s   99;;!_5QVQ\K]K]__::<<1bUSYS_M`M`bb  88::???\tzIZIZ\\???99Q<<6;;q>>)))+v$))A,,+v+vflfqfqrsftft+v+v)))::a==FKKNN***UAUUV[[QR^^UU +** :%+u{33335h\`\f5h5h333;uzz!}}fkk!nnU\QVQ\]]]]r!   c                d    |                      |d         |d                    |d         | _        dS )zjSaves input and weight for backward; offs is stored directly as it is a non-differentiable integer tensor.r   r   r   N)save_for_backwardrJ   )ctxinputsrS   s      r   "_grouped_mm_fallback_setup_contextre      s/    &)VAY///ayCHHHr!   c                   | j         \  }}t          j        |          }t          j        |          }d}t          | j                                                  D ]r\  }}||k    rt          j        |||         ||         j        |||                    t          j        |||         j        |||         ||                    |}s||dfS )zuBackward pass for `_grouped_mm_fallback`. Computes grad_input and grad_weight per expert group; offs has no gradient.r   rN   N)saved_tensorsr   
zeros_likerP   rJ   rQ   rR   T)	rc   grad_outputr   r   
grad_inputgrad_weightrT   rU   rV   s	            r   _grouped_mm_fallback_backwardrm      s    %ME6!%((J"6**KE CHOO--..  3C<<U3Y'*U3Y:OPPPPuSy!#[s%;QPPPP{D((r!   z!transformers::grouped_mm_fallback z4(Tensor input, Tensor weight, Tensor offs) -> Tensor)mutates_argsschema)setup_contextc                    t                      r|j        t          j        k    sW|j        j        dk    rIt          dd          r8|                                dz  dk    s|                                 dz  dk    rdS |j        j        dk    rt          t          j	        j
        d	          r(t          j                            |j                  d
k    S t          t          d          rat          dd          r(t          j                            |j                  d
k    S t          j                            |j                  dk    S dS t          t          j	        j
        d	          pt          t          d          S )a  
    Check if torch.nn.functional.grouped_mm or torch._grouped_mm can be used based on availability and compatibility with torch.compile.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `bool`: True if grouped_mm can be used, False otherwise.
    cpuz2.10.0T)
accept_dev   r   Fcuda
grouped_mm)   r   _grouped_mmz2.9)	   r   )r   r=   r   bfloat16rM   typer   data_ptrhasattrnn
functionalrv   get_device_capabilityr
   r_   s      r   _can_use_grouped_mmr     sT    	!"" v|u~'E'Ee##"8=== 	$ __#q((ENN,<,<r,AQ,F,F u
 }V##58&55 	M:33FMBBfLL5-(( 	Q(4@@@ Qz77FF&PPz77FF&PPu58&55V9V9VVr!   c                   t          | ||          rt          t          j        j        d          r?t          j        j                            |                     |j                  ||          S t          t          d          r/t          j        |                     |j                  ||          S t          j	        j
                            | ||          S )a  Grouped matrix multiplication dispatcher that uses torch.nn.functional.grouped_mm if available, else falls back to torch._grouped_mm.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    rw   rJ   ry   )r   r~   r   r   r   rw   r<   r=   ry   opstransformersgrouped_mm_fallbackr_   s      r   ry   ry   5  s    $ 5&$// P
 58&55 	P8&11%((6<2H2H&W[1\\\UM** 	P$UXXfl%;%;V$OOOO9!55eV$5OOOr!   c                    |rt          | ||          }n&t          | |                    dd          |          }||                    |           |S )a  Grouped linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim) if `is_transposed`,
            else of shape (num_experts, output_dim, input_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (num_experts, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    r   r   )ry   	transposer   )r   r   rJ   r   r   r   s         r   _grouped_linearr   T  sc    0  F%d333 %!1!1"b!9!9EEEJr!   c                   |j         }|                    d          }|                    d          }|                    d          }|                    d          }|                    d          }	t          j        |	          \  }
}|||z           }||         }|j        dv r|
                                n|
                                }t          j        || j	        d| j	        dz
            }t          j
        |dt          j                  }|
| j	        k                        d          }|
                    | j	        dz
             | j        r| j        }| j        r| j        |
         nd }n| j        }| j        r| j        |
         nd }|                    |d           t+          ||||| j        	          }| j        r|                     |          }n|                     |          }| j        }| j        r| j        |
         nd }t+          ||||| j        	          }||                    d          z  }|                    |d           t          j        |          }t          j        |                    d          |
          ||<   ||         }|                    |||                              d          }|                    |j                   S )Nr   r   )rs   mpsr   )binsminmax)r)   r=   )r   g        r*   )rM   r(   )!rM   r+   r-   r   sortr|   floatinthistcr/   cumsumr\   r   r.   r0   r1   r2   r3   r4   r5   masked_fill_r   r   r6   r7   r8   r9   
empty_likearanger:   r;   r<   r=   )r"   r$   r%   r&   rM   r>   r?   r@   rB   rC   expert_ids_gpermselected_hidden_states_gsample_weights_ghistc_inputtokens_per_expertoffsetssentinel_maskrD   rE   rF   rG   inv_permrH   s                           r   grouped_mm_experts_forwardr   z  s    !F  $$I##A&&J##B''J #**2..N$$R((J J//L$,TY->?%d+ +1+*G*G,$$&&&\M]M]M_M_KKd6FASWScfgSghhhl,!5;GGGG" "T%55@@DDMD,q0111 } U,BF-Y$0>>UY<=A]T$+L99PT ))-===  "2G/aeas  H
 } )##H-- ;;x(( ~;?=Rd),77dO "G/QUQc  H
 .88<<<L mS111 %%H\$))A,,v>>>HTN)L '++J	:NNRRWXRYY!!-"5666r!   c                  0     e Zd ZdZeeedZd	 fdZ xZ	S )
ExpertsInterfacez;Interface for registering custom experts forward functions.)
batched_mmrw   sonicmoeexperts_implementationstrdefaultr   r   c                    |t                               d           n|dk    r|| vrt          d| d          t                                          ||          S )zfReturn the requested `experts_implementation`. Also strictly check its validity, and raise if invalid.Na
  You tried to access the `ExpertsInterface` with a `config._experts_implementation` set to `None`. This is expected if you use an Expert Module as a standalone Module. If this is not the case, something went wrong with the dispatch of `config._experts_implementation`eager`zL` is not a valid experts implementation registered in the `ExpertsInterface`)loggerwarning_onceKeyErrorsuperget)r"   r   r   	__class__s      r   get_interfacezExpertsInterface.get_interface  s    !)N   
 $w..3IQU3U3Ux*xxx   ww{{17;;;r!   )r   r   r   r   r   r   )
__name__
__module____qualname____doc__rI   r   r   _global_mappingr   __classcell__)r   s   @r   r   r     sZ        EE 10, O< < < < < < < < < <r!   r   gate_up_outc                f    |                     dd          \  }}|                     |          |z  S )a  
    Default gating mechanism: splits the gate_up_out into gate and up parts,
    applies the activation function to the gate part, and multiplies it with the up part.
    Args:
        gate_up_out (`torch.Tensor`):
            The output tensor from the gate and up projection of shape (S, 2 * intermediate_dim).
    Returns:
        `torch.Tensor`: The gated output tensor of shape (S, intermediate_dim).
    r   r   r(   )chunkr7   )r"   r   gateups       r   _default_apply_gater     s7        ++HD";;tr!!r!   T)experts_interfaceis_concatenatedr   r2   r0   experts_classtype[torch.nn.Module] | Noner   r   r2   r0   type[torch.nn.Module]c               >    dfd}|  ||           S |S )a  Decorator to modify experts class to support different experts implementations.

    Args:
        experts_class (`type[torch.nn.Module]`, *optional*):
            The experts class to modify. If not provided, returns a decorator that can be applied to the class.
        experts_interface (`ExpertsInterface`, *optional*, defaults to `ALL_EXPERTS_FUNCTIONS`):
            The experts interface to use for dispatching the forward method.
        is_concatenated (`bool`, *optional*, defaults to `True`):
            Whether the expert weights are stored in concatenated layout [gate;up]
            or interleaved layout [gate0, up0, gate1, up1, ...].
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the expert weights are stored in transposed format.
        has_bias (`bool`, *optional*, defaults to `False`):
            Whether the expert layers include bias terms or not.
        has_gate (`bool`, *optional*, defaults to `True`):
            Whether the experts use a gating mechanism or not.
            Whether it has gate_up_proj weights or just up_proj weights.

    Returns:
        `type[torch.nn.Module]`: The modified experts class.
    r   r   r   c                    | j         | j        t                    	fd            }t                    fd            }t          | d          st          | _        || _         || _        | S )Nc                h     | |g|R i | || _         | _        | _        | _        | _        d S N)configr0   r2   r   r   )	r"   r   argskwargsr2   r0   r   r   original_inits	       r   __init__z=use_experts_implementation.<locals>.wrapper.<locals>.__init__.  sP    M$8888888 DK$DM$DM!.D#2D   r!   c                \                         | j        j                  } || g|R i |S r   )r   r   _experts_implementation)r"   r   r   experts_forwardr   original_forwards       r   forwardz<use_experts_implementation.<locals>.wrapper.<locals>.forward7  s>    /==dk>acsttO"?49$999&999r!   r6   )r   r   r   r~   r   r6   )
r   r   r   r   r   r   r2   r0   r   r   s
      @@r   wrapperz+use_experts_implementation.<locals>.wrapper*  s    %.(0	}			3 	3 	3 	3 	3 	3 	3 	3 
		3 
	 	 	: 	: 	: 	: 	: 
!	 	: }m44 	<(;M%!) 'r!   N)r   r   r   r   rn   )r   r   r   r   r2   r0   r   s    ````` r   use_experts_implementationr     sV    >         2  w}%%%Nr!   )NF)
r   r   r   r   r   r   r   r   r   r   )
r"   r#   r$   r   r%   r   r&   r   r   r   )r   r   r   r   rJ   r   r   r   )r   r   r   r   rJ   r   r   r   )r   r   r   r   rJ   r   r   r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   r   r   r   r2   r   r0   r   r   r   ))
__future__r   collections.abcr   	functoolsr   utilsr   utils.genericr   utils.import_utilsr	   r
   r   r   r   r   r   _dynamoassume_constant_result
get_loggerr   r   r    rI   rW   r`   re   rm   library	custom_opregister_fakeregister_autogradr   ry   r   r   r   ALL_EXPERTS_FUNCTIONSr   r   rn   r!   r   <module>r      s   # " " " " " $ $ $ $ $ $             , , , , , ,            / . . . . .  ZLLL
 !& D DE^ _ _"]AABXYY 
	H	%	%\ !%	    D<7 <7 <7 <7D   4^ ^ ^ ^  ) ) )&  	M+E	     
M CE^___	M##+%8 $   *W *W *W *WZP P P PF !%# # # # #Le7 e7 e7 e7P< < < < <' < < <0 )(** " " " " 37; +@ ; ; ; ; ; ; ; ;r!   