
    |-jp;                         d Z ddlZddlZddlmZ ddlmZ ddlmZ  ed          e G d	 d
e                                  Z	 ed          e G d de                                  Z
d
dgZdS )zSpeechT5 model configuration    N)strict   )PreTrainedConfig)auto_docstringzmicrosoft/speecht5_asr)
checkpointc                   <    e Zd ZU dZdZdddZdZeed<   dZ	eed	<   d
Z
eed<   d
Zeed<   dZeed<   dZeez  ed<   dZeed<   dZeed<   d
Zeed<   dZeez  ed<   dZeed<   dZeez  ed<   dZeez  ed<   dZeez  ed<   dZeez  ed<   dZeed<   dZeed<   dZeed<   d Zeed!<   d"Zeez  ed#<   dZeed$<   d%Z e!e         e"ed&f         z  ed'<   d(Z#e!e         e"ed&f         z  ed)<   d*Z$e!e         e"ed&f         z  ed+<   dZ%eed,<   d-Z&eed.<   d/Z'eed0<   d1Z(eed2<   d3Z)eez  ed4<   d5Z*eed6<   d7Z+eed8<   d"Z,eez  ed9<   d5Z-eed:<   d;Z.eed<<   d=Z/ed>z  ed?<   d;Z0ed>z  ed@<   d7Z1ee!e         z  d>z  edA<   d7Z2ed>z  edB<   dCZ3eedD<   d7Z4eedE<   dFZ5eedG<   dHZ6eez  edI<   dJZ7eedK<   dLZ8eedM<   dFZ9eedN<   dLZ:eedO<   dHZ;eez  edP<   d7Z<eedQ<   dRZ=eedS<   dTZ>eedU<   dVZ?eedW<   d1Z@eedX<   d7ZAeedY<   dZZBeed[<   d\ZCeed]<   d1ZDeed^<   d1ZEeed_<   d1ZFeed`<    fdaZGdb ZHdc ZI xZJS )dSpeechT5Configat  
    positional_dropout (`float`, *optional*, defaults to 0.1):
        The dropout probability for the text position encoding layers.
    feat_extract_norm (`str`, *optional*, defaults to `"group"`):
        The norm to be applied to 1D convolutional layers in the speech encoder pre-net. One of `"group"` for group
        normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
        convolutional layers.
    feat_proj_dropout (`float`, *optional*, defaults to 0.0):
        The dropout probability for output of the speech encoder pre-net.
    feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the 1D convolutional layers of the feature
        extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
    conv_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
        A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
        speech encoder pre-net. The length of *conv_dim* defines the number of 1D convolutional layers.
    conv_stride (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
        A tuple of integers defining the stride of each 1D convolutional layer in the speech encoder pre-net. The
        length of *conv_stride* defines the number of convolutional layers and has to match the length of
        *conv_dim*.
    conv_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
        A tuple of integers defining the kernel size of each 1D convolutional layer in the speech encoder pre-net.
        The length of *conv_kernel* defines the number of convolutional layers and has to match the length of
        *conv_dim*.
    conv_bias (`bool`, *optional*, defaults to `False`):
        Whether the 1D convolutional layers have a bias.
    num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
        Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
        embeddings layer.
    num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
        Number of groups of 1D convolutional positional embeddings layer.
    apply_spec_augment (`bool`, *optional*, defaults to `True`):
        Whether to apply *SpecAugment* data augmentation to the outputs of the speech encoder pre-net. For
        reference see [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
        Recognition](https://huggingface.co/papers/1904.08779).
    mask_time_prob (`float`, *optional*, defaults to 0.05):
        Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
        procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
        reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
        masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
        actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
    mask_time_length (`int`, *optional*, defaults to 10):
        Length of vector span along the time axis.
    mask_time_min_masks (`int`, *optional*, defaults to 2),:
        The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step,
        irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
        mask_time_min_masks''
    mask_feature_prob (`float`, *optional*, defaults to 0.0):
        Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
        masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
        the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
        span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
        may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
        True`.
    mask_feature_length (`int`, *optional*, defaults to 10):
        Length of vector span along the feature axis.
    mask_feature_min_masks (`int`, *optional*, defaults to 0),:
        The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
        step, irrespectively of `mask_feature_prob`. Only relevant if
        ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
    num_mel_bins (`int`, *optional*, defaults to 80):
        Number of mel features used per input features. Used by the speech decoder pre-net. Should correspond to
        the value used in the [`SpeechT5Processor`] class.
    speech_decoder_prenet_layers (`int`, *optional*, defaults to 2):
        Number of layers in the speech decoder pre-net.
    speech_decoder_prenet_units (`int`, *optional*, defaults to 256):
        Dimensionality of the layers in the speech decoder pre-net.
    speech_decoder_prenet_dropout (`float`, *optional*, defaults to 0.5):
        The dropout probability for the speech decoder pre-net layers.
    speaker_embedding_dim (`int`, *optional*, defaults to 512):
        Dimensionality of the *XVector* embedding vectors.
    speech_decoder_postnet_layers (`int`, *optional*, defaults to 5):
        Number of layers in the speech decoder post-net.
    speech_decoder_postnet_units (`int`, *optional*, defaults to 256):
        Dimensionality of the layers in the speech decoder post-net.
    speech_decoder_postnet_kernel (`int`, *optional*, defaults to 5):
        Number of convolutional filter channels in the speech decoder post-net.
    speech_decoder_postnet_dropout (`float`, *optional*, defaults to 0.5):
        The dropout probability for the speech decoder post-net layers.
    reduction_factor (`int`, *optional*, defaults to 2):
        Spectrogram length reduction factor for the speech decoder inputs.
    max_speech_positions (`int`, *optional*, defaults to 4000):
        The maximum sequence length of speech features that this model might ever be used with.
    max_text_positions (`int`, *optional*, defaults to 450):
        The maximum sequence length of text features that this model might ever be used with.
    encoder_max_relative_position (`int`, *optional*, defaults to 160):
        Maximum distance for relative position embedding in the encoder.
    use_guided_attention_loss (`bool`, *optional*, defaults to `True`):
        Whether to apply guided attention loss while training the TTS model.
    guided_attention_loss_num_heads (`int`, *optional*, defaults to 2):
        Number of attention heads the guided attention loss will be applied to. Use -1 to apply this loss to all
        attention heads.
    guided_attention_loss_sigma (`float`, *optional*, defaults to 0.4):
        Standard deviation for guided attention loss.
    guided_attention_loss_scale (`float`, *optional*, defaults to 10.0):
        Scaling coefficient for guided attention loss (also known as lambda).

    Example:

    ```python
    >>> from transformers import SpeechT5Model, SpeechT5Config

    >>> # Initializing a "microsoft/speecht5_asr" style configuration
    >>> configuration = SpeechT5Config()

    >>> # Initializing a model (with random weights) from the "microsoft/speecht5_asr" style configuration
    >>> model = SpeechT5Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```speecht5encoder_attention_headsencoder_layers)num_attention_headsnum_hidden_layersQ   
vocab_sizei   hidden_size   i   encoder_ffn_dim皙?encoder_layerdrop   decoder_layersdecoder_ffn_dimdecoder_attention_headsdecoder_layerdropgelu
hidden_actpositional_dropouthidden_dropoutattention_dropoutactivation_dropoutg{Gz?initializer_rangegh㈵>layer_norm_epsFscale_embeddinggroupfeat_extract_normg        feat_proj_dropoutfeat_extract_activation)   r(   r(   r(   r(   r(   r(   .conv_dim)      r+   r+   r+   r+   r+   conv_stride)
   r   r   r   r   r+   r+   conv_kernel	conv_bias   num_conv_pos_embeddings   num_conv_pos_embedding_groupsTapply_spec_augmentg?mask_time_probr-   mask_time_lengthr+   mask_time_min_masksmask_feature_probmask_feature_lengthr   mask_feature_min_masks   Npad_token_idbos_token_ideos_token_iddecoder_start_token_idP   num_mel_binsspeech_decoder_prenet_layers   speech_decoder_prenet_unitsg      ?speech_decoder_prenet_dropoutr(   speaker_embedding_dimr*   speech_decoder_postnet_layersspeech_decoder_postnet_unitsspeech_decoder_postnet_kernelspeech_decoder_postnet_dropoutreduction_factori  max_speech_positionsi  max_text_positions   encoder_max_relative_positionuse_guided_attention_lossguided_attention_loss_num_headsg?guided_attention_loss_sigmag      $@guided_attention_loss_scale	use_cacheis_encoder_decodertie_word_embeddingsc                 l    t          | j                  | _         t                      j        di | d S )N )lenr)   num_feat_extract_layerssuper__post_init__)selfkwargs	__class__s     s/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/models/speecht5/configuration_speecht5.pyr\   zSpeechT5Config.__post_init__   s8    '*4='9'9$'''''''    c           
      R   t          | j                  | j        k    s:t          | j                  | j        k    st          | j                  | j        k    rOt          dt          | j                   dt          | j                   dt          | j                   d          dS )zOPart of `@strict`-powered validation. Validates the architecture of the config.zConfiguration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) = z`, `len(config.conv_stride) = z`, `len(config.conv_kernel) = z`.N)rY   r,   rZ   r.   r)   
ValueErrorr]   s    r`   validate_architecturez$SpeechT5Config.validate_architecture   s     !""d&BBBD$%%)EEEDM""d&BBBI&&I IFI$JZF[F[I I 0343C/D/DI I I   CBra   c                 L    t          j        t          j        | j        d          S )Nr;   )	functoolsreduceoperatormulr,   rd   s    r`   inputs_to_logits_ratioz%SpeechT5Config.inputs_to_logits_ratio   s    d.>BBBra   )K__name__
__module____qualname____doc__
model_typeattribute_mapr   int__annotations__r   r   r   r   r   floatr   r   r   r   r   strr   r   r   r    r!   r"   r#   boolr%   r&   r'   r)   listtupler,   r.   r/   r1   r3   r4   r5   r6   r7   r8   r9   r:   r<   r=   r>   r?   rA   rB   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rO   rP   rQ   rR   rS   rT   rU   rV   r\   re   rk   __classcell__)r_   s   @r`   r	   r	      s        m m^ J,E\lmmMJKNC#%S%%%OS%(us{(((NCOS#%S%%%%(us{(((J&))))"%NECK%%%%(us{(((&))))#u### NE   !OT!!!$s$$$%(us{(((#)S))),OHd3i%S/)OOO/DKcU38_,DDD/EKcU38_,EEEIt#&S&&&)+!3+++####"&NECK&&&c    %(us{(((!!!!"#C### L#*    L#*   +,L#S	/D(,,,)*C$J***L#() #)))'****14!53;444!$3$$$)*!3***(+ #+++)*!3***25"ECK555c $#$$$!!!!),!3,,,&*t***+,#S,,,),,,,)----It#### $$$$( ( ( ( (  C C C C C C Cra   r	   c                      e Zd ZU dZdZdZeed<   dZeed<   dZ	eed<   d	Z
ee         eed
f         z  ed<   dZee         eed
f         z  ed<   dZee         eed
f         z  ed<   dZeez  ed<   dZeed<   dZeed<   dZeed<   dS )SpeechT5HifiGanConfiga  
    model_in_dim (`int`, *optional*, defaults to 80):
        The number of frequency bins in the input log-mel spectrogram.
    upsample_initial_channel (`int`, *optional*, defaults to 512):
        The number of input channels into the upsampling network.
    upsample_rates (`tuple[int]` or `list[int]`, *optional*, defaults to `[4, 4, 4, 4]`):
        A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The
        length of *upsample_rates* defines the number of convolutional layers and has to match the length of
        *upsample_kernel_sizes*.
    upsample_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[8, 8, 8, 8]`):
        A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The
        length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of
        *upsample_rates*.
    resblock_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[3, 7, 11]`):
        A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field
        fusion (MRF) module.
    resblock_dilation_sizes (`tuple[tuple[int]]` or `list[list[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
        A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
        multi-receptive field fusion (MRF) module.
    leaky_relu_slope (`float`, *optional*, defaults to 0.1):
        The angle of the negative slope used by the leaky ReLU activation.
    normalize_before (`bool`, *optional*, defaults to `True`):
        Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance.

    Example:

    ```python
    >>> from transformers import SpeechT5HifiGan, SpeechT5HifiGanConfig

    >>> # Initializing a "microsoft/speecht5_hifigan" style configuration
    >>> configuration = SpeechT5HifiGanConfig()

    >>> # Initializing a model (with random weights) from the "microsoft/speecht5_hifigan" style configuration
    >>> model = SpeechT5HifiGan(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```speecht5_hifiganr@   model_in_dimi>  sampling_rater(   upsample_initial_channel)   r   r   r   .upsample_rates)   r   r   r   upsample_kernel_sizes)r         resblock_kernel_sizes)r;   r   r*   r   r   resblock_dilation_sizesg{Gz?r!   r   leaky_relu_slopeTnormalize_beforeN)rl   rm   rn   ro   rp   r}   rr   rs   r~   r   r   rw   rx   r   r   r   r!   rt   r   r   rv   rX   ra   r`   r{   r{      s        % %N $JL#M3$'c'''2>NDIc3h/>>>9E49uS#X6EEE9C49uS#X6CCC,MTE\MMM#u###!e!!!!d!!!!!ra   r{   )ro   rg   ri   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r	   r{   __all__rX   ra   r`   <module>r      s   # "      . . . . . . 3 3 3 3 3 3 # # # # # # 3444AC AC AC AC AC% AC AC  54ACH 34443" 3" 3" 3" 3", 3" 3"  543"l 4
5ra   