
    |-j                    6   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,  e)j-        e.          Z/dZ0dej1        de2de2fdZ3	 ddej1        de2dej1        dz  fdZ4	 	 dde5e2e2f         de6de2dej7        dz  de2dej8        fd Z9 G d! d"e          Z: G d# d$e          Z; G d% d&e          Z< G d' d(ej=                  Z> G d) d*ej=                  Z? G d+ d,ej=                  Z@ G d- d.ej        j=                  ZA G d/ d0ej=                  ZB G d1 d2ej=                  ZC G d3 d4ej=                  ZD G d5 d6ej=                  ZE G d7 d8ej=                  ZF G d9 d:ej=                  ZG G d; d<ej=                  ZH G d= d>ej=        e%          ZI G d? d@ej=        e%          ZJ G dA dBej=        e%          ZK G dC dDej=                  ZL G dE dFej=                  ZM G dG dHe          ZN G dI dJe          ZOe( G dK dLe&                      ZP G dM dNeP          ZQ G dO dPeP          ZR G dQ dReP          ZS G dS dTeP          ZT G dU dVeP          ZU G dW dXeP          ZV G dY dZeP          ZW G d[ d\eP          ZX G d] d^ej=                  ZY G d_ d`ej=                  ZZ e(dab           G dc ddeP                      Z[ e(deb           G df dgePe                      Z\	 	 	 	 	 	 	 	 ddlePdej]        dmej]        dz  dej7        dz  dne6doe6dpe6dqej=        dz  dre^dse^dej]        e5ej]        ej]        f         z  fdtZ_ e(dub           G dv dweP                      Z` e(dxb           G dy dzeP                      Za G d{ d|ej=                  Zb e(d}b           G d~ de&                      Zcg dZddS )zPyTorch SpeechT5 model.    N)nn)BCEWithLogitsLossCrossEntropyLossL1Loss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)is_deepspeed_zero3_enabled)is_fsdp_managed_module)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSeq2SeqSpectrogramOutput)EmbeddingAccessMixinPreTrainedModel)auto_docstringlogging   )SpeechT5ConfigSpeechT5HifiGanConfig	input_idspad_token_iddecoder_start_token_idc                     |                      | j                  }| ddddf                                         |ddddf<   ||dddf<   |t          d          |                    |dk    |           |S )z1
    Shift input ids one token to the right.
    Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r    r!   shifted_input_idss       n/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/models/speecht5/modeling_speecht5.pyshift_tokens_rightr+   4   s     "++IO<<(CRC06688aaae4aaadLMMM""#4#<lKKK    input_valuesreduction_factorattention_maskc                    |dk    r&| dd|dz
  d|f         } ||dd|dz
  d|f         }|                      | j                  }| ddddf                                         |ddddf<   |                    |dk    d           ||fS )zw
    Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
    r   Nr#         Y        )r$   r%   r&   r(   )r-   r.   r/   shifted_input_valuess       r*   shift_spectrograms_rightr4   D   s     !#AAA'7!';'O?O'O$OP%+AAA/?!/C/WGW/W,WXN'11,2DEE".qqq#2#v"6"<"<">">ABB %%&:f&DcJJJ//r,   r%   	mask_probmask_length	min_masksreturnc                 @   | \  }dk     rt          d          k    rt          d d d          t          j                            d                                          fd}|9|                                                    d                                          nfd	t          |          D             }t          j	        |ft          
          }g }	 |          }
|
dk    r|S |D ]} ||          }t          j                            t          j        |dz
  z
            |d          }t          |          dk    rdz
  }n|d         }t          j        |t          j        |
|z
  t          j        
          |z  g          }|	                    |           t          j        |	          }	t          j        |	dddddf         ||
f          }	|	                    ||
z            }	t          j                  ddddf         }t          j        |||
f                              ||
z            }|	|z   }	|	                                dz
  k    rdz
  |	|	dz
  k    <   t          j        ||	dd           |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t          | z  z  z             }t          |          }|z  k    rz  }| dz
  z
  |k     rt          | dz
  z
  d          }|S )z;Given input length, compute how many spans should be maskedr   r   )intmax)input_lengthnum_masked_spanepsilonr6   r5   r7   sequence_lengths     r*   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span   s~    i,6DwNOOoy99 [(?::-<O ;?+o==!,+/"BAFFOr,   Nr#   c                     g | ]}S  rD   ).0_rA   s     r*   
<listcomp>z)_compute_mask_indices.<locals>.<listcomp>   s    999!o999r,   dtyper   F)replace)r'   nprandomranditemdetachsumtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaper=   put_along_axis)r%   r5   r6   r/   r7   
batch_sizerB   input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr>   r?   spec_aug_mask_idxdummy_mask_idxoffsetsr@   rA   s    `` `           @@r*   _compute_mask_indicesrh   Z   sP   0 #(JQABBB_$$:^i : :'6: : :
 
 	
 innQ$$&&G        $ % 	##B''..0009999uZ'8'8999  Hj/:$GGGM11/BBa% 5 511,?? I,,IlkAo677RW - 
 
  !!Q&& -q0NN.q1NN(;o(MUWU] ^ ^ ^ao op
 
 	!!"34444"455 111aaa:&5H+(V  ,33J@SVa@abb i$$T4]3Gog
4G'UVV^^'+5 G ,g5 /A"555GVYZGZ-!0CCD m%7B???r,   c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5NoLayerNormConvLayerr   c                 Z   t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        d S )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr	   feat_extract_activation
activationselfconfiglayer_id	__class__s      r*   rq   z%SpeechT5NoLayerNormConvLayer.__init__   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@r,   c                 Z    |                      |          }|                     |          }|S N)ry   r{   r}   hidden_statess     r*   forwardz$SpeechT5NoLayerNormConvLayer.forward   s*    		-0066r,   r   __name__
__module____qualname__rq   r   __classcell__r   s   @r*   rj   rj      sR        A A A A A A      r,   rj   c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5LayerNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          j        | j        d          | _        t          |j                 | _        d S )Nr   r   rl   T)elementwise_affine)rp   rq   rr   rs   rt   r   ru   rv   rw   rx   ry   	LayerNorm
layer_normr	   rz   r{   r|   s      r*   rq   z#SpeechT5LayerNormConvLayer.__init__   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 ,t'8TRRR !?@r,   c                     |                      |          }|                    dd          }|                     |          }|                    dd          }|                     |          }|S )Nr#   )ry   	transposer   r{   r   s     r*   r   z"SpeechT5LayerNormConvLayer.forward   se    		-00%//B7766%//B7766r,   r   r   r   s   @r*   r   r      sR        A A A A A A      r,   r   c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5GroupNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        t          j        | j        | j        d          | _        d S )Nr   r   rl   T)
num_groupsnum_channelsaffine)rp   rq   rr   rs   rt   r   ru   rv   rw   rx   ry   r	   rz   r{   	GroupNormr   r|   s      r*   rq   z#SpeechT5GroupNormConvLayer.__init__  s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@,$2CRVRclpqqqr,   c                     |                      |          }|                     |          }|                     |          }|S r   )ry   r   r{   r   s     r*   r   z"SpeechT5GroupNormConvLayer.forward  s;    		-006666r,   r   r   r   s   @r*   r   r     sR        r r r r r r       r,   r   c            	            e Zd ZdZddedededz  f fdZddedededz  fdZeddedededz  fd	            Z e	j
                    dde	j        defd            Z	 dde	j        dededz  fdZ xZS )%SpeechT5SinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsembedding_dimpadding_idxc                     t                                                       d| _        || _        || _        || _        |                     || j        z   ||           d S N   )rp   rq   offsetr   r   r   make_weights)r}   r   r   r   r   s       r*   rq   z.SpeechT5SinusoidalPositionalEmbedding.__init__   s]    **&-$+5}kRRRRRr,   num_embeddingsc                     |                      |||          }t          | d          r+|                    | j        j        | j        j                  }|                     d|d           d S )NweightsrI   deviceF
persistent)get_embeddinghasattrtor   rI   r   register_buffer)r}   r   r   r   emb_weightss        r*   r   z2SpeechT5SinusoidalPositionalEmbedding.make_weights(  sl    ((TT4## 	_%..t|/A$,J].^^KYFFFFFr,   c                    |dz  }t          j        d          |dz
  z  }t          j        t          j        |t          j                                                  | z            }t          j        | t          j                                                                      d          |                    d          z  }t          j        t          j	        |          t          j
        |          gd                              | d          }|dz  dk    r+t          j        |t          j        | d          gd          }|	d||ddf<   |                    t          j                              S )	z
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".
        r   i'  r   rH   r   dimr#   N)mathlogtorchexprV   int64float	unsqueezecatsincosviewrS   r   get_default_dtype)r   r   r   half_dimembs        r*   r   z3SpeechT5SinusoidalPositionalEmbedding.get_embedding0  s?    !A%huooA.iXU[AAAGGIISDPQQl>===CCEEOOPQRRUXUbUbcdUeUeei338a@@@EEnVXYY1!!)S%+na"@"@AqIIIC""#CQQQvve-//000r,   r   r   past_key_values_lengthc                    |                                 \  }}|                     || j        |                              |j                  }| j        dz   |z   }|| j                             d          k    r)|                     || j        z   | j        | j                   | j        	                    d|
                    d                    
                    ||d                                          S )Nr   r   r#   )size"create_position_ids_from_input_idsr   r   r   r   r   r   r   index_selectr   rO   )r}   r   r   bszseq_lenposition_idsmax_poss          r*   r   z-SpeechT5SinusoidalPositionalEmbedding.forwardB  s     ~~''W>>y$JZ\rssvv
 

 "Q&0T\&&q))))g3T5GIYZZZ|((L,=,=b,A,ABBGGWVXYY``bbbr,   c                     |                     |                                          }t          j        |d                              |          |z   |z  }|                                |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:
        Returns: torch.Tensor
        r   r   )ner<   r   cumsumtype_aslong)r}   r   r   r   maskincremental_indicess         r*   r   zHSpeechT5SinusoidalPositionalEmbedding.create_position_ids_from_input_idsQ  sg     ||K((,,..$|Da888@@FFI__cgg"''))K77r,   r   r   )r   r   r   __doc__r<   rq   r   staticmethodr   r   no_gradTensorr   r   r   r   s   @r*   r   r     sl       NNS Sc S# SCRVJ S S S S S SG G3 Gs GQTW[Q[ G G G G 1 1c 1# 1CRVJ 1 1 1 \1" U]__c c cs c c c _c _`8 88478QTW[Q[8 8 8 8 8 8 8 8r,   r   c                   $     e Zd Z fdZd Z xZS )SpeechT5PositionalConvEmbeddingc                    t                                                       t          j        |j        |j        |j        |j        dz  |j                  | _        t          j        j	        }t          t          j        j        d          rt          j        j        j	        }t                      rdd l}|j                            | j        j        d          5   || j        dd          | _        d d d            n# 1 swxY w Y   t          | j        d          r-| j        j        j        j        }| j        j        j        j        }n| j        j        }| j        j        }|j                            | |           |j                            | |           n || j        dd          | _        t-          |j                  | _        t0          |j                 | _        d S )	Nr   )rm   paddinggroupsweight_normr   )modifier_rankweight)namer   parametrizations)rp   rq   r   ru   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsry   utilsr   r   r   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterSpeechT5SamePadLayerr   r	   rz   r{   )r}   r~   r   r   r   r   r   s         r*   rq   z(SpeechT5PositionalConvEmbedding.__init__d  s   I62a77
 
 
	 h*28,m<< 	@(3?K%'' 	E22493CST2UU I I'K	aHHH	I I I I I I I I I I I I I I Ity"455 .95<F95<F9-9-N66tXFFFN66tXFFFF#DIH!DDDDI+F,JKK !?@s   C??DDc                     |                     dd          }|                     |          }|                     |          }|                     |          }|                     dd          }|S Nr   r   )r   ry   r   r{   r   s     r*   r   z'SpeechT5PositionalConvEmbedding.forward  se    %//155		-00]3366%//155r,   r   r   s   @r*   r   r   c  sM        A A A A AB      r,   r   c                   *     e Zd ZdZd fd	Zd Z xZS ) SpeechT5ScaledPositionalEncodingu[   
    Scaled positional encoding, see §3.2 in https://huggingface.co/papers/1809.08895
      c                 L   t          j        ||          }t          j        d|                              d          }t          j        t          j        d|dt           j                                                  t          j        d          |z   z            }t          j	        |                                |z            |d d dd df<   t          j
        |                                |z            |d d dd df<   |                    d          }t                                                       |                     d|d           t          j        |	          | _        || _        || _        t          j        t          j        d
                    | _        d S )Nr   r   r   rH        @peFr   p      ?)r   rS   rV   r   r   r   r   r   r   r   r   rp   rq   r   r   Dropoutdropoutr   max_len	Parametertensoralpha)r}   r   r   r   r   positiondiv_termr   s          r*   rq   z)SpeechT5ScaledPositionalEncoding.__init__  s_   [#&&<7++55a889U\!S!5;GGGMMOOTXT\]dTeTehkTkRllmmi 0 08 ;<<111add7i 0 08 ;<<111add7\\!__T2%888zG,,,\%,s"3"344


r,   c                     || j         | j        d d d |                    d          f         z  z   }|                     |          }|S )Nr   )r   r   r   r   )r}   r   s     r*   r   z(SpeechT5ScaledPositionalEncoding.forward  sG    DJMchhqkkM)9!:::ll3
r,   )r   )r   r   r   r   rq   r   r   r   s   @r*   r   r     sV         5 5 5 5 5 5      r,   r   c                   &     e Zd Zd fd	Zd Z xZS )"SpeechT5RelativePositionalEncoding  c                     t                                                       || _        || _        t          j                            d|z  |          | _        d S r   )rp   rq   r   
max_lengthr   r   	Embeddingpe_k)r}   r   r  r   s      r*   rq   z+SpeechT5RelativePositionalEncoding.__init__  sH    $H&&q:~s;;			r,   c                    |j         d         }t          j        d|                              |j        t          j                  }|d d d f         |d d d f         z
  }t          j        || j         k     | j         |          }t          j        || j        k    | j        dz
  |          }|| j        z   }|                     |          S )Nr   r   r   rI   )	r%   r   rV   r   r   r   wherer  r
  )r}   r   r   pos_seqs       r*   r   z*SpeechT5RelativePositionalEncoding.forward  s    %a(,q'**--]5IQVQ[-\\!!!T'"WT111W%55+g(884?:JGTT+g8$/A:MwWWDO+yy!!!r,   )r  r   r   s   @r*   r  r    sL        < < < < < <	" 	" 	" 	" 	" 	" 	"r,   r  c                   $     e Zd Z fdZd Z xZS )r   c                 l    t                                                       |dz  dk    rdnd| _        d S )Nr   r   r   )rp   rq   num_pad_remove)r}   r   r   s     r*   rq   zSpeechT5SamePadLayer.__init__  s:    #:Q#>!#C#Caar,   c                 J    | j         dk    r|d d d d d | j          f         }|S Nr   )r  r   s     r*   r   zSpeechT5SamePadLayer.forward  s;    "")!!!QQQ0F43F2F0F*FGMr,   r   r   s   @r*   r   r     sL        K K K K K      r,   r   c                   .     e Zd ZdZ fdZd Zd Z xZS )SpeechT5FeatureEncoderz.Construct the features from raw audio waveformc                    t                                                       j        dk    r7t          d          gfdt	          j        dz
            D             z   }nDj        dk    r!fdt	          j                  D             }nt          dj         d	          t          j        |          | _	        d
| _
        d| _        d S )Ngroupr   r   c                 8    g | ]}t          |d z             S )r   r  )rj   rE   ir~   s     r*   rG   z3SpeechT5FeatureEncoder.__init__.<locals>.<listcomp>  s>     N N NIJ,Va!eDDDN N Nr,   r   layerc                 2    g | ]}t          |           S )r  )r   r  s     r*   rG   z3SpeechT5FeatureEncoder.__init__.<locals>.<listcomp>  s4       CD*6A>>>  r,   z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)rp   rq   feat_extract_normr   rR   num_feat_extract_layersr'   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)r}   r~   r!  r   s    ` r*   rq   zSpeechT5FeatureEncoder.__init__  s   #w..5fqIIIJ N N N NNSTZTruvTvNwNwN N N KK %00   HMfNlHmHm  KK t1Ittt   =55&+#"r,   c                 P    |                                  D ]	}d|_        
d| _        d S )NF)
parametersrequires_gradr#  )r}   params     r*   _freeze_parametersz)SpeechT5FeatureEncoder._freeze_parameters  s4    __&& 	( 	(E"'E#r,   c                 r    |d d d f         }| j         r| j        rd|_        | j        D ]} ||          }|S NT)r#  trainingr&  r!  )r}   r-   r   
conv_layers       r*   r   zSpeechT5FeatureEncoder.forward  s[    $QQQW-  	/4= 	/*.M'* 	6 	6J&J}55MMr,   )r   r   r   r   rq   r(  r   r   r   s   @r*   r  r    s\        88# # # # #&$ $ $

 
 
 
 
 
 
r,   r  c                   $     e Zd Z fdZd Z xZS )SpeechT5FeatureProjectionc                 .   t                                                       t          j        |j        d         |j                  | _        t          j        |j        d         |j                  | _	        t          j
        |j                  | _        d S )Nr#   eps)rp   rq   r   r   rr   layer_norm_epsr   Linearr   
projectionr   feat_proj_dropoutr   r}   r~   r   s     r*   rq   z"SpeechT5FeatureProjection.__init__  sn    ,vr':@UVVV)FOB$79KLLz&":;;r,   c                     |                      |          }|                     |          }|                     |          }||fS r   )r   r4  r   )r}   r   norm_hidden_statess      r*   r   z!SpeechT5FeatureProjection.forward  sC    !__];;(:;;]33000r,   r   r   s   @r*   r.  r.    sG        < < < < <1 1 1 1 1 1 1r,   r.  c                        e Zd Z fdZd Z	 	 ddej        dej        dz  dej        dz  fdZ	de
dej        fd	Zd
ej        e
z  fdZ	 	 ddej        dej        dz  dej        dz  fdZ xZS )SpeechT5SpeechEncoderPrenetc                    t                                                       || _        t          |          | _        t          |          | _        |j        dk    s|j        dk    rBt          j
        t          j        |j                                                            | _        t!          |          | _        t%          |j        |j        z   dz   |j        |j                  | _        d S )Nr2   r   )rp   rq   r~   r  feature_encoderr.  feature_projectionmask_time_probmask_feature_probr   r   r   r   r   uniform_masked_spec_embedr   pos_conv_embedr   max_speech_positionsr    pos_sinusoidal_embedr6  s     r*   rq   z$SpeechT5SpeechEncoderPrenet.__init__  s    5f==";F"C"C  3&&&*BS*H*H%'\%,v?Q2R2R2[2[2]2]%^%^D"=fEE$I'&*==A%
 %
!!!r,   c                 8    | j                                          d S r   )r<  r(  r}   s    r*   freeze_feature_encoderz2SpeechT5SpeechEncoderPrenet.freeze_feature_encoder  s    //11111r,   Nr-   r/   mask_time_indicesc                 (   |                      |          }|                    dd          }|!|                     |j        d         |          }|                     |          \  }}|                     |||          }|                     |          }||z   }|(|                    d                                          }n3t          j
        |j        d d         t          j        |j                  }|                     |          }||z   }||fS )Nr   r   )rH  r/   r   )r<  r   "_get_feature_vector_attention_maskr%   r=  _mask_hidden_statesrB  r   r   r   rS   r   rD  )	r}   r-   r/   rH  extract_featuresr   positional_conv_embeddingpadding_mask positional_sinusoidal_embeddingss	            r*   r   z#SpeechT5SpeechEncoderPrenet.forward  s/     //==+55a;;%!DD &q) N
 +/*A*ABR*S*S''00->~ 1 
 
 %)$7$7$F$F!%(AA%),,Q//4466LL ;}':2A2'>ejYfYmnnnL+/+D+D\+R+R(%(HHn,,r,   feature_vector_lengthc                    |                     d          d d df         }|                     |                              t          j                  }|j        d         }t          j        ||f|j        |j                  }d|t          j	        |j        d         |j                  |dz
  f<   |
                    dg                               d          
                    dg                                          }|S )Nr#   r   r   r   r   r   )r    _get_feat_extract_output_lengthsr   r   r   r%   rS   rI   r   rV   fliprT   )r}   rP  r/   non_padded_lengthsoutput_lengthsr`   s         r*   rJ  z>SpeechT5SpeechEncoderPrenet._get_feature_vector_attention_mask9  s     ,22r2::111b5A>>?QRRUUV[V`aa#)!,
./~7KTbTi
 
 
 uv^%9!%<^EZ[[[]kno]opq',,bT2299"==BBB4HHMMOOr,   ra   c                 z    d }t          | j        j        | j        j                  D ]\  }} ||||          }|S )zH
        Computes the output length of the convolutional layers
        c                 <    t          j        | |z
  |d          dz   S )Nfloor)rounding_moder   )r   div)r>   rm   rn   s      r*   _conv_out_lengthzVSpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengths.<locals>._conv_out_lengthN  s&     9\K7wWWWZ[[[r,   )zipr~   rv   rw   )r}   ra   r\  rm   rn   s        r*   rS  z<SpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengthsI  s\    
	\ 	\ 	\
 $'t{'>@W#X#X 	Q 	QK,,]KPPMMr,   r   c                    t          | j        dd          s|S |                                \  }}}|#| j                            |j                  ||<   n| j        j        dk    r| j        r|t          ||f| j        j        | j        j	        || j        j
                  }t          j        ||j        t          j                  }| j                            |j                  ||<   | j        j        dk    r| j        rt          ||f| j        j        | j        j        | j        j                  }t          j        ||j        t          j                  }|dddf                             d|d          }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )r5   r6   r/   r7   r  )r5   r6   r7   r#   )getattrr~   r   rA  r   rI   r>  r+  rh   mask_time_lengthmask_time_min_masksr   r   r   rT   r?  mask_feature_lengthmask_feature_min_masksexpand)r}   r   rH  r/   r`   rA   r   mask_feature_indicess           r*   rK  z/SpeechT5SpeechEncoderPrenet._mask_hidden_statesY  s    t{$8$?? 	!   4A3E3E3G3G0
O[(/3/E/H/HI\/]/]M+,,['!+++ 5_-+4 K8-+9! ! ! !&->}G[chcm n n n/3/E/H/HI\/]/]M+,;(1,,,#8[)+7 K;+<	$ $ $  $)<0D]Mainis#t#t#t #74#@#G#GO]_#`#` 23M./r,   NN)r   r   r   rq   rG  r   r   
LongTensorFloatTensorr   r<   rJ  rS  rK  r   r   s   @r*   r:  r:    s1       
 
 
 
 
"2 2 2 376:	 -  -l - (4/ - !,t3	 -  -  -  -F ]b]m     e>NQT>T    & 7;26	, ,(, !,t3, (4/	, , , , , , , ,r,   r:  c                   R     e Zd Z fdZd Z	 ddej        dej        dz  fdZ xZS )SpeechT5SpeechDecoderPrenetc                    t                                                       | _        t          j        fdt          j                  D                       | _        t          j        j	        j
                  | _        t          j        j
        j                  | _        t          j        j        j
        z   j
                  | _        d S )Nc                 h    g | ].}t          j        |d k    rj        nj        j                  /S r   )r   r3  num_mel_binsspeech_decoder_prenet_unitsr  s     r*   rG   z8SpeechT5SpeechDecoderPrenet.__init__.<locals>.<listcomp>  sS       
 	 	+,66F''v7Y6   r,   )rp   rq   r~   r   r   rR   speech_decoder_prenet_layerslayersr3  ro  r   final_layerr   positional_dropoutrC  encode_positionsspeaker_embedding_dimspeaker_embeds_layerr6  s    `r*   rq   z$SpeechT5SpeechDecoderPrenet.__init__  s    m   
 vBCC  
 
 9V%GI[\\ @%'!
 !

 %'If.JVM_._agas$t$t!!!r,   c                     t          j        |d         |          }|                    d                              |                    d          dd          }t          j        |dk    |d          dz  d|z
  z  S )Nr   r   r   )r   	bernoullir   repeatr   r  )r}   inputs_embedsr   r   	all_maskss        r*   _consistent_dropoutz/SpeechT5SpeechDecoderPrenet._consistent_dropout  sr    }Q/1555NN1%%,,]-?-?-B-BAqII	{9>=!<<q@AEJJr,   Nr-   speaker_embeddingsc                 R   |}| j         D ]J}t          j                             ||                    }|                     || j        j                  }K|                     |          }|                     |          }|t          j        	                    |          }|
                    d                              d|                    d          d          }t          j        ||gd          }t          j                            |                     |                    }|S )Nr   r#   r   )rq  r   
functionalrelur|  r~   speech_decoder_prenet_dropoutrr  rt  	normalizer   re  r   r   r   rv  )r}   r-   r}  rz  r  s        r*   r   z#SpeechT5SpeechDecoderPrenet.forward  s    %[ 	o 	oEM..uu]/C/CDDM 44]DKDmnnMM((77--m<<)!#!8!89K!L!L!3!=!=a!@!@!G!GML^L^_`LaLace!f!f!I}6H&IrRRRMM..t/H/H/W/WXXMr,   r   )	r   r   r   rq   r|  r   r   r   r   r   s   @r*   rk  rk    s        u u u u u,K K K 37 l "L4/       r,   rk  c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5BatchNormConvLayerr   c                    t                                                       |dk    r|j        }n|j        }||j        dz
  k    r|j        }n|j        }t          j        |||j        d|j        dz
  dz  d          | _        t          j	        |          | _
        ||j        dz
  k     rt          j                    | _        nd | _        t          j        |j                  | _        d S )Nr   r   r   F)rm   rn   r   ro   )rp   rq   rn  speech_decoder_postnet_unitsspeech_decoder_postnet_layersr   ru   speech_decoder_postnet_kernelry   BatchNorm1d
batch_normTanhr{   r   speech_decoder_postnet_dropoutr   )r}   r~   r   rs   rt   r   s        r*   rq   z#SpeechT5BatchNormConvLayer.__init__  s    q== -KK =Kv;a???!.LL!>LI<9A=!C
 
 
	 .66f:Q>>> giiDOO"DOz&"GHHr,   c                     |                      |          }|                     |          }| j        |                     |          }|                     |          }|S r   )ry   r  r{   r   r   s     r*   r   z"SpeechT5BatchNormConvLayer.forward  sT    		-0066?& OOM::M]33r,   r   r   r   s   @r*   r  r    sR        I I I I I I<      r,   r  c                   J     e Zd Z fdZdej        fdZdej        fdZ xZS )SpeechT5SpeechDecoderPostnetc                 f   t                                                       | _        t          j        j        j        j        z            | _        t          j        j        j                  | _	        t          j
        fdt          j                  D                       | _        d S )Nc                 0    g | ]}t          |          S rD   )r  r  s     r*   rG   z9SpeechT5SpeechDecoderPostnet.__init__.<locals>.<listcomp>  s$    hhhq'22hhhr,   )rp   rq   r~   r   r3  r   rn  r.   feat_outprob_outr   rR   r  rq  r6  s    `r*   rq   z%SpeechT5SpeechDecoderPostnet.__init__  s    	&"4f6IFLc6cdd	&"4f6MNNmhhhhE&Bf<g<ghhh
 
r,   r   c                 <   |                      |                              |                    d          d| j        j                  }|                     |          }|                     |                              |                    d          d          }|||fS )Nr   r#   )r  r   r   r~   rn  postnetr  )r}   r   outputs_before_postnetoutputs_after_postnetlogitss        r*   r   z$SpeechT5SpeechDecoderPostnet.forward  s    !%}!=!=!B!B=CUCUVWCXCXZ\^b^i^v!w!w $-C D D}--22=3E3Ea3H3H"MM%'<fDDr,   c                     |                     dd          }| j        D ]} ||          }||                     dd          z   S r   )r   rq  )r}   r   layer_outputr  s       r*   r  z$SpeechT5SpeechDecoderPostnet.postnet  sT    $..q!44[ 	/ 	/E 5..LL|55a;;;;r,   )	r   r   r   rq   r   r   r   r  r   r   s   @r*   r  r    sw        	
 	
 	
 	
 	
EU\ E E E E<U\ < < < < < < < <r,   r  c                   4     e Zd Z fdZdej        fdZ xZS )SpeechT5TextEncoderPrenetc                     t                                                       || _        t          j        |j        |j        |j                  | _        t          |j
        |j        |j                  | _        d S r   )rp   rq   r~   r   r	  
vocab_sizer   r    embed_tokensr   rs  max_text_positionsrt  r6  s     r*   rq   z"SpeechT5TextEncoderPrenet.__init__  sj    L):F<NPVPcdd @%%!
 !
r,   r   c                 Z    |                      |          }|                     |          }|S r   )r  rt  )r}   r   rz  s      r*   r   z!SpeechT5TextEncoderPrenet.forward  s.    )))44--m<<r,   )r   r   r   rq   r   r   r   r   r   s   @r*   r  r    sU        
 
 
 
 
        r,   r  c                   X     e Zd Z fdZ	 	 ddej        dej        dz  dedz  fdZ xZ	S )SpeechT5TextDecoderPrenetc                    t                                                       || _        t          j        |j                  | _        |j        rt          j	        |j
                  nd| _        t          j        |j        |j
        |j                  | _        t!          |j        |j        z   dz   |j
        |j                  | _        d S )Nr   r   )rp   rq   r~   r   r   rs  r   scale_embeddingr   sqrtr   embed_scaler	  r  r    r  r   r  embed_positionsr6  s     r*   rq   z"SpeechT5TextDecoderPrenet.__init__  s    z&";<<<B<R[49V%7888X[L):F<NPVPcddD%(;;a? 
  
r,   Nr   r/   past_key_valuesc                 X   |1|                                 }|                    d|d                   }nt          d          |dn|                                }|                     ||          }|                     |          | j        z  }||z  }|                     |          }||fS )Nr#   z'You have to specify `decoder_input_ids`r   )r   r   r'   get_seq_lengthr  r  r  r   )r}   r   r/   r  input_shaper   	positionsrz  s           r*   r   z!SpeechT5TextDecoderPrenet.forward  s      #..**K!r;r?;;IIFGGG&5&=?CaCaCcCc((4JKK	)))44t7GG"]33n,,r,   rg  )
r   r   r   rq   r   r   rh  r
   r   r   r   s   @r*   r  r    s        
 
 
 
 
" 37(,	- -<- (4/- 	- - - - - - - -r,   r  c                   @     e Zd Z fdZdej        fdZd Zd Z xZ	S )SpeechT5TextDecoderPostnetc                     t                                                       || _        t          j        |j        |j        d          | _        d S )NFro   )rp   rq   r~   r   r3  r   r  lm_headr6  s     r*   rq   z#SpeechT5TextDecoderPostnet.__init__4  sB    y!3V5FUSSSr,   r   c                 ,    |                      |          S r   r  r   s     r*   r   z"SpeechT5TextDecoderPostnet.forward9  s    ||M***r,   c                     | j         S r   r  rF  s    r*   get_output_embeddingsz0SpeechT5TextDecoderPostnet.get_output_embeddings<  s     |r,   c                     || _         d S r   r  r}   new_embeddingss     r*   set_output_embeddingsz0SpeechT5TextDecoderPostnet.set_output_embeddingsA  s    %r,   )
r   r   r   rq   r   r   r   r  r  r   r   s   @r*   r  r  3  sx        T T T T T
+U\ + + + +  
& & & & & & &r,   r  c                        e Zd ZdZ	 	 	 	 ddedededz  d	edz  d
edz  dedz  f fdZ	 	 	 	 	 ddej	        dej	        dz  de
dz  dej	        dz  dej	        dz  dedeej	        ej	        dz  e
dz  f         fdZ xZS )SpeechT5Attentionz
    Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
    https://aclanthology.org/N18-2074.pdf)
    r2   FTN	embed_dim	num_headsr   
is_decoderro   	layer_idxc                    t                                                       || _        || _        || _        ||z  | _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _        || _	        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r  )rp   rq   r  r  r   head_dimr'   scalingr  r  r   r3  k_projv_projq_projout_proj)r}   r  r  r   r  ro   r  r   s          r*   rq   zSpeechT5Attention.__init__K  s	    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBBr,   r   key_value_statesr  r/   position_biasoutput_attentionsr8   c                 
   |du}|                                 \  }	}
}|                     |          | j        z  }d}|Ht          |t                    r1|j                            | j                  }|r|j        }n
|j	        }n|}|r|n|}|r3|1|r/|j
        | j                 j        }|j
        | j                 j        }n|                     |          }|                     |          }|                    |	d| j        | j                                      dd          }|                    |	d| j        | j                                      dd          }|E|                    ||| j                  \  }}|r$t          |t                    rd|j        | j        <   |	| j        z  d| j        f}|                    |	|
| j        | j                                      dd          } |j        | } |j        | } |j        | }|                     d          }t+          j        ||                    dd                    }|                                 |	| j        z  |
|fk    r2t/          d|	| j        z  |
|f d|                                            ||                                                    |	| j        z  d| j                                      d	d          }t+          j        ||                    d
d                    }|                    d	d                              |	| j        z  |                     d	          |                     d                    }||z  }||                                 |	d|
|fk    r+t/          d|	d|
|f d|                                            |                    |	| j        |
|          |z   }|                    |	| j        z  |
|          }t4          j                            |d          }|r=|                    |	| j        |
|          }|                    |	| j        z  |
|          }nd}t4          j                            || j        | j                  }t+          j        ||          }|                                 |	| j        z  |
| j        fk    r5t/          d|	| j        |
| j        f d|                                            |                    |	| j        |
| j                  }|                    dd          }|                    |	|
| j                  }|                      |          }||fS )z#Input shape: Batch x Time x ChannelNFr#   r   r   Tz$Attention weights should be of size z	, but is r   r   z!Attention mask should be of size r   )r   r+  z `attn_output` should be of size )!r   r  r  
isinstancer   
is_updatedgetr  cross_attention_cacheself_attention_cacherq  keysvaluesr  r  r   r  r  r   updater^   r   bmmr'   
contiguousmatmulr   r  softmaxr   r+  r  r  )r}   r   r  r  r/   r  r  kwargsis_cross_attentionr   tgt_lenrF   query_statesr  curr_past_key_valuescurrent_states
key_statesvalue_states
proj_shapesrc_lenattn_weights	reshape_qrel_pos_biasattn_weights_reshaped
attn_probsattn_outputs                             r*   r   zSpeechT5Attention.forwardh  s    .T9',,..Wa {{=11DL@
&/+>?? 7,7;;DNKK
% P+:+P((+:+O(('6$-?R))] 	F/"=*"=-4T^DIJ/6t~FMLL^44J;;~66L#b$.$-PPZZ[\^_``J',,S"dndmTT^^_`bcddL*+?+F+FzS_aeao+p+p(
L% F*_FY*Z*Z FAEO.t~>DN*B>
#((gt~t}UU__`acdee+|+Z8'Z'4
+|+Z8//!$$yz/C/CAq/I/IJJ3#7'"JJJ*dn8LgW^7_ * * %%''* *   $$//1166sT^7KRQUQ^__iijkmnooI <	=3J3J2r3R3RSSL'11!Q77<<dn$m&8&8&;&;]=O=OPQ=R=R L L(L%""$$a'(BBB ta'8Rtt]k]p]p]r]rtt   (,,S$.'7SSVddL',,S4>-A7GTTL},,\r,BB 	)
 %1$5$5c4>7T[$\$\!055cDN6JGU\]]LL$(!]**<4<RVR_*``
i
L99#"6!OOO)CRVR_3` ) )$$&&) )  
 "&&sDNGT]SS!++Aq11 "))#wGGmmK00111r,   )r2   FTN)NNNNF)r   r   r   r   r<   r   rT   rq   r   r   r
   tupler   r   r   s   @r*   r  r  E  sd         !$"' !%C CC C 	C
 4KC TkC $;C C C C C C@ 15(,.2-1"'r2 r2|r2  ,-r2 	r2
 t+r2 |d*r2  r2 
u|U\D0%$,>	?r2 r2 r2 r2 r2 r2 r2 r2r,   r  c                   $     e Zd Z fdZd Z xZS )SpeechT5FeedForwardc                    t                                                       t          j        |j                  | _        t          j        |j        |          | _        t          |j
        t                    rt          |j
                 | _        n|j
        | _        t          j        ||j                  | _        t          j        |j                  | _        d S r   )rp   rq   r   r   activation_dropoutintermediate_dropoutr3  r   intermediate_denser  
hidden_actstrr	   intermediate_act_fnoutput_densehidden_dropoutoutput_dropout)r}   r~   intermediate_sizer   s      r*   rq   zSpeechT5FeedForward.__init__  s    $&Jv/H$I$I!"$)F,>@Q"R"Rf'-- 	9'-f.?'@D$$'-'8D$I&79KLL j)>??r,   c                     |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|S r   )r  r  r  r  r  r   s     r*   r   zSpeechT5FeedForward.forward  sg    //>>00??11-@@))-88++M::r,   r   r   s   @r*   r  r    sL        @ @ @ @ @      r,   r  c            	       n     e Zd Zdef fdZ	 	 	 d
dej        dej        dz  dej        dz  defd	Z xZ	S )SpeechT5EncoderLayerr~   c                    t                                                       t          |j        |j        |j        d          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        t          ||j                  | _        t          j        |j        |j                  | _        d S )NF)r  r  r   r  r0  )rp   rq   r  r   encoder_attention_headsattention_dropout	attentionr   r   r  r   r   r2  r   r  encoder_ffn_dimfeed_forwardfinal_layer_normr6  s     r*   rq   zSpeechT5EncoderLayer.__init__  s    *(4,	
 
 
 z&"788,v'9v?TUUU/8NOO "V-?VEZ [ [ [r,   NFr   r/   r  r  c                    |}|                      ||||          \  }}|                     |          }||z   }|                     |          }||                     |          z   }|                     |          }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`):
                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
                large negative values.
            position_bias (`torch.FloatTensor`):
                relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r/   r  r  )r  r   r   r  r  )r}   r   r/   r  r  residualr  outputss           r*   r   zSpeechT5EncoderLayer.forward  s    ( !&*nn')'/	 '5 '
 '
#| ]33 =066%(9(9-(H(HH--m<< " 	'&Gr,   )NNF)
r   r   r   r   rq   r   r   rT   r   r   r   s   @r*   r  r    s        \~ \ \ \ \ \ \  /3-1"'( (|( t+( |d*	(
  ( ( ( ( ( ( ( (r,   r  c                        e Zd Zddef fdZ	 	 	 	 	 	 ddej        dej        dz  dej        dz  d	ej        dz  d
edz  dedz  dedz  fdZ	 xZ
S )SpeechT5DecoderLayerNr~   c                 6   t                                                       t          |j        |j        |j        d|          | _        t          j        |j	                  | _
        t          j        |j        |j                  | _        t          |j        |j        |j        d|          | _        t          j        |j        |j                  | _        t!          ||j                  | _        t          j        |j        |j                  | _        d S )NT)r  r  r   r  r  r0  )r   r  r  )rp   rq   r  r   decoder_attention_headsr  	self_attnr   r   r  r   r   r2  self_attn_layer_normencoder_attnencoder_attn_layer_normr  decoder_ffn_dimr  r  )r}   r~   r  r   s      r*   rq   zSpeechT5DecoderLayer.__init__/  s    *(4,
 
 
 z&"788$&L1CI^$_$_$_!-*,
 
 
 (*|F4FFLa'b'b'b$/8NOO "V-?VEZ [ [ [r,   FTr   r/   encoder_hidden_statesencoder_attention_maskr  r  	use_cachec                    |}	|                      ||||          \  }}
|                     |          }|	|z   }|                     |          }d}|N|}	|                     |||||          \  }}|                     |          }|	|z   }|                     |          }||                     |          z   }|                     |          }|f}|r||
|fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r  r/   r  N)r   r  r/   r  r  )r  r   r  r	  r
  r  r  )r}   r   r/   r  r  r  r  r  r  r  self_attn_weightscross_attn_weightsr  s                r*   r   zSpeechT5DecoderLayer.forwardG  s+   2 ! ,0>>'+)/	 ,: ,
 ,
(( ]33 =011-@@ " ,$H040A0A+!65 /"3 1B 1 1-M- !LL77M$}4M 88GGM &(9(9-(H(HH--m<< " 	?)+=>>Gr,   r   )NNNNFT)r   r   r   r   rq   r   r   r
   rT   r   r   r   s   @r*   r  r  .  s        \ \~ \ \ \ \ \ \6 /3596:(,).!%? ?|? t+?  %|d2	?
 !&t 3? ?  $;? $;? ? ? ? ? ? ? ?r,   r  c                   j    e Zd ZU eed<   dZdZdZdZ e	j
                    dej        fd            ZdS )	SpeechT5PreTrainedModelr~   speecht5r-   audioTmodulec           
      
   | j         j        }t          |t                    rwt	          j        |j        j        ddt          j	        d|j        j
        d         |j        j        z  z            z             t	          j        |j        j        d           nt          |t                    rZt	          j        |j                   |j        |j        }}t'          j        ||          }t'          j        d|                              d          }t'          j        t'          j        d|dt&          j                                                  t          j        d          |z   z            }t'          j        |                                |z            |dddddf<   t'          j        |                                |z            |dddddf<   |                    d          }t	          j        |j        |           n:t          |t>                    rgt          j	        d|j         j!        z            }t	          j"        |j         j        | |           t	          j"        |j         j        | |           nt          |tF          j$                  r>t	          j        |j        d	|           |j        t	          j%        |j                   nft          |tF          j&        tF          j'        tF          j(        f          rt	          j%        |j                   t	          j        |j                   tS          |d
d          Kt	          j%        |j*                   t	          j        |j+                   t	          j%        |j,                   nt          |tF          j-                  rnt	          j.        |j                   |j        Lt          j	        |j/        |j        |j
        d         z  z            }t	          j"        |j        | |           nt          |tF          j0                  r^t	          j        |j        d	|           |j1        :tS          |j        dd          s$t	          j%        |j        |j1                            nt          |td                    rI|3                    |j4        |j5        z   |j6        |j1                  }	t	          j        |j7        |	           nGt          |tp                    r2t	          j%        |j9                   t	          j        |j:                   tw          |d          rt	          j"        |j<                   dS dS )zInitialize the weightsr   r   r   )meanstdrH   r   N)abr2   running_mean_is_hf_initializedFrA  )=r~   initializer_ranger  r   initnormal_ry   r   r   r  rm   in_channels	constant_ro   r   ones_r   r   r   r   rS   rV   r   r   r   r   r   r   r   copy_r   r.  r4  in_featuresr@  r   r3  zeros_r   r   r  r`  r  running_varnum_batches_trackedru   kaiming_normal_r   r	  r   r   r   r   r   r   r   SpeechT5HifiGanr  scaler   rA  )
r}   r  r  r   r   r   r  r  kr   s
             r*   _init_weightsz%SpeechT5PreTrainedModel._init_weights  s    k+f=>> 1	%L"	!v{'>q'AFKD['["\]]]   
 N6;+Q//// @AA *	%Jv|$$$!:v~CWc**B|Aw//99!<<Hyaau{!K!K!K!Q!Q!S!SX\X`ahXiXiloXoVp!pqqH)HNN$4$4x$?@@Bqqq!$Q$wK)HNN$4$4x$?@@Bqqq!$Q$wKaBJvy"%%%% 9::  	%	!f/;;<<AM&+2qbA>>>>M&+0QB!<<<<<	** 	%LSc::::{&FK(((r|R^ LMM 	%K$$$Jv}%%%v~t44@F/000
6-...F6777	** 	% ///{&Ifmv/AFDVWXDY/YZ[[fkaR15555-- 	%LSc::::!-gfmMach6i6i-FM&*<=>>> EFF 	% ..$v}4f6JFL^ K Jv~{333300 	%K$$$Jv|$$$6.// 	4M&233333	4 	4r,   N)r   r   r   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointingr   r   r   Moduler-  rD   r,   r*   r  r    sh         "$O&*#U]__74BI 74 74 74 _74 74 74r,   r  c                        e Zd ZdZdef fdZ	 	 	 	 ddej        dej        dz  de	dz  de	dz  d	e	dz  d
e
ez  fdZ xZS )SpeechT5Encoderzu
    Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`].
    r~   c                    t                                                     t          j        j        j                  | _        t          j        j                  | _	        j
        | _        t          j        fdt          j                  D                       | _        t!          j        j        z  j                  | _        d| _        |                                  d S )Nr0  c                 .    g | ]}t                    S rD   )r  )rE   rF   r~   s     r*   rG   z,SpeechT5Encoder.__init__.<locals>.<listcomp>  s"    $h$h$ha%9&%A%A$h$h$hr,   F)rp   rq   r   r   r   r2  r   r   r  r   encoder_layerdrop	layerdropr   rR   encoder_layersrq  r  r  encoder_max_relative_positionr  r"  	post_initr6  s    `r*   rq   zSpeechT5Encoder.__init__  s       ,v'9v?TUUUz&"7881m$h$h$h$h5QWQfKgKg$h$h$hiiA&"@@&Bf 
  
 ',# 	r,   Nr   r/   r  output_hidden_statesreturn_dictr8   c                    ||n| j         j        }||n| j         j        }||n| j         j        }t	          | j         ||          }|                     |          }|                     |          }|                     |          }t                      pt          |           }|rdnd}	|rdnd}
t          | j                  D ]b\  }}|r|	|fz   }	d}| j        rt          j        g           }|| j        k     }|r|r |||||          }|d         }|rd}|r|
|d         fz   }
c|r|	|fz   }	|st!          d	 ||	|
fD                       S t#          ||	|

          S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the encoder prenet.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        N)r~   rz  r/   rD   F)r/   r  r  r   rg  r   c              3      K   | ]}||V  	d S r   rD   rE   vs     r*   	<genexpr>z*SpeechT5Encoder.forward.<locals>.<genexpr>4  s(      mmq_`_l_l_l_l_lmmr,   last_hidden_stater   
attentions)r~   r  r=  r>  r   r   r   r  r   r   	enumeraterq  r+  r   rM   r9  r  r   )r}   r   r/   r  r=  r>  r  r  synced_gpusall_hidden_statesall_self_attentionsidxencoder_layerskip_the_layerdropout_probabilitylayer_outputss                   r*   r   zSpeechT5Encoder.forward  s   < 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+BY2;')
 
 
 66]33,,];;022R6LT6R6R"6@BBD$5?bb4"+DK"8"8 	P 	PC# I$58H$H! #N} F&+jnn#!4t~!E! 1[ 1 -!#1"/&7	! ! ! !.a 0 - ,  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r,   NNNNr   r   r   r   r   rq   r   ri  r   rT   r  r   r   r   r   s   @r*   r5  r5    s         ~      ( /3)-,0#'X
 X
(X
 t+X
  $;	X

 #TkX
 D[X
 
	 X
 X
 X
 X
 X
 X
 X
 X
r,   r5  c                        e Zd ZdZdef fdZ	 	 	 	 ddej        dej        dz  de	dz  de	dz  d	e	dz  d
e
ez  fdZ xZS )SpeechT5EncoderWithSpeechPrenetz
    Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
    hidden features.
    r~   c                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r   )rp   rq   r:  prenetr5  wrapped_encoderr<  r6  s     r*   rq   z(SpeechT5EncoderWithSpeechPrenet.__init__C  R       1&99.v66 	r,   Nr-   r/   r  r=  r>  r8   c                 l    |                      ||          \  }}|                     |||||          }|S N)r   r/   r  r=  r>  rU  rV  	r}   r-   r/   r  r=  r>  r  r   r  s	            r*   r   z'SpeechT5EncoderWithSpeechPrenet.forwardK  sL     )-L.(Q(Q%~&&')/!5# ' 
 
 r,   rP  rQ  r   s   @r*   rS  rS  =  s         
~       /3)-,0#' ' t+  $;	
 #Tk D[ 
	        r,   rS  c                        e Zd ZdZdef fdZd Zd Z	 	 	 	 ddej	        dej
        dz  d	edz  d
edz  dedz  deez  fdZ xZS )SpeechT5EncoderWithTextPrenetz|
    Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
    r~   c                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r   )rp   rq   r  rU  r5  rV  r<  r6  s     r*   rq   z&SpeechT5EncoderWithTextPrenet.__init__f  R       /77.v66 	r,   c                 4    | j                                         S r   rU  get_input_embeddingsrF  s    r*   rb  z2SpeechT5EncoderWithTextPrenet.get_input_embeddingsn      {//111r,   c                 :    | j                             |           d S r   rU  set_input_embeddingsr}   values     r*   rf  z2SpeechT5EncoderWithTextPrenet.set_input_embeddingsq      ((/////r,   Nr-   r/   r  r=  r>  r8   c                 d    |                      |          }|                     |||||          }|S rY  rZ  r[  s	            r*   r   z%SpeechT5EncoderWithTextPrenet.forwardt  sE     L11&&')/!5# ' 
 
 r,   rP  )r   r   r   r   r   rq   rb  rf  r   ri  r   rT   r  r   r   r   r   s   @r*   r]  r]  a  s         ~      2 2 20 0 0 /3)-,0#' ' t+  $;	
 #Tk D[ 
	        r,   r]  c                        e Zd ZdZdef fdZ	 	 	 	 ddej        dej        dz  de	dz  de	dz  d	e	dz  d
e
ez  fdZ xZS )SpeechT5EncoderWithoutPrenet
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    r~   c                     t                                          |           t          |          | _        |                                  d S r   )rp   rq   r5  rV  r<  r6  s     r*   rq   z%SpeechT5EncoderWithoutPrenet.__init__  C       .v66 	r,   Nr-   r/   r  r=  r>  r8   c                 6    |                      |||||          S rY  )rV  )r}   r-   r/   r  r=  r>  r  s          r*   r   z$SpeechT5EncoderWithoutPrenet.forward  s0     ##&)/!5# $ 
 
 	
r,   rP  rQ  r   s   @r*   rl  rl    s         
~       /3)-,0#'
 
'
 t+
  $;	

 #Tk
 D[
 
	 
 
 
 
 
 
 
 
r,   rl  c                        e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 ddej        dz  dej        dz  dej        dz  dej        dz  d	e	dz  d
e
dz  de
dz  de
dz  de
dz  deez  fdZ xZS )SpeechT5Decoderzt
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`]
    r~   c                    t                                                     j        | _        t	          j        fdt          j                  D                       | _        d| _	        | 
                                 d S )Nc                 2    g | ]}t          |           S ))r  )r  r  s     r*   rG   z,SpeechT5Decoder.__init__.<locals>.<listcomp>  s(    $u$u$uST%9&A%N%N%N$u$u$ur,   F)rp   rq   decoder_layerdropr9  r   r   rR   decoder_layersrq  r"  r<  r6  s    `r*   rq   zSpeechT5Decoder.__init__  sy       1m$u$u$u$uX]^d^sXtXt$u$u$uvv&+# 	r,   Nr   r/   r  r  r  r  r  r=  r>  r8   c
           
         ||n| j         j        }||n| j         j        }||n| j         j        }|	|	n| j         j        }	| j        r%| j        r|rt                              d           d}|r8|6t          t          | j                   t          | j                             }t          | j         |||          }||t          | j         |||          }t                      pt          |           }|rdnd}|rdnd}|r|dnd}t          | j                  D ]p\  }}|r||fz   }d}| j        rt#          j        g           }|| j        k     }|r|s: ||||||||          }|d	         }|r||d
         fz   }|||d         fz   }q|r||fz   }|	st)          d |||||fD                       S t+          |||||          S )aQ  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the decoder prenet.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r~   )r~   rz  r/   r  )r~   rz  r/   r  rD   )r  r  r  r  r   r   r   c              3      K   | ]}||V  	d S r   rD   rA  s     r*   rC  z*SpeechT5Decoder.forward.<locals>.<genexpr>:  s0        =  === r,   )rE  r  r   rF  cross_attentions)r~   r  r=  r  r>  r"  r+  loggerwarning_oncer   r   r   r   r   r   rG  rq  r   rM   r9  r  r   )r}   r   r/   r  r  r  r  r  r=  r>  r  rH  rI  rJ  all_cross_attentionsrK  decoder_layerrM  rN  rO  s                       r*   r   zSpeechT5Decoder.forward  s   r 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+BY& 	"4= 	" "##p   "	 	v01,dk2R2R2RT`hlhsTtTtTtuuO+;')+	
 
 
 !,1G1S%>{+5&;	& & &" 122R6LT6R6R #7@BBD$5?bb4&7h<Q<]rrdh"+DK"8"8 	V 	VC# I$58H$H! #N} F&+jnn#!4t~!E k )M%'= /"3#  M *!,M  V&9]1=M<O&O#(4+?=QRCSBU+U( 	E 1]4D D 	  ':KM`bvw      9+++*1
 
 
 	
r,   	NNNNNNNNNr   r   r   r   r   rq   r   ri  rh  r
   rT   r  r   r   r   r   s   @r*   rr  rr    s3        	~ 	 	 	 	 	 	 3726:>:>(,!%)-,0#'M
 M
(4/M
 (4/M
  %047	M

 !& 04 7M
 M
 $;M
  $;M
 #TkM
 D[M
 
:	:M
 M
 M
 M
 M
 M
 M
 M
r,   rr  c                        e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 ddej        dz  dej        dz  dej        dz  dej        dz  d	ej	        dz  d
e
dz  dedz  dedz  dedz  dedz  deez  fdZ xZS )SpeechT5DecoderWithSpeechPrenetz
    Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
    features.
    r~   c                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r   )rp   rq   rk  rU  rr  wrapped_decoderr<  r6  s     r*   rq   z(SpeechT5DecoderWithSpeechPrenet.__init__O  rW  r,   Nr-   r/   r  r  r}  r  r  r  r=  r>  r8   c                 n    |                      ||          }|                     ||||||||	|
	  	        }|S N)	r   r/   r  r  r  r  r  r=  r>  rU  r  )r}   r-   r/   r  r  r}  r  r  r  r=  r>  r  decoder_hidden_statesr  s                 r*   r   z'SpeechT5DecoderWithSpeechPrenet.forwardW  sU     !%L:L M M&&/)"7#9+/!5# ' 

 

 r,   )
NNNNNNNNNN)r   r   r   r   r   rq   r   ri  rh  r   r
   rT   r  r   r   r   r   s   @r*   r  r  I  s4        
~       2626:>:>26(,!%)-,0#' '$. (4/  %047	
 !& 04 7 "L4/  $;  $; #Tk D[ 
:	:       r,   r  c                        e Zd ZdZdef fdZd Zd Z	 	 	 	 	 	 	 	 	 ddej	        dz  dej
        dz  d	ej	        dz  d
ej
        dz  dedz  dedz  dedz  dedz  dedz  deez  fdZ xZS )SpeechT5DecoderWithTextPrenetz{
    Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
    r~   c                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r   )rp   rq   r  rU  rr  r  r<  r6  s     r*   rq   z&SpeechT5DecoderWithTextPrenet.__init__{  r_  r,   c                 4    | j                                         S r   ra  rF  s    r*   rb  z2SpeechT5DecoderWithTextPrenet.get_input_embeddings  rc  r,   c                 :    | j                             |           d S r   re  rg  s     r*   rf  z2SpeechT5DecoderWithTextPrenet.set_input_embeddings  ri  r,   Nr-   r/   r  r  r  r  r  r=  r>  r8   c
                 v    |                      |||          \  }}|                     |||||||||		  	        }|S r  r  )r}   r-   r/   r  r  r  r  r  r=  r>  r  r  r  s                r*   r   z%SpeechT5DecoderWithTextPrenet.forward  s\     15L.Zi0j0j-~&&/)"7#9+/!5# ' 

 

 r,   r~  )r   r   r   r   r   rq   rb  rf  r   ri  rh  r
   rT   r  r   r   r   r   s   @r*   r  r  v  s>        ~      2 2 20 0 0
 2626:>:>(,!%)-,0#' '$. (4/  %047	
 !& 04 7  $;  $; #Tk D[ 
:	:       r,   r  c                        e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 ddej        dz  dej        dz  dej        dz  dej        dz  d	e	dz  d
e
dz  de
dz  de
dz  de
dz  deez  fdZ xZS )SpeechT5DecoderWithoutPrenetrm  r~   c                     t                                          |           t          |          | _        |                                  d S r   )rp   rq   rr  r  r<  r6  s     r*   rq   z%SpeechT5DecoderWithoutPrenet.__init__  ro  r,   Nr-   r/   r  r  r  r  r  r=  r>  r8   c
                 B    |                      |||||||||		  	        }|S r  )r  )r}   r-   r/   r  r  r  r  r  r=  r>  r  r  s               r*   r   z$SpeechT5DecoderWithoutPrenet.forward  s@     &&&)"7#9+/!5# ' 

 

 r,   r~  r  r   s   @r*   r  r    s         
~       2626:>:>(,!%)-,0#' '$. (4/  %047	
 !& 04 7  $;  $; #Tk D[ 
:	:       r,   r  c                        e Zd ZdZdef fdZdej        dej        dej        dej	        fdZ
d	 Zed
             Z xZS )$SpeechT5GuidedMultiheadAttentionLossz
    Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
    Networks with Guided Attention](https://huggingface.co/papers/1710.08969), adapted for multi-head attention.
    r~   c                 x    t                                                       |j        | _        |j        | _        d S r   )rp   rq   guided_attention_loss_sigmasigmaguided_attention_loss_scaler+  r6  s     r*   rq   z-SpeechT5GuidedMultiheadAttentionLoss.__init__  s1    7
7


r,   rF  input_masksoutput_masksr8   c                 V   |                      |||j                  }|                    d          |                    d          z  }|                    |j                                      d          }||z  }t	          j        |                    |                    }| j        |z  S )aY  
        Compute the attention loss.

        Args:
            attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
                Batch of multi-head attention weights
            input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
                Input attention mask as booleans.
            output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
                Target attention mask as booleans.

        Returns:
            `torch.Tensor` with the loss value
        r#   r   r   )_make_guided_attention_masksr   r   r   r   r  masked_selectr+  )r}   rF  r  r  guided_attn_masksmaskslosseslosss           r*   r   z,SpeechT5GuidedMultiheadAttentionLoss.forward  s    " !==k<YcYjkk&&r**[-B-B2-F-FF*++55a88"Z/z&..u5566zD  r,   c                    |                     d          }|                     d          }t          j        t          |          |j        d         |j        d         f|          }t          t          ||                    D ]/\  }\  }}	|                     ||	| j        |          ||d |	d |f<   0|	                    d          S )Nr#   r   rR  )
rP   r   rS   rW   r%   rG  r]  _make_guided_attention_maskr  r   )
r}   r  r  r   ra   rV  r  rK  ilenolens
             r*   r  zASpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_masks  s    #++%))"--!K[)9)9<;Ma;PR]RcdeRf(gpvwww!*3}n+M+M!N!N 	t 	tC$373S3STXZ^`d`jlr3s3sc5D5%4%/00 **1---r,   c                 0   t          j        t          j        | |          t          j        ||          d          \  }}|                                |z  }|                                | z  }dt          j        ||z
  dz   d|dz  z  z            z
  S )NrR  xy)indexingr   r   )r   meshgridrV   r   r   )r>   output_lengthr  r   grid_ygrid_xs         r*   r  z@SpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_mask  s    Lf555Lv666
 
 

 -/,.UY&6/a!78ANKLLLLr,   )r   r   r   r   r   rq   r   ri  
BoolTensorr   r   r  r   r  r   r   s   @r*   r  r    s         
8~ 8 8 8 8 8 8
!+!:?:J!Z_Zj!	! ! ! !2	. 	. 	. M M \M M M M Mr,   r  c                        e Zd ZdZdef fdZ	 ddej        dej        dej        dej        d	ej        d
ej        dz  dej	        fdZ
 xZS )SpeechT5SpectrogramLossz;
    Loss computation used by SpeechT5ForTextToSpeech.
    r~   c                 >   t                                                       |j        | _        |j        | _        |j        | _        t                      | _        t          t          j	        d                    | _
        | j        rt          |          | _        d S d S )Ng      @)
pos_weight)rp   rq   use_guided_attention_lossguided_attention_loss_num_headsr.   r   l1_criterionr   r   r   bce_criterionr  attn_criterionr6  s     r*   rq   z SpeechT5SpectrogramLoss.__init__  s    )/)I&/5/U, & 7"HH.%,s:K:KLLL) 	O"Fv"N"ND	O 	Or,   Nr/   r  r  r  labelsry  r8   c                 b    |dk    }|                     |          }|                     |          }|                     |          }                     ||                               ||          z   }|d d d d df         }	t          j        |	 dz  t          j        |	                    d          d                              |	j                  gd          }
|
d d dd f                              |	          }
|                     |	          }                     ||
          }||z   } j	        rzt          j         fd|D             d          }|dk    }|d d d d df         } j
        dk    r|d d  j
        dz
  d  j
        f         }                     |||          }||z  }|S )Nr1   r   r   r   r   c                 6    g | ]}|d d d j         f         S r   )r  )rE   xr}   s     r*   rG   z3SpeechT5SpectrogramLoss.forward.<locals>.<listcomp><  s0    eeeqa#IT%I#I IJeeer,   )r  r  r   r   rY   r   r   r   r  r  r.   r  )r}   r/   r  r  r  r  ry  rN  l1_lossr  stop_labelsbce_lossr  attnr  r  	attn_losss   `                r*   r   zSpeechT5SpectrogramLoss.forward  s    ' %%l33!7!E!El!S!S 5 C CL Q Q ##$96BBTEVEVWmouEvEvv QQQ1W%i%#uz%**Q--/K/K/N/Nu|/\/\ ]cdeee!!!!QRR%(66u==%%e,, %%fk:: ! ) 	9eeeeTdeeeklmmmD(A-K'111a0L$q((+AAAt/Dq/H/aDLa/a,ab++D+|LLIIDr,   r   )r   r   r   r   r   rq   r   rh  ri  r   r   r   r   s   @r*   r  r  
  s         
O~ 
O 
O 
O 
O 
O 
O& 6:) )() !& 1)  %0	)
 !) !)  +d2) 
) ) ) ) ) ) ) )r,   r  zv
    The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.
    custom_introc                   v    e Zd Z	 	 ddedej        dz  dej        dz  f fdZd Zd Zd Z	e
	 	 	 	 	 	 	 	 	 	 	 dd	ej        dz  d
ej        dz  dej        dz  dej        dz  deeej                          dz  dedz  dedz  dej        dz  dedz  dedz  dedz  deej                 ez  fd            Z xZS )SpeechT5ModelNr~   encoderdecoderc                     t                                          |           || _        |t          |          n|| _        |t          |          n|| _        |                                  dS )z
        encoder (`PreTrainedModel`, *optional*):
            The encoder model to use.
        decoder (`PreTrainedModel`, *optional*):
            The decoder model to use.
        N)rp   rq   r~   rl  r  r  r  r<  )r}   r~   r  r  r   s       r*   rq   zSpeechT5Model.__init__M  sp     	   ?F3F;;;T[?F3F;;;T[ 	r,   c                     t          | j        t                    r| j                                        S t          | j        t
                    r| j                                        S t          r   )r  r  r]  rb  r  r  NotImplementedErrorrF  s    r*   rb  z"SpeechT5Model.get_input_embeddingsa  sZ    dl$ABB 	7<44666dl$ABB 	7<44666!!r,   c                     t          | j        t                    r| j                            |           t          | j        t
                    r| j                            |           d S d S r   )r  r  r]  rf  r  r  rg  s     r*   rf  z"SpeechT5Model.set_input_embeddingsh  sh    dl$ABB 	5L--e444dl$ABB 	5L--e44444	5 	5r,   c                 z    t          | j        t                    r | j        j                                         dS dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r  r  rS  rU  rG  rF  s    r*   rG  z$SpeechT5Model.freeze_feature_encodern  s@    
 dl$CDD 	9L6688888	9 	9r,   r-   r/   decoder_input_valuesdecoder_attention_maskencoder_outputsr  r  r}  r  r=  r>  r8   c                 .   |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        }||n| j         j        }||                     |||	|
|          }ne|rct          |t                    sNt          |d         t          |          dk    r|d         ndt          |          dk    r|d         nd          }|Lt          | j        t                    r2| j        j
                            |d         j        d         |          }n|}t          | j        t                    rd|i}ni } | j        d
|||d         ||||	|
|d	|}|s||z   S t          |j        |j        |j        |j        |j        |j        |j        |j        	          S )a  
        input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
            Depending on which encoder is being used, the `input_values` are either: float values of the input raw
            speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.
        decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
            filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
            the vocabulary, or hidden states.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        N)r-   r/   r  r=  r>  r   r   r   rD  r}  )	r-   r/   r  r  r  r  r  r=  r>  )rE  r  r  decoder_attentionsry  encoder_last_hidden_stater  encoder_attentionsrD   )r~   r  r=  r  r>  r  r  r   rW   rS  rU  rJ  r%   r  r  r   rE  r  r   rF  ry  )r}   r-   r/   r  r  r  r  r  r}  r  r=  r>  r  r  decoder_argsdecoder_outputss                   r*   r   zSpeechT5Model.forwardv  s!   D 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+BY ""ll)-"3%9' +  OO  	O_!M!M 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O %*T\Cb*c*c%%)\%8%[%["(+^& &"" &4"dl$CDD 	02DELLL&$, 
-1"1!"4#9+/!5#
 
 
 
  	5"_44!-?+;"1"?.9,=&5&G"1"?.9	
 	
 	
 		
r,   rg  NNNNNNNNNNN)r   r   r   r   r   r3  rq   rb  rf  rG  r   r   r   rh  r  ri  r
   rT   r   r   r   r   s   @r*   r  r  G  s        %)$(	  T! T!	     (" " "5 5 59 9 9  -12648:>BF(,!%7;)-,0#'_
 _
lT)_
 (4/_
 $lT1	_

 !& 04 7_
 uU%6784?_
 _
 $;_
 "-4_
  $;_
 #Tk_
 D[_
 
u 	!$6	6_
 _
 _
 ^_
 _
 _
 _
 _
r,   r  zB
    SpeechT5 Model with a speech encoder and a text decoder.
    c                   :    e Zd ZddiZdef fdZd Zd Zd Ze		 	 	 	 	 	 	 	 	 	 	 dd	e
j        dz  d
e
j        dz  de
j        dz  de
j        dz  deee
j                          dz  dedz  dedz  dedz  dedz  dedz  de
j        dz  deez  fd            Z xZS )SpeechT5ForSpeechToTextz#text_decoder_postnet.lm_head.weightz+speecht5.decoder.prenet.embed_tokens.weightr~   c                 @   t                                          |           |j        t          d| j         d          t          |          }t          |          }t          |||          | _        t          |          | _
        |                                  d S )NYou are trying to instantiate a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rp   rq   r  r'   r   rS  r  r  r  r  text_decoder_postnetr<  )r}   r~   speech_encodertext_decoderr   s       r*   rq   z SpeechT5ForSpeechToText.__init__  s       $/ / / /   9@@4V<<%fnlKK$>v$F$F! 	r,   c                 \    |                                  j                                         dS r  get_encoderrU  rG  rF  s    r*   rG  z.SpeechT5ForSpeechToText.freeze_feature_encoder  +    
 	!88:::::r,   c                 4    | j                                         S r   )r  r  rF  s    r*   r  z-SpeechT5ForSpeechToText.get_output_embeddings  s    (>>@@@r,   c                 :    | j                             |           d S r   )r  r  r  s     r*   r  z-SpeechT5ForSpeechToText.set_output_embeddings  s    !77GGGGGr,   Nr-   r/   decoder_input_idsr  r  r  r  r  r=  r>  r  r8   c                 0   |
|
n| j         j        }
|'|%t          || j         j        | j         j                  }|                     |||||||||	d
  
        }|                     |d                   }d}|Kt                      } ||                    d| j         j	                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j        |j        |j        |j        |j        |j        |j        	  	        S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
        >>> from datasets import load_dataset

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
        >>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
        >>> predicted_ids = model.generate(**inputs, max_length=100)

        >>> # transcribe speech
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        >>> transcription[0]
        'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
        ```

        ```python
        >>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

        >>> # compute loss
        >>> loss = model(**inputs).loss
        >>> round(loss.item(), 2)
        19.68
        ```
        NT)
r-   r/   r  r  r  r  r  r  r=  r>  r   r#   r   )	r  r  r  r  r  ry  r  r  r  )r~   r>  r+   r    r!   r  r  r   r   r  r   r  r  r  ry  r  r  r  )r}   r-   r/   r  r  r  r  r  r  r=  r>  r  r  r  r  r  loss_fctoutputs                     r*   r   zSpeechT5ForSpeechToText.forward  sV   f &1%<kk$+BY ($6DK4dk6X% %! --%)!2#9++/!5   
 
 **71:66'))H8FKKDK,BCCV[[QS__UUD 	FY,F)-)9TGf$$vE#3")"?&9$5&-&G")"?&9

 

 

 
	
r,   r  )r   r   r   _tied_weights_keysr   rq   rG  r  r  r   r   ri  rh  r  r
   rT   r   r   r   r   s   @r*   r  r    s        @Ano~      (; ; ;A A AH H H  262659:>BF(,!%)-,0#'*.|
 |
'$.|
 (4/|
 !+d2	|

 !& 04 7|
 uU%6784?|
 |
 $;|
  $;|
 #Tk|
 D[|
  4'|
 
	 |
 |
 |
 ^|
 |
 |
 |
 |
r,   r        ?r2         4@Fmodelr}  	thresholdminlenratiomaxlenratiovocoderoutput_cross_attentionsreturn_output_lengthsc
           
        "#$ |t          d          |&d|| j        j        k                                    z
  }
n|}
|                    d          }| j                            ||
d          }|j        }t          | j        j        t                    r6| j        j        j
                            |d         j        d         |
          }
t          |                    d          |z  | j        j        z            }t          |                    d          |z  | j        j        z            }|                    |d| j        j                  }g }g }d }d}i "	 |dz  }| j        j        
                    ||          }| j        j                            |d d dd f         d ||
|d|d          }|r.|                    t'          j        |j        d                     |j                            d          }|j        }| j                            |          }|                    || j        j        | j        j                  }|                    |           |d d dd d f                             |d| j        j                  }t'          j        ||fd          }t'          j        | j                            |                    }||k     r~||k     rGt'          j        |d          |k    }t'          j        |          d                                         }ntA          tC          |                    }"fd	|D             }tC          |          dk    rht'          j"        |          }|#                    dd          $                    dd
          }| j        %                    |          }|D ]}||         "|<   tC          "          |k    rn"fdtA          tC          "                    D             }|	s|dk    r|d         n*t&          j&        j'        j(        )                    |d          }| ||          }n|}|rlt'          j        |d
          }|dk    rL |j        |t          |                    d          |z            g|                                dd          R  }||f}n*g #tA          |          D ]0} #                    ||                              d                     1|0t&          j&        j'        j(        )                    |d          }|#f}nKg $t&          j&        j'        j(        )                    |d          } ||          $#$fd#D             }!$|!f}|rit'          j        |d
          } |j        |t          |                    d          |z            g|                                dd          R  }g ||R }|S )Na  `speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    r   r   T)r-   r/   r>  r#   )r   r/   r  r  r  r  r  r>  r   c                     g | ]}|v|	S rD   rD   rE   r  result_spectrograms     r*   rG   z$_generate_speech.<locals>.<listcomp>  s$    SSS!q@R7R7RA7R7R7Rr,   r   c                      g | ]
}|         S rD   rD   r  s     r*   rG   z$_generate_speech.<locals>.<listcomp>  s    RRRa&q)RRRr,   )batch_firstc                 z    g | ]7}t                              d           t                    z            |z  8S r   )r<   r   r=   )rE   r  spectrogram_lengths	waveformss     r*   rG   z$_generate_speech.<locals>.<listcomp>	  sB    sssZ[INN1$5$5<O8P8P$P Q QTU Usssr,   )*r'   r~   r    r<   r   r  r  rE  r  rS  rU  rJ  r%   r.   r$   rn  r  r  r[   r   r   ry  squeezer  speech_decoder_postnetr  r   sigmoidr  rP   r  rQ   rR   rW   stackr   flattenr  r   r   rnnpad_sequence)%r  r-   r}  r/   r  r  r  r  r  r  r  r   encoder_outr  maxlenminlenoutput_sequencespectrogramry  r  rK  r  decoder_outlast_decoder_outputspectrumnew_spectrogramprobmeet_thresholdsmeet_indexesspectrograms
meet_indexr  r  waveform_lengthsr  r  r  s%                                     @@@r*   _generate_speechr    s    !
 
 	
 !"lel6O&O%T%T%V%V!V!/


A

C.((!- )  K !, = %.(*IJJ 
!&!7!>!a!aN #%;"
 "
 *//22[@5<C``aaF*//22[@5<C``aaF 099#q%,B[\\OKO
C4q !& 6 = =oOa b bn,<</2337";#9+5 = 	
 	
 # 	T##EIk.JPQ$R$R$RSSS);CCAFF%5 /889LMM==el&CU\E^__8$$$ #111b!!!8,11#q%,:STT)_o$FANNN}U9BBCVWWXX<< V||"')Db"9"9"9Y"F${?;;A>EEGG$SYY//SSSS|SSSL<  1$$${;77+55a;;CCAqII$;CCLQQ". N NJ5A*5M&z22%&&#--i4j SRRR5=O9P9P3Q3QRRRL   3),l1ooux~7I7V7VWcqu7V7v7vgk**GG!G" 	2$y)9qAAAQww#8#3#8-22155;<<$?O?T?T?V?VWYWZWZ?[$ $ $   01G !s 	@ 	@A&&|A';';A'>'>????? 8>-::<UY:ZZL#%89GGI 8>-::<UY:ZZL--Isssss_rsss "23G" 	3$y)9qAAA4/4S)..q11C788 ;K;P;P;R;RSUSVSV;W      32!122GNr,   zB
    SpeechT5 Model with a text encoder and a speech decoder.
    c                   $    e Zd ZdZdZdef fdZedefd            Z	e
	 	 	 	 	 	 	 	 	 	 	 	 	 d!dej        dz  dej        dz  d	ej        dz  d
ej        dz  deeej                          dz  dedz  dedz  dedz  dedz  dedz  dej        dz  dej        dz  dej        dz  deez  fd            Z ej                    	 	 	 	 	 	 	 	 d"dej        dej        dz  dej        dz  dedededej        dz  dededej        eej        ej        f         z  fd            Z ej                    	 	 	 	 	 	 	 	 d"dej        dej        dz  dej        dz  dedededej        dz  dededej        eej        ej        f         z  fd             Z xZS )#SpeechT5ForTextToSpeech)textr   r~   c                 @   t                                          |           |j        t          d| j         d          t          |          }t          |          }t          |||          | _        t          |          | _
        |                                  d S )Nr  a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rp   rq   r  r'   r   r]  r  r  r  r  r  r<  )r}   r~   text_encoderspeech_decoderr   s       r*   rq   z SpeechT5ForTextToSpeech.__init__	  s       $/ / / /   5V<<8@@%flNKK&B6&J&J# 	r,   r8   c                     dS r*  rD   )clss    r*   can_generatez$SpeechT5ForTextToSpeech.can_generate.	  s	    
 tr,   Nr/   r  r  r  r  r  r  r=  r>  r}  r  r  c                    |
|
n| j         j        }
|.|t          || j         j        |          \  }}| j         j        rd}|                     ||||||||||	d          }|                     |d                   \  }}}d}|)t          | j                   } |||||||j                  }|
s|f|dd         z   }||f|z   n|S t          |||j
        |j        |j        |j        |j        |j        |j        	  	        S )ab  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
            [`~PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
            for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
        >>> import torch

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([15872])
        ```
        NTr-   r/   r  r  r  r  r  r}  r  r=  r>  r   r   	r  r  r  r  r  ry  r  r  r  )r~   r>  r4   r.   r  r  r  r  ry  r   r  r  r  r  r  r  )r}   r   r/   r  r  r  r  r  r  r=  r>  r}  r  r  r  r  r  r  r  r  	criterionr  s                         r*   r   zSpeechT5ForTextToSpeech.forward5	  s|   J &1%<kk$+BY#+?WDK8:P@ @<$&< {4 )$(!--")!5#9++1/!5   
 
 AE@[@[\cde\f@g@g= 5v/<<I9&%( D  	F+-;F)-)9TGf$$vE'-#3")"?&9$5&-&G")"?&9

 

 

 
	
r,   r  r2   r  Fr  r  r  r  r  r  c
                    |m|                     d          }|                     d          |k    r?|                     d          dk    r|                    |d          }nt          d          t          | |||||||||	
  
        S )aE  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Attention mask from the tokenizer, required for batched inference to signal to the model where to
                ignore padded tokens from the input_ids.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        Nr   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch_size.r   ry  r'   r  )r}   r   r/   r}  r  r  r  r  r  r  r  r`   s               r*   generatez SpeechT5ForTextToSpeech.generate	  s    J )"**J!&&q))Z77%**1--22);)B)B:q)Q)Q&&$o    #!
 
 	
r,   c
                    |m|                     d          }
|                     d          |
k    r?|                     d          dk    r|                    |
d          }nt          d          t          | |||||||||	
  
        S )a  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        Nr   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch size.r   )r}   r   r}  r/   r  r  r  r  r  r  r`   s              r*   generate_speechz'SpeechT5ForTextToSpeech.generate_speech
  s    R )"**J!&&q))Z77%**1--22);)B)B:q)Q)Q&&$o    #!
 
 	
r,   NNNNNNNNNNNNNNNr  r2   r  NFF)r   r   r   r1  r0  r   rq   classmethodrT   r  r   r   rh  ri  r  r
   r   r   r   r   r   r   r3  r!  r#  r   r   s   @r*   r  r  	  s        !!O~      ( T    [  .2269=:>BF(,!%)-,0#'7;+/+/x
 x
#d*x
 (4/x
 $/$6	x

 !& 04 7x
 uU%6784?x
 x
 $;x
  $;x
 #Tkx
 D[x
 "-4x
 !D(x
 \D(x
  
)	)!x
 x
 x
 ^x
t U]__ 377; !$((-&+Y
 Y
#Y
 (4/Y
 "-4	Y

 Y
 Y
 Y
 T!Y
 "&Y
  $Y
 
	U5#4e6G#GH	HY
 Y
 Y
 _Y
v U]__ 8<26 !$((-&+]
 ]
#]
 "-4]
 (4/	]

 ]
 ]
 ]
 T!]
 "&]
  $]
 
	U5#4e6G#GH	H]
 ]
 ]
 _]
 ]
 ]
 ]
 ]
r,   r  zD
    SpeechT5 Model with a speech encoder and a speech decoder.
    c                       e Zd Zdef fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddej        dz  dej	        dz  dej        dz  dej	        dz  d	e
e
ej                          dz  d
edz  dedz  dedz  dedz  dedz  dej        dz  dej        dz  dej        dz  de
ez  fd            Z ej                    	 	 	 	 	 	 	 	 d dej        dej        dz  dej	        dz  dedededej        dz  dededej        fd            Z xZS )!SpeechT5ForSpeechToSpeechr~   c                    t                                          |           t          |          }t          |          }t	          |||          | _        t          |          | _        |                                  d S r   )	rp   rq   rS  r  r  r  r  r  r<  )r}   r~   r  r  r   s       r*   rq   z"SpeechT5ForSpeechToSpeech.__init__s
  sp       8@@8@@%fnnMM&B6&J&J# 	r,   c                 \    |                                  j                                         dS r  r  rF  s    r*   rG  z0SpeechT5ForSpeechToSpeech.freeze_feature_encoder
  r  r,   Nr-   r/   r  r  r  r  r  r  r=  r>  r}  r  r  r8   c                    |
|
n| j         j        }
| |t          || j         j        |          \  }}|                     ||||||||||	d          }|                     |d                   \  }}}d}|
s|f|dd         z   }||f|z   n|S t          |||j        |j        |j	        |j
        |j        |j        |j        	  	        S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into
            a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
            [`SpeechT5Processor.__call__`] for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
        >>> from datasets import load_dataset
        >>> import torch

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
        >>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([77824])
        ```
        NTr  r   r   r  )r~   r>  r4   r.   r  r  r   r  r  r  ry  r  r  r  )r}   r-   r/   r  r  r  r  r  r  r=  r>  r}  r  r  r  r  rF   r  r  r  r  s                        r*   r   z!SpeechT5ForSpeechToSpeech.forward
  s$   X &1%<kk$+BY#+?WDK8:P@ @<$&< --%)!5#9++1/!5   
 
 "&!<!<WQZ!H!H; 	F!^gabbk1F)-)9TGf$$vE'##3")"?&9$5&-&G")"?&9

 

 

 
	
r,   r  r2   r  Fr  r  r  r  r  r  c
                 l    |t          j        d|j                  }t          | |||||||||	
  
        S )a'  
        Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
        speech waveform using a vocoder.

        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                Float values of input raw speech waveform.

                Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `list[float]`,
                a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`)
                or the soundfile library (`pip install soundfile`).
                To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and
                conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        N)r   i   rR  )r   rS   r   r  )
r}   r-   r}  r/   r  r  r  r  r  r  s
             r*   r#  z)SpeechT5ForSpeechToSpeech.generate_speech
  sS    T %!&Xl>Q!R!R!R#!
 
 	
r,   r$  r%  )r   r   r   r   rq   rG  r   r   ri  rh  r  r
   rT   r   r   r   r   r   r   r3  r#  r   r   s   @r*   r(  r(  m
  s{       
~ 
 
 
 
 
 
; ; ;  26269=:>BF(,!%)-,0#'7;+/+/s
 s
'$.s
 (4/s
 $/$6	s

 !& 04 7s
 uU%6784?s
 s
 $;s
  $;s
 #Tks
 D[s
 "-4s
 !D(s
 \D(s
  
)	)!s
 s
 s
 ^s
j U]__ 8<26 !$((-&+W
 W
'W
 "-4W
 (4/	W

 W
 W
 W
 T!W
 "&W
  $W
 
	W
 W
 W
 _W
 W
 W
 W
 W
r,   r(  c                   :     e Zd Zd
 fd	ZddZd Zd Zd	 Z xZS )HifiGanResidualBlockr   r   r      皙?c                 d    t                                                       | _        t          j         fdt          t                              D                        _        t          j         fdt          t                              D                        _        d S )Nc                     g | ]<}t          j        d |                             |                             =S r   )rn   dilationr   r   ru   get_padding)rE   r  channelsr5  rm   r}   s     r*   rG   z1HifiGanResidualBlock.__init__.<locals>.<listcomp>]  sf     
 
 
  	%a[ ,,[(1+FF  
 
 
r,   c                 l    g | ]0}t          j        d d                     d                     1S r4  r6  )rE   rF   r8  rm   r}   s     r*   rG   z1HifiGanResidualBlock.__init__.<locals>.<listcomp>j  s^     
 
 
  	 ,,[!<<  
 
 
r,   )	rp   rq   leaky_relu_sloper   r   rR   rW   convs1convs2)r}   r8  rm   r5  r:  r   s   ```` r*   rq   zHifiGanResidualBlock.__init__X  s     0m
 
 
 
 
 
 
 s8}}--
 
 

 
 m
 
 
 
 
 
 s8}}--
 
 

 
r,   r   c                     ||z  |z
  dz  S r   rD   )r}   rm   r5  s      r*   r7  z HifiGanResidualBlock.get_paddingw  s    h&1a77r,   c                     t           j        j        }t          t           j        j        d          rt           j        j        j        }| j        D ]} ||           | j        D ]} ||           d S Nr   )r   r   r   r   r   r;  r<  r}   r   r  s      r*   apply_weight_normz&HifiGanResidualBlock.apply_weight_normz  s    h*28,m<< 	@(3?K[ 	 	EK[ 	 	EK	 	r,   c                     | j         D ]!}t          j                            |           "| j        D ]!}t          j                            |           "d S r   )r;  r   r   remove_weight_normr<  r}   r  s     r*   rC  z'HifiGanResidualBlock.remove_weight_norm  s`    [ 	/ 	/EH''....[ 	/ 	/EH''....	/ 	/r,   c                    t          | j        | j                  D ]l\  }}|}t          j                            || j                  } ||          }t          j                            || j                  } ||          }||z   }m|S r   )r]  r;  r<  r   r  
leaky_relur:  )r}   r   conv1conv2r  s        r*   r   zHifiGanResidualBlock.forward  s    T[99 	5 	5LE5$HM44]DDYZZM!E-00MM44]DDYZZM!E-00M)H4MMr,   )r   r/  r1  r  )	r   r   r   rq   r7  rA  rC  r   r   r   s   @r*   r.  r.  W  s~        
 
 
 
 
 
>8 8 8 8  / / /      r,   r.  z
    HiFi-GAN vocoder.
    c                        e Zd ZU eed<   dZdef fdZ fdZd Zd Z	 e
d          dej        d	ej        fd
            Z xZS )r*  r~   r  c                 |   t                                          |           t          |j                  | _        t          |j                  | _        t          j        |j	        |j
        ddd          | _        t          j                    | _        t          t          |j        |j                            D ]X\  }\  }}| j                            t          j        |j
        d|z  z  |j
        d|dz   z  z  ||||z
  dz                       Yt          j                    | _        t)          t          | j                            D ]a}|j
        d|dz   z  z  }t          |j        |j                  D ]4\  }}| j                            t-          ||||j                             5bt          j        |dddd          | _        |                     dt5          j        |j	                             |                     dt5          j        |j	                             |                                  d S )N   r   r   )rm   rn   r   r   r  r+  )rp   rq   rW   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   ru   model_in_dimupsample_initial_channelconv_prer   	upsamplerrG  r]  upsample_kernel_sizesr[   ConvTranspose1d	resblocksrR   resblock_dilation_sizesr.  r:  	conv_postr   r   rS   rY   r<  )r}   r~   r  upsample_raterm   r8  r5  r   s          r*   rq   zSpeechT5HifiGan.__init__  s5      v;<< !677	+
 
 
 /8V=RTZTp9q9q/r/r 		 		+A+{N!!"31=3a!eE +((=8Q>      s4>**++ 	v 	vA61Q<HH),V-I6Ki)j)j v v%X%%&:8[RZ\b\s&t&tuuuuv 8QAaQRSSSVU[1D%E%EFFFWej1D&E&EFFF 	r,   c                     t                                          |           t          |t                    r4t	          j        |j                   t	          j        |j                   d S d S r   )	rp   r-  r  r*  r  r&  r  r#  r+  )r}   r  r   s     r*   r-  zSpeechT5HifiGan._init_weights  sa    f%%%fo.. 	%K$$$Jv|$$$$$	% 	%r,   c                 8   t           j        j        }t          t           j        j        d          rt           j        j        j        } || j                   | j        D ]} ||           | j        D ]}|                                  || j	                   d S r?  )
r   r   r   r   r   rR  rS  rV  rA  rX  r@  s      r*   rA  z!SpeechT5HifiGan.apply_weight_norm  s    h*28,m<< 	@(3?KDM"""^ 	 	EK^ 	& 	&E##%%%%DN#####r,   c                 $   t           j                            | j                   | j        D ]!}t           j                            |           "| j        D ]}|                                 t           j                            | j                   d S r   )r   r   rC  rR  rS  rV  rX  rD  s     r*   rC  z"SpeechT5HifiGan.remove_weight_norm  s    
##DM222^ 	/ 	/EH''....^ 	' 	'E$$&&&&
##DN33333r,   a  
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.
        r  r8   c                    | j         j        r|| j        z
  | j        z  }|                                dk    }|s|                    d          }|                    dd          }|                     |          }t          | j	                  D ]}t          j                            || j         j                  } | j        |         |          } | j        || j        z           |          }t          d| j                  D ]&}| | j        || j        z  |z            |          z  }'|| j        z  }t          j                            |          }|                     |          }t%          j        |          }|s=|                    d                              dd                              d          }n|                    d          }|S )a  
        spectrogram (`torch.FloatTensor`):
            Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
            config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

        Returns:
            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
        r   r   r   r   r#   )r~   normalize_beforer  r+  r   r   r   rR  rR   rO  r   r  rF  r:  rS  rV  rM  rX  r   tanhr  r   )	r}   r  r  
is_batchedr   r  	res_statejwaveforms	            r*   r   zSpeechT5HifiGan.forward  s   " ;' 	A&2dj@K __&&!+
 	3%//22K#--a33m44t)** 	9 	9AM44]DKD`aaM-DN1-m<<M<q4+;';<]KKI1d.// U UET^A0@,@1,DEmTTT		%(88MM00??}55
=11 	0$,,Q//99!Q??DDRHHHH %,,Q//Hr,   )r   r   r   r   r.  r0  rq   r-  rA  rC  r   r   ri  r   r   r   s   @r*   r*  r*    s          "!!!#O$4 $ $ $ $ $ $L% % % % %
$ 
$ 
$4 4 4 ^  (5#4 (5CT ( ( ( ( ( ( ( (r,   r*  )r  r(  r  r  r  r*  )r   Nr  r%  )er   r   numpyrK   r   r   torch.nnr   r   r    r   r  activationsr	   cache_utilsr
   r   r   
generationr   integrations.deepspeedr   integrations.fsdpr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   r   r   r   configuration_speecht5r   r   
get_loggerr   rz  _HIDDEN_STATES_START_POSITIONr   r<   r+   r4   r  r   rh  ndarrayrh   rj   r   r   r3  r   r   r   r  r   r  r.  r:  rk  r  r  r  r  r  r  r  r  r  r  r5  rS  r]  rl  rr  r  r  r  r  r  r  r  ri  rT   r  r  r(  r.  r*  __all__rD   r,   r*   <module>ru     sC                 @ @ @ @ @ @ @ @ @ @ & & & & & & ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) ) @ @ @ @ @ @ 7 7 7 7 7 7 J J J J J J J J 9 9 9 9 9 9              D C C C C C C C , , , , , , , , I I I I I I I I 
	H	%	% !" %, c [^    " bf0 0,0250KP<Z^K^0 0 0 04 /3t tc?tt t $t+	t
 t Zt t t tp    #=   ,    !;   8    !;   2B8 B8 B8 B8 B8BI B8 B8 B8L* * * * *bi * * *Z    ry   2" " " " " " " "(    29   % % % % %RY % % %R1 1 1 1 1	 1 1 1D D D D D") D D DN1 1 1 1 1") 1 1 1h% % % % % % % %P< < < < <29 < < <2    	+?   ""- "- "- "- "-	+? "- "- "-J& & & & &,@ & & &$U2 U2 U2 U2 U2	 U2 U2 U2p    ")   06 6 6 6 65 6 6 6rX X X X X5 X X Xv ?4 ?4 ?4 ?4 ?4o ?4 ?4 ?4Dn
 n
 n
 n
 n
- n
 n
 n
b! ! ! ! !&= ! ! !H& & & & &$; & & &R
 
 
 
 
#: 
 
 
>]
 ]
 ]
 ]
 ]
- ]
 ]
 ]
@* * * * *&= * * *Z. . . . .$; . . .b% % % % %#: % % %P8M 8M 8M 8M 8M29 8M 8M 8Mv: : : : :bi : : :z   
J
 J
 J
 J
 J
+ J
 J
 
J
Z   
a
 a
 a
 a
 a
5 a
 a
 
a
N 48.2 $$)"'L L"L#L )D0L $t+	L
 L L L YL "L  L u0%2CCDDL L L L^   
T
 T
 T
 T
 T
5 T
 T
 
T
n
   
b
 b
 b
 b
 b
 7 b
 b
 
b
J; ; ; ; ;29 ; ; ;|   
s s s s so s s 
sl  r,   