
    }-j3                         d dl mZmZmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZ  e	            rd d	lZdd
lmZ ddlmZ dZ G d ded          Z G d de          Zd	S )    )Any	TypedDictoverload   )
AudioInput)GenerationConfig)is_torch_available)ChatChatType   )PipelineN)%MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING)SpeechT5HifiGanzmicrosoft/speecht5_hifiganc                   (    e Zd ZU dZeed<   eed<   dS )AudioOutputz
    audio (`AudioInput`):
        The generated audio waveform.
    sampling_rate (`int`):
        The sampling rate of the generated audio waveform.
    audiosampling_rateN)__name__
__module____qualname____doc__r   __annotations__int     d/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/pipelines/text_to_audio.pyr   r   !   s6           r   r   F)totalc                   @    e Zd ZdZdZdZdZdZdZ e	d          Z
ddd fd
Zd	 Zd
 Zedededefd            Zedee         dedee         fd            Zedededefd            Zedee         dedee         fd            Z fdZ	 	 	 ddZd Z xZS )TextToAudioPipelinea  
    Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
    pipeline generates an audio file from an input text and optional other conditional inputs.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(model="suno/bark-small")
    >>> output = pipe("Hey it's HuggingFace on the phone!")

    >>> audio = output["audio"]
    >>> sampling_rate = output["sampling_rate"]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    <Tip>

    You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or
    [`TextToAudioPipeline.__call__.generate_kwargs`].

    Example:

    ```python
    >>> from transformers import pipeline

    >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small")

    >>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length
    >>> generate_kwargs = {
    ...     "do_sample": True,
    ...     "temperature": 0.7,
    ...     "max_new_tokens": 35,
    ... }

    >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
    ```

    </Tip>

    This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
    `"text-to-audio"`.

    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
    TNF   )max_new_tokens)vocoderr   c                    t                      j        |i | d | _        | j        j        t          j                    v r?|6t          j        t                    
                    | j        j                  n|| _        | j        j        j        dv rd | _        || _        | j        | j        j        j        | _        | j        | j        j        }| j        j                            dd           }|C|                    d |                                                                D                        dD ]M}t+          ||d           }||| _        t+          |dd           t+          |j        |d           }||| _        N| j        4| j        /t/          | j        d          r| j        j        j        | _        d S d S d S d S )N)musicgenspeecht5generation_configc                     i | ]
\  }}|||S Nr   ).0kvs      r   
<dictcomp>z0TextToAudioPipeline.__init__.<locals>.<dictcomp>   s$    ^^^1PQP]q!P]P]P]r   )sample_rater   codec_configfeature_extractor)super__init__r"   model	__class__r   valuesr   from_pretrainedDEFAULT_VOCODER_IDtodeviceconfig
model_type	processorr   __dict__getupdateto_dictitemsgetattrr.   hasattrr/   )	selfr"   r   argskwargsr9   
gen_configsampling_rate_namer3   s	           r   r1   zTextToAudioPipeline.__init__m   s   $)&))):#H#O#Q#QQQ ?  /0BCCFFtzGXYYY L :'+CCC!DN*<#!%!4!BD% Z&F,001DdKKJ%^^
0B0B0D0D0J0J0L0L^^^___&F ; ;" '0BD I I ,)6D&&V^T::F$+F,?ASUY$Z$ZM$0-:* %$.*DQUQ_atIuIu*D!%!A!OD &%*D*D*D*Dr   c                 L   t          |t                    r|g}| j        j        j        dk    rPd}t          | j        d          rt          | j        j        dd          }|dddd}|	                    |           |}| j
        | j
        n| j        }t          |t                    r |j        |j        fddd|}ne| j        j        j        d	k    r"d
 |D             }|                    dd           | j        j        j        dk    rd |D             } ||fi |ddi}|S )Nbarkr    semantic_configmax_input_semantic_lengthFT)
max_lengthadd_special_tokensreturn_attention_maskreturn_token_type_ids)tokenizereturn_dictcsmc                 F    g | ]}|                     d           sd| n|S )[z[0]
startswithr)   ts     r   
<listcomp>z2TextToAudioPipeline.preprocess.<locals>.<listcomp>   s3    PPPac):):A	a			PPPr   rM   diac                 F    g | ]}|                     d           sd| n|S )rT   z[S1] rU   rW   s     r   rY   z2TextToAudioPipeline.preprocess.<locals>.<listcomp>   s3    RRR1<<+<+<C!RRRr   return_tensorspt)
isinstancestrr2   r9   r:   rB   r&   rA   rJ   r>   r;   	tokenizerr
   apply_chat_templatemessages
setdefault)rC   textrE   rL   
new_kwargspreprocessoroutputs          r   
preprocesszTextToAudioPipeline.preprocess   s   dC   	6D:'611 Jt-/@AA o$T%;%KMhjmnn
(&+)-).	 J f%%%F)-)Ct~~dD!! 	G5\5   	 FF z +u44PP4PPP!!"6===z +u44RRTRRR!\$FF&FFFFFFr   c                 f   |                      || j                  }|d         }|d         }| j                                        r|                      || j                  }d|vr
| j        |d<   |                    |           |                    ddi           | j        j        j        dv r	d|vrd|d<    | j        j        di ||}nHt          |          r$t          d	|                                            | j        di ||d
         }| j        |                     |          }|S )N)r8   forward_paramsgenerate_kwargsr&   return_dict_in_generateT)rR   output_audiozYou're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. For reference, the `generate_kwargs` used here are: r   r   )_ensure_tensor_on_devicer8   r2   can_generater&   r>   r9   r:   generatelen
ValueErrorkeysr"   )rC   model_inputsrE   rj   rk   rg   s         r   _forwardzTextToAudioPipeline._forward   s   ..vdk.JJ 01 !23:""$$ 	E";;OTXT_;``O #/997;7M 34 !!/222 !!#<d"CDDDz +w66 "7759N>2(TZ(JJ<J>JJFF?##  dKZK_K_KaKad d  
  TZAA,A.AA!DF<#\\&))Fr   text_inputsrj   returnc                     d S r(   r   rC   rv   rj   s      r   __call__zTextToAudioPipeline.__call__   s    PSPSr   c                     d S r(   r   ry   s      r   rz   zTextToAudioPipeline.__call__   s    \_\_r   c                     d S r(   r   ry   s      r   rz   zTextToAudioPipeline.__call__   s    UXUXr   c                     d S r(   r   ry   s      r   rz   zTextToAudioPipeline.__call__   s    adadr   c                 8     t                      j        |fi |S )aL  
        Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

        Args:
            text_inputs (`str`, `list[str]`, `ChatType`, or `list[ChatType]`):
                One or several texts to generate. If strings or a list of string are passed, this pipeline will
                generate the corresponding text. Alternatively, a "chat", in the form of a list of dicts with "role"
                and "content" keys, can be passed, or a list of such chats. When chats are passed, the model's chat
                template will be used to format them before passing them to the model.
            forward_params (`dict`, *optional*):
                Parameters passed to the model generation/forward method. `forward_params` are always passed to the
                underlying model.
            generate_kwargs (`dict`, *optional*):
                The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
                complete overview of generate, check the [following
                guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are
                only passed to the underlying model if the latter is a generative model.

        Return:
            `AudioOutput` or a list of `AudioOutput`, which is a `TypedDict` with two keys:

            - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
            - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
        )r0   rz   )rC   rv   rj   r3   s      r   rz   zTextToAudioPipeline.__call__   s$    2  uww>>~>>>r   c                     t          | dd           
| j        |d<   t          | dd           | j        |d<   | j        |d<   |r|ni |r|ni d}|i }i }|||fS )Nassistant_modelassistant_tokenizerr`   )rj   rk   )rA   r   r`   r   )rC   preprocess_paramsrj   rk   paramspostprocess_paramss         r   _sanitize_parametersz(TextToAudioPipeline._sanitize_parameters  s     4*D11=151EO-.4.55A+/>OK(595MO12 1?FnnB2AIr
 

 $ " &*<<<r   c                    d}t          |t                    rd|v r	|d         }n(d}|d         }nt          |t                    r|d         }|r!| j        | j                            |          }t          |t
                    r*d |D             }t          |          dk    r|n|d         }nE|                    dt          j	        	          
                                                                }t          || j        
          S )NFr   T	sequencesr   c                     g | ]G}|                     d t          j                                                                                  HS )cpur8   dtype)r7   torchfloatnumpysqueeze)r)   els     r   rY   z3TextToAudioPipeline.postprocess.<locals>.<listcomp>4  sC    ^^^RTRUU%u{U;;AACCKKMM^^^r   r   r   r   )r   r   )r^   dicttupler;   decodelistrq   r7   r   r   r   r   r   r   )rC   r   needs_decodings      r   postprocesszTextToAudioPipeline.postprocess%  s   eT"" 	%g!%k*u%% 	!HE 	1dn8N))%00EeT"" 	P^^X]^^^E ZZ!^^EEqEEHHEH==CCEEMMOOE,
 
 
 	
r   )NNN)r   r   r   r   _pipeline_calls_generate_load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   _default_generation_configr1   rh   ru   r   r_   r   r   rz   r   r   r   r   __classcell__)r3   s   @r   r   r   -   s       2 2h  $O!#O "2!1" " " '+$ &P &P &P &P &P &P &PP& & &P( ( (T SCS3S;SSS XS_DI__kIZ___ X_XHXXXXX XXdDNdcddS^N_ddd Xd? ? ? ? ?: 	= = = =.
 
 
 
 
 
 
r   r   )typingr   r   r   audio_utilsr   
generationr   utilsr	   utils.chat_template_utilsr
   r   baser   r   models.auto.modeling_autor   !models.speecht5.modeling_speecht5r   r6   r   r   r   r   r   <module>r      s;   , + + + + + + + + + $ $ $ $ $ $ ) ) ) ) ) ) & & & & & & 6 6 6 6 6 6 6 6        DLLLQQQQQQCCCCCC1 	 	 	 	 	)5 	 	 	 	O
 O
 O
 O
 O
( O
 O
 O
 O
 O
r   