import gradio as gr
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
import torch

# Initialize the text-to-speech pipeline from the Hugging Face Hub
synthesizer = pipeline("text-to-speech", model="Futuresony/output")

# Load the x-vector speaker embeddings dataset and select one speaker's voice
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

def text_to_speech(text):
    # Convert the input text to speech using the selected speaker embedding
    speech = synthesizer(text, forward_params={"speaker_embeddings": speaker_embedding})

    # Save the generated audio to a WAV file
    output_file = "generated_speech.wav"
    sf.write(output_file, speech["audio"], samplerate=speech["sampling_rate"])

    # Return the path to the audio file for playback
    return output_file

# Create the Gradio interface
demo = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label="Enter Text", placeholder="Type something..."),
    outputs=gr.Audio(label="Generated Speech"),
    title="Text-to-Speech Generator",
    description="Enter text and generate speech using a pre-trained TTS model.",
)

if __name__ == "__main__":
    demo.launch()