"""Gradio demo that turns text into speech with a Hugging Face TTS pipeline."""

import tempfile

import gradio as gr
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import pipeline

# Identifiers of the remote resources used by this demo.
MODEL_ID = "Futuresony/output"
EMBEDDINGS_DATASET_ID = "Matthijs/cmu-arctic-xvectors"
# Row of the embeddings dataset whose "xvector" is used as the speaker voice.
SPEAKER_INDEX = 7306

# Initialize the TTS pipeline from Hugging Face.
synthesizer = pipeline("text-to-speech", model=MODEL_ID)

# Load the speaker embeddings dataset and pick one speaker vector.
embeddings_dataset = load_dataset(EMBEDDINGS_DATASET_ID, split="validation")
# unsqueeze(0) adds a leading dimension of size 1 in front of the x-vector.
speaker_embedding = torch.tensor(
    embeddings_dataset[SPEAKER_INDEX]["xvector"]
).unsqueeze(0)


def text_to_speech(text):
    """Synthesize *text* to speech and return the path of the WAV file.

    Args:
        text: The text to speak.

    Returns:
        Path to a newly created ``.wav`` file containing the generated audio.
        The file is not removed by this function.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Convert the text to speech using the preselected speaker embedding.
    speech = synthesizer(
        text,
        forward_params={"speaker_embeddings": speaker_embedding},
    )

    # Write every result to its own temporary file: a single fixed file name
    # would let overlapping requests overwrite each other's audio.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        output_file = handle.name
    # The handle is closed before writing so the path can be reopened by
    # soundfile on every platform.
    sf.write(output_file, speech["audio"], samplerate=speech["sampling_rate"])

    # Return the path to the audio file for playback.
    return output_file


# Create the Gradio interface.
demo = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label="Enter Text", placeholder="Type something..."),
    outputs=gr.Audio(label="Generated Speech"),
    title="Text-to-Speech Generator",
    description="Enter text and generate speech using a pre-trained TTS model.",
)

if __name__ == "__main__":
    demo.launch()