import tempfile

import gradio as gr
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import pipeline

# Build the Hugging Face text-to-speech pipeline once, at import time
synthesizer = pipeline(task="text-to-speech", model="Futuresony/output")

# Fetch the x-vector dataset and pick the entry used as the speaker embedding
embeddings_dataset = load_dataset(
    path="Matthijs/cmu-arctic-xvectors",
    split="validation",
)
_xvector = embeddings_dataset[7306]["xvector"]
# Add a leading dimension of size 1 in front of the x-vector
speaker_embedding = torch.unsqueeze(torch.tensor(_xvector), 0)

def text_to_speech(text):
    """Synthesize speech for ``text`` and return the path of the WAV file.

    Args:
        text: The text to convert to speech.

    Returns:
        Path to a newly created WAV file holding the generated audio.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    if not text or not text.strip():
        # Show a readable message in the UI rather than running the model
        # on input that contains nothing to speak.
        raise gr.Error("Please enter some text to convert to speech.")

    # Convert the text to speech using the module-level speaker embedding
    speech = synthesizer(text, forward_params={"speaker_embeddings": speaker_embedding})

    # Give every call its own output file: a single fixed filename would be
    # overwritten when two requests are processed at the same time.
    # delete=False keeps the file on disk after the handle is closed so the
    # caller can still read it; it is not removed here.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name

    # The handle is closed before writing so the path can be reopened by
    # soundfile on every platform.
    sf.write(output_file, speech["audio"], samplerate=speech["sampling_rate"])

    # Return the path to the audio file for playback
    return output_file

# Input and output widgets for the web UI
_text_input = gr.Textbox(label="Enter Text", placeholder="Type something...")
_audio_output = gr.Audio(label="Generated Speech")

# Wire the widgets to text_to_speech in a single-function Gradio interface
demo = gr.Interface(
    fn=text_to_speech,
    inputs=_text_input,
    outputs=_audio_output,
    title="Text-to-Speech Generator",
    description="Enter text and generate speech using a pre-trained TTS model.",
)

# Start the app only when this file is run as a script
if __name__ == "__main__":
    demo.launch()