import gradio as gr
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
import onnxruntime
import torch  # Used to cast the tokenizer output to int64 for ONNX

# --- Load tokenizer and ONNX model from Hugging Face Hub ---
repo_id = "Athspi/Gg"  # Replace with your actual repo ID if different
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Download the quantized ONNX model from the repo to a local cache path
onnx_model_path = hf_hub_download(repo_id=repo_id, filename="mms_tts_eng/model_quantized.onnx")
ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])


# --- Speech generation function ---
def generate_speech(text):
    """Generates speech from text using the loaded ONNX model."""
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs.input_ids.to(torch.long)  # ONNX model expects int64 input IDs

    # Run inference with ONNX Runtime
    onnx_outputs = ort_session.run(None, {"input_ids": input_ids.numpy()})
    waveform = onnx_outputs[0]  # Output waveform

    sampling_rate = 16000  # facebook/mms-tts-eng generates 16 kHz audio; adjust if your model differs

    return sampling_rate, waveform.squeeze()  # Gradio's Audio component accepts (rate, array)


# --- Gradio Interface ---
iface = gr.Interface(
    fn=generate_speech,
    inputs=gr.Textbox(lines=2, placeholder="Enter text to synthesize..."),
    outputs=gr.Audio(label="Generated Speech"),
    title="Fast MMS-TTS-ENG Text-to-Speech (CPU)",
    description=(
        "Real-time text-to-speech using the optimized facebook/mms-tts-eng model with "
        "ONNX Runtime for fast CPU inference. Model and tokenizer are loaded from the "
        "Hugging Face Hub (Athspi/Gg)."
    ),
    examples=[
        ["Hello, this is a demonstration of fast text-to-speech on CPU."],
        ["This is another example sentence."],
        ["How does this sound to you?"],
    ],
)

if __name__ == "__main__":
    iface.launch()
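
# --- Optional: saving generated speech to a WAV file ---
# A minimal sketch of how the output of generate_speech() could be written to disk
# with scipy.io.wavfile. This helper is not wired into the Gradio app above; the
# function name and the "output.wav" default are illustrative, and in a real script
# this would typically sit above the __main__ guard.
def save_speech_to_wav(text, path="output.wav"):
    """Generate speech for `text` and write it to `path` as a WAV file."""
    import scipy.io.wavfile  # local import so the app itself does not depend on scipy

    sampling_rate, waveform = generate_speech(text)
    # scipy accepts float32 waveforms and writes them as 32-bit float WAV data
    scipy.io.wavfile.write(path, sampling_rate, waveform)
    return path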