import os
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer
import onnxruntime
import scipy.io.wavfile
from huggingface_hub import hf_hub_download

# Hugging Face Hub repository that hosts both the tokenizer and the ONNX export.
repo_id = "Athspi/Gg"

# Fetch the ONNX model file from that repository; the returned value is the
# path handed to ONNX Runtime below.
onnx_model_path = hf_hub_download(
    repo_id=repo_id,
    filename="mms_tts_eng.onnx",
)

# The tokenizer is loaded from the same repository as the model file.
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Create the ONNX Runtime inference session, restricted to the CPU provider.
ort_session = onnxruntime.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)

# Fixed output sampling rate in Hz (change this if the model emits another rate).
sampling_rate = 16_000

def tts_inference(text: str):
    """
    Convert input text to a speech waveform using the ONNX model.

    Parameters:
        text (str): Input text to synthesize.

    Returns:
        Tuple[int, np.ndarray]: A tuple containing the sampling rate (int) and the
                                 synthesized audio waveform (float32 np.ndarray with
                                 singleton dimensions removed, at least 1-D).

    Raises:
        gr.Error: If ``text`` is empty or whitespace-only, or if the model returns
                  no audio samples.
    """
    # Reject blank input up front so the user sees a readable message in the UI
    # instead of an opaque tokenizer/runtime failure.
    if text is None or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Ask the tokenizer for NumPy arrays directly: ONNX Runtime consumes NumPy,
    # so there is no need to round-trip through torch tensors.
    encoded = tokenizer(text, return_tensors="np")

    # The ids were previously cast to torch.long (int64); keep that dtype
    # explicitly because NumPy's default integer width is platform dependent.
    input_ids = np.asarray(encoded["input_ids"], dtype=np.int64)

    # Run inference on the ONNX model; the first output is used as the waveform.
    onnx_outputs = ort_session.run(None, {"input_ids": input_ids})

    # Coerce to a float32 NumPy array in one step (covers non-ndarray outputs too).
    waveform = np.asarray(onnx_outputs[0], dtype=np.float32)

    # Drop singleton dimensions, but never collapse a one-sample result to 0-d.
    waveform = np.atleast_1d(np.squeeze(waveform))

    # An empty clip cannot be played back; report it rather than returning it.
    if waveform.size == 0:
        raise gr.Error("The model did not produce any audio for this input.")

    return (sampling_rate, waveform)


# Assemble the Gradio demo: one text box in, one audio player out.
_text_input = gr.Textbox(lines=2, placeholder="Enter text here...")
_speech_output = gr.Audio(type="numpy", label="Generated Speech")

# Sample sentences offered in the UI; each inner list holds the value for the
# single text input.
_example_prompts = [
    ["Hello, this is an example of text-to-speech."],
    ["This model uses ONNX Runtime for fast inference."],
    ["You can try your own sentences here."],
]

iface = gr.Interface(
    fn=tts_inference,
    inputs=_text_input,
    outputs=_speech_output,
    title="ONNX TTS Demo",
    description="Text-to-Speech synthesis using an ONNX model from the Athspi/Gg repository on Hugging Face.",
    examples=_example_prompts,
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()