import gradio as gr
import numpy as np
import onnxruntime
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

# Hugging Face repository that holds both the tokenizer files and the ONNX model.
model_dir = "Athspi/Gg"

# Filename of the ONNX model inside the repository. Adjust if needed.
onnx_model_filename = "mms_tts_eng.onnx"

# Download the ONNX model from the Hub (cached locally after the first run).
# model_dir is a Hub repo ID, not a local directory, so the file must be
# fetched rather than built as a filesystem path.
onnx_model_path = hf_hub_download(repo_id=model_dir, filename=onnx_model_filename)

# Load the tokenizer from the Hugging Face model repository.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Initialize the ONNX Runtime session for CPU inference.
ort_session = onnxruntime.InferenceSession(
    onnx_model_path, providers=["CPUExecutionProvider"]
)

# Fixed output sampling rate (MMS TTS models generate 16 kHz audio;
# adjust if your model uses a different rate).
sampling_rate = 16000


def tts_inference(text: str):
    """
    Convert input text to a speech waveform using the ONNX model.

    Parameters:
        text (str): Input text to synthesize.

    Returns:
        tuple[int, np.ndarray]: The sampling rate and the synthesized
        waveform, in the (rate, data) order that gr.Audio(type="numpy")
        expects.
    """
    # Tokenize the input text.
    inputs = tokenizer(text, return_tensors="pt")

    # ONNX Runtime expects the input IDs as an int64 NumPy array.
    input_ids = inputs.input_ids.to(torch.long).numpy()

    # Run inference on the ONNX model.
    onnx_outputs = ort_session.run(None, {"input_ids": input_ids})
    waveform = onnx_outputs[0]

    # Drop the batch/channel dimensions to get a 1-D waveform.
    waveform = np.squeeze(waveform)

    # gr.Audio(type="numpy") expects a (sampling_rate, waveform) tuple.
    return sampling_rate, waveform


# Build a Gradio interface.
iface = gr.Interface(
    fn=tts_inference,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs=gr.Audio(type="numpy"),
    title="ONNX TTS Demo",
    description=(
        "Text-to-speech synthesis using an ONNX model from the "
        "Athspi/Gg repository on Hugging Face."
    ),
)

if __name__ == "__main__":
    iface.launch()
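
# Minimal sketch of driving tts_inference() without the Gradio UI, e.g. for a
# quick smoke test. scipy.io.wavfile.write takes (filename, rate, data); the
# filename "output.wav" is an arbitrary choice for illustration:
#
#     import scipy.io.wavfile
#
#     rate, audio = tts_inference("Hello from ONNX text-to-speech!")
#     scipy.io.wavfile.write("output.wav", rate, audio)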