import os

import gradio as gr
import numpy as np
import onnxruntime
import scipy.io.wavfile
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

# Define the Hugging Face repository/model ID.
repo_id = "Athspi/Gg"

# Download the ONNX model file from the repository.
onnx_model_path = hf_hub_download(repo_id=repo_id, filename="mms_tts_eng.onnx")

# Load the tokenizer from the repository.
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Initialize the ONNX runtime session for inference (CPU only).
ort_session = onnxruntime.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)

# Define the fixed sampling rate (adjust if your model uses a different rate).
sampling_rate = 16000


def tts_inference(text: str):
    """Convert input text to a speech waveform using the ONNX model.

    Parameters:
        text (str): Input text to synthesize.

    Returns:
        Tuple[int, np.ndarray]: The sampling rate and the synthesized audio
        waveform as a float32 array with singleton dimensions removed.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Reject blank input up front so the user sees a readable message in the
    # UI instead of whatever the tokenizer/ONNX graph does with no tokens.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Tokenize straight to NumPy: ONNX Runtime consumes NumPy arrays, so a
    # round-trip through torch tensors is unnecessary.
    encoded = tokenizer(text, return_tensors="np")

    # The session is fed 64-bit integer ids; cast explicitly because the
    # default NumPy integer width is platform dependent.
    input_ids = np.asarray(encoded["input_ids"], dtype=np.int64)

    # Run inference; the first output of the graph is taken as the waveform.
    onnx_outputs = ort_session.run(None, {"input_ids": input_ids})

    # Coerce to a float32 ndarray for the Gradio Audio output.
    waveform = np.asarray(onnx_outputs[0], dtype=np.float32)

    # Remove any extra (length-1) dimensions.
    waveform = np.squeeze(waveform)

    return sampling_rate, waveform


# Build the Gradio interface.
# Input and output widgets for the demo.
_text_input = gr.Textbox(lines=2, placeholder="Enter text here...")
_audio_output = gr.Audio(type="numpy", label="Generated Speech")

# Sample prompts offered in the UI; each inner list holds the values for one
# example (a single text input here).
_example_prompts = [
    ["Hello, this is an example of text-to-speech."],
    ["This model uses ONNX Runtime for fast inference."],
    ["You can try your own sentences here."],
]

# Wire the synthesis function to the widgets.
iface = gr.Interface(
    fn=tts_inference,
    inputs=_text_input,
    outputs=_audio_output,
    title="ONNX TTS Demo",
    description=(
        "Text-to-Speech synthesis using an ONNX model "
        "from the Athspi/Gg repository on Hugging Face."
    ),
    examples=_example_prompts,
)

# Start the web UI only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()