import gradio as gr
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
import onnxruntime
import torch  # Used to cast the tokenizer output to int64 for ONNX

# --- Load tokenizer and ONNX model from Hugging Face Hub ---
repo_id = "Athspi/Gg"  # Replace with your actual repo ID if different
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Download the quantized ONNX model from the repo to a local cache path
onnx_model_path = hf_hub_download(repo_id=repo_id, filename="mms_tts_eng/model_quantized.onnx")
ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])


# --- Speech generation function ---
def generate_speech(text):
    """Generates speech from text using the loaded ONNX model."""
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs.input_ids.to(torch.long)  # ONNX model expects int64 input IDs

    # Run inference with ONNX Runtime
    onnx_outputs = ort_session.run(None, {"input_ids": input_ids.numpy()})
    waveform = onnx_outputs[0]  # Output waveform

    sampling_rate = 16000  # facebook/mms-tts-eng generates 16 kHz audio; adjust if your model differs

    return sampling_rate, waveform.squeeze()  # Gradio's Audio component accepts (rate, array)


# --- Gradio Interface ---
iface = gr.Interface(
    fn=generate_speech,
    inputs=gr.Textbox(lines=2, placeholder="Enter text to synthesize..."),
    outputs=gr.Audio(label="Generated Speech"),
    title="Fast MMS-TTS-ENG Text-to-Speech (CPU)",
    description=(
        "Real-time text-to-speech using the optimized facebook/mms-tts-eng model with "
        "ONNX Runtime for fast CPU inference. Model and tokenizer are loaded from the "
        "Hugging Face Hub (Athspi/Gg)."
    ),
    examples=[
        ["Hello, this is a demonstration of fast text-to-speech on CPU."],
        ["This is another example sentence."],
        ["How does this sound to you?"],
    ],
)

if __name__ == "__main__":
    iface.launch()
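
# --- Optional: saving generated speech to a WAV file ---
# A minimal sketch of how the output of generate_speech() could be written to disk
# with scipy.io.wavfile. This helper is not wired into the Gradio app above; the
# function name and the "output.wav" default are illustrative, and in a real script
# this would typically sit above the __main__ guard.
def save_speech_to_wav(text, path="output.wav"):
    """Generate speech for `text` and write it to `path` as a WAV file."""
    import scipy.io.wavfile  # local import so the app itself does not depend on scipy

    sampling_rate, waveform = generate_speech(text)
    # scipy accepts float32 waveforms and writes them as 32-bit float WAV data
    scipy.io.wavfile.write(path, sampling_rate, waveform)
    return path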