import gradio as gr
import numpy as np
import onnxruntime
import scipy.io.wavfile
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

# Hugging Face Hub repo hosting both the tokenizer and the exported ONNX model.
repo_id = "Athspi/Gg"

tokenizer = AutoTokenizer.from_pretrained(repo_id)

# InferenceSession needs a local file path, so fetch the quantized model from
# the Hub first rather than passing the repo-relative path directly.
onnx_model_path = hf_hub_download(repo_id=repo_id, filename="mms_tts_eng/model_quantized.onnx")
ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])
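
# Optional sanity check (a sketch, not part of the original app): ONNX exports
# do not always name the text input "input_ids", so it can be worth printing
# the session's declared inputs once before relying on the feed dict below.
# Uses only the standard onnxruntime InferenceSession API:
#
#     for model_input in ort_session.get_inputs():
#         print(model_input.name, model_input.shape, model_input.type)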


def generate_speech(text):
    """Generate speech from text using the loaded ONNX model."""
    inputs = tokenizer(text, return_tensors="pt")
    # ONNX Runtime expects int64 token ids, so cast before converting to NumPy.
    input_ids = inputs.input_ids.cpu().to(torch.long)

    onnx_outputs = ort_session.run(None, {"input_ids": input_ids.numpy()})
    waveform = onnx_outputs[0]

    # facebook/mms-tts-eng synthesizes audio at a fixed 16 kHz sampling rate.
    sampling_rate = 16000

    # gr.Audio accepts a (sampling_rate, numpy_array) tuple.
    return sampling_rate, waveform.squeeze()
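
# The otherwise-unused scipy.io.wavfile import suggests saving audio to disk
# was intended. Below is a minimal sketch of such a helper; the function name
# and the int16 conversion are assumptions, not part of the original app.
def save_wav(path, sampling_rate, waveform):
    """Hypothetical helper: write a float waveform to a 16-bit PCM WAV file."""
    # Scale float audio in [-1.0, 1.0] to the int16 range WAV players expect.
    pcm = np.clip(waveform, -1.0, 1.0)
    scipy.io.wavfile.write(path, sampling_rate, (pcm * 32767).astype(np.int16))
    # Example: save_wav("speech.wav", *generate_speech("Hello world"))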


iface = gr.Interface(
    fn=generate_speech,
    inputs=gr.Textbox(lines=2, placeholder="Enter text to synthesize..."),
    outputs=gr.Audio(label="Generated Speech"),
    title="Fast MMS-TTS-ENG Text-to-Speech (CPU)",
    description=(
        "Real-time text-to-speech using the optimized facebook/mms-tts-eng model "
        "with ONNX Runtime for fast CPU inference. Model and tokenizer are loaded "
        "from the Hugging Face Hub (Athspi/Gg)."
    ),
    examples=[
        ["Hello, this is a demonstration of fast text-to-speech on CPU."],
        ["This is another example sentence."],
        ["How does this sound to you?"],
    ],
)

if __name__ == "__main__":
    iface.launch()