|
import os

import gradio as gr
import numpy as np
import onnxruntime
import scipy.io.wavfile
import torch
from transformers import AutoTokenizer
from transformers.utils import cached_file
|
|
|
|
|
|
|
# Hugging Face Hub repository id (or a local directory with the same layout)
# that provides both the tokenizer files and the exported ONNX model.
model_dir = "Athspi/Gg"

# File name of the ONNX model inside ``model_dir``.
onnx_model_filename = "mms_tts_eng.onnx"

# onnxruntime opens the model from the local filesystem, so a bare
# "<repo id>/<file>" string only works when the repository has been cloned
# next to this script. Keep that path when the file is really there;
# otherwise resolve (and, if needed, download) it through the Hub cache.
onnx_model_path = os.path.join(model_dir, onnx_model_filename)
if not os.path.isfile(onnx_model_path):
    onnx_model_path = cached_file(model_dir, onnx_model_filename)

# Tokenizer that turns input text into the ``input_ids`` fed to the model.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Inference session restricted to the CPU execution provider.
ort_session = onnxruntime.InferenceSession(
    onnx_model_path, providers=["CPUExecutionProvider"]
)

# Sample rate (Hz) reported alongside every synthesized waveform.
# NOTE(review): assumed to match the model's native output rate - confirm.
sampling_rate = 16000
|
|
|
def tts_inference(text: str):
    """Synthesize speech for ``text`` with the ONNX TTS model.

    Args:
        text: The text to convert to speech.

    Returns:
        A ``(waveform, sampling_rate)`` tuple. ``waveform`` is the model's
        first output with all length-1 axes squeezed away, and
        ``sampling_rate`` is the module-level sampling rate.
    """
    # Tokenize to PyTorch tensors, then hand ONNX Runtime int64 NumPy ids.
    encoded = tokenizer(text, return_tensors="pt")
    token_ids = encoded["input_ids"].cpu().long().numpy()

    # Only the first model output is used as the audio signal.
    audio = ort_session.run(None, {"input_ids": token_ids})[0]

    return np.squeeze(audio), sampling_rate
|
|
|
|
|
def _tts_for_gradio(text: str):
    """Adapt ``tts_inference`` to the tuple order ``gr.Audio`` expects.

    ``tts_inference`` returns ``(waveform, sampling_rate)``, whereas a
    ``gr.Audio(type="numpy")`` output takes ``(sampling_rate, waveform)``.
    Swapping here keeps the inference function's own contract unchanged.
    """
    waveform, rate = tts_inference(text)
    return rate, waveform


# Gradio UI: a two-line text box in, an audio player out.
iface = gr.Interface(
    fn=_tts_for_gradio,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs=gr.Audio(type="numpy"),
    title="ONNX TTS Demo",
    description="Text-to-Speech synthesis using an ONNX model from the Athspi/Gg repository on Hugging Face.",
)


if __name__ == "__main__":
    # Start the web app only when run as a script, not when imported.
    iface.launch()