import gradio as gr
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
import onnxruntime
import scipy.io.wavfile  # optional: for saving waveforms to disk (see the example below)
import numpy as np
import torch  # used to cast the tokenizer output to int64 for ONNX Runtime

# --- Load tokenizer and ONNX model from the Hugging Face Hub ---
repo_id = "Athspi/Gg"  # Replace with your own repo ID if different
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# InferenceSession needs a local file path, so download the quantized ONNX
# model from the Hub first rather than passing the repo path directly.
onnx_model_path = hf_hub_download(repo_id=repo_id, filename="mms_tts_eng/model_quantized.onnx")
ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])
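
# Optional sanity check: inspect the graph's declared inputs. Helpful if the
# exported model's input tensor is not named "input_ids" as assumed below.
# for inp in ort_session.get_inputs():
#     print(inp.name, inp.shape, inp.type)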

# --- Speech generation function ---
def generate_speech(text):
    """Generate speech from text using the loaded ONNX model."""
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs.input_ids.cpu().to(torch.long)  # ONNX Runtime expects int64 input_ids

    # Run inference with ONNX Runtime
    onnx_outputs = ort_session.run(None, {"input_ids": input_ids.numpy()})
    waveform = onnx_outputs[0]

    sampling_rate = 16000  # MMS-TTS outputs 16 kHz audio; adjust if your model differs
    return sampling_rate, waveform.squeeze()  # (sample rate, 1-D waveform) for gr.Audio
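
# --- Optional: standalone usage, saving the output to a WAV file ---
# A minimal sketch (the "output.wav" path and example text are illustrative);
# scipy.io.wavfile.write accepts (path, rate, numpy array).
# sr, wav = generate_speech("Hello, this is a test.")
# scipy.io.wavfile.write("output.wav", sr, wav)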

# --- Gradio Interface ---
iface = gr.Interface(
    fn=generate_speech,
    inputs=gr.Textbox(lines=2, placeholder="Enter text to synthesize..."),
    outputs=gr.Audio(label="Generated Speech"),
    title="Fast MMS-TTS-ENG Text-to-Speech (CPU)",
    description=(
        "Real-time text-to-speech using the optimized facebook/mms-tts-eng model "
        "with ONNX Runtime for fast CPU inference. Model and tokenizer are loaded "
        "from the Hugging Face Hub (Athspi/Gg)."
    ),
    examples=[
        ["Hello, this is a demonstration of fast text-to-speech on CPU."],
        ["This is another example sentence."],
        ["How does this sound to you?"],
    ],
)

if __name__ == "__main__":
    iface.launch()
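
# Note: when running locally rather than on Hugging Face Spaces, passing
# share=True to iface.launch() creates a temporary public link.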