import gradio as gr
import numpy as np
import onnxruntime
import scipy.io.wavfile
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

# Hugging Face Hub repo hosting both the tokenizer and the exported ONNX model.
repo_id = "Athspi/Gg"

tokenizer = AutoTokenizer.from_pretrained(repo_id)

# InferenceSession needs a local file path, so fetch the quantized model from
# the Hub first rather than passing the repo-relative path directly.
onnx_model_path = hf_hub_download(repo_id=repo_id, filename="mms_tts_eng/model_quantized.onnx")
ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])
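
# Optional sanity check (a sketch, not part of the original app): ONNX exports
# do not always name the text input "input_ids", so it can be worth printing
# the session's declared inputs once before relying on the feed dict below.
# Uses only the standard onnxruntime InferenceSession API:
#
#     for model_input in ort_session.get_inputs():
#         print(model_input.name, model_input.shape, model_input.type)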


def generate_speech(text):
    """Generate speech from text using the loaded ONNX model."""
    inputs = tokenizer(text, return_tensors="pt")
    # ONNX Runtime expects int64 token ids, so cast before converting to NumPy.
    input_ids = inputs.input_ids.cpu().to(torch.long)

    onnx_outputs = ort_session.run(None, {"input_ids": input_ids.numpy()})
    waveform = onnx_outputs[0]

    # facebook/mms-tts-eng synthesizes audio at a fixed 16 kHz sampling rate.
    sampling_rate = 16000

    # gr.Audio accepts a (sampling_rate, numpy_array) tuple.
    return sampling_rate, waveform.squeeze()
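
# The otherwise-unused scipy.io.wavfile import suggests saving audio to disk
# was intended. Below is a minimal sketch of such a helper; the function name
# and the int16 conversion are assumptions, not part of the original app.
def save_wav(path, sampling_rate, waveform):
    """Hypothetical helper: write a float waveform to a 16-bit PCM WAV file."""
    # Scale float audio in [-1.0, 1.0] to the int16 range WAV players expect.
    pcm = np.clip(waveform, -1.0, 1.0)
    scipy.io.wavfile.write(path, sampling_rate, (pcm * 32767).astype(np.int16))
    # Example: save_wav("speech.wav", *generate_speech("Hello world"))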


iface = gr.Interface(
    fn=generate_speech,
    inputs=gr.Textbox(lines=2, placeholder="Enter text to synthesize..."),
    outputs=gr.Audio(label="Generated Speech"),
    title="Fast MMS-TTS-ENG Text-to-Speech (CPU)",
    description=(
        "Real-time text-to-speech using the optimized facebook/mms-tts-eng model "
        "with ONNX Runtime for fast CPU inference. Model and tokenizer are loaded "
        "from the Hugging Face Hub (Athspi/Gg)."
    ),
    examples=[
        ["Hello, this is a demonstration of fast text-to-speech on CPU."],
        ["This is another example sentence."],
        ["How does this sound to you?"],
    ],
)

if __name__ == "__main__":
    iface.launch()