Spaces:

Athspi
/

Tttt

Sleeping

App Files Files Community

Athspi committed on Mar 19

Commit

6417dc9

verified ·

1 Parent(s): 49089da

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -30

app.py CHANGED Viewed

@@ -1,45 +1,64 @@
 import gradio as gr
 from transformers import AutoTokenizer
 import onnxruntime
 import scipy.io.wavfile
-import numpy as np
-import torch  # Import torch - might be needed for tokenizer output
-# --- Load tokenizer and ONNX model from Hugging Face Hub ---
-# NOTE(review): onnx_model_path is a plain string built from repo_id, so
-# InferenceSession receives "Athspi/Gg/mms_tts_eng.onnx" as-is. Presumably this
-# only resolves if the repo files exist at that relative path locally - confirm.
-repo_id = "Athspi/Gg"  # Correct repo ID
-tokenizer = AutoTokenizer.from_pretrained(repo_id)
-onnx_model_path = f"{repo_id}/mms_tts_eng.onnx" # Path to ONNX model in repo root
-ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
-# --- Speech generation function ---
-def generate_speech(text):
-    """Generates speech from text using the loaded ONNX model.
-
-    Tokenizes ``text``, passes the resulting ``input_ids`` to ``ort_session``
-    and returns ``(sampling_rate, waveform)``, where ``waveform`` is the first
-    model output with singleton dimensions squeezed away.
-    """
-    inputs = tokenizer(text, return_tensors="pt")
-    input_ids = inputs.input_ids.cpu().to(torch.long)  # Ensure LongTensor for ONNX
-    # Run inference with ONNX Runtime
-    onnx_outputs = ort_session.run(None, {"input_ids": input_ids.numpy()})
-    waveform = onnx_outputs[0]  # Output waveform
-    sampling_rate = 16000 # Assuming 16kHz, adjust if your model uses different rate
-    return sampling_rate, waveform.squeeze()  # Return sample rate and waveform
-# --- Gradio Interface ---
-# One text box as input and one gr.Audio output; generate_speech serves each request.
 iface = gr.Interface(
-    fn=generate_speech,
-    inputs=gr.Textbox(lines=2, placeholder="Enter text to synthesize..."),
-    outputs=gr.Audio(label="Generated Speech"),
-    title="Fast MMS-TTS-ENG Text-to-Speech (CPU)",
-    description="Real-time Text-to-Speech using the optimized facebook/mms-tts-eng model with ONNX Runtime for fast CPU inference. Model and tokenizer loaded from Hugging Face Hub (Athspi/Gg).",
-    examples=[
-        ["Hello, this is a demonstration of fast text-to-speech on CPU."],
-        ["This is another example sentence."],
-        ["How does this sound to you?"]
-    ]
 )
 if __name__ == "__main__":

+import os
 import gradio as gr
+import torch
+import numpy as np
 from transformers import AutoTokenizer
 import onnxruntime
 import scipy.io.wavfile
+# Specify the Hugging Face repository/model directory.
+# This repository (Athspi/Gg) should contain the tokenizer files and the ONNX model file.
+model_dir = "Athspi/Gg"
+# Define the ONNX model filename. Adjust the filename if needed.
+onnx_model_filename = "model_quantized.onnx"
+# NOTE(review): os.path.join produces the local relative path
+# "Athspi/Gg/model_quantized.onnx". Presumably InferenceSession does not fetch
+# files from the Hub, so this only works if the repo is present at that path
+# on disk - confirm, or download the file explicitly before creating the session.
+onnx_model_path = os.path.join(model_dir, onnx_model_filename)
+# Load the tokenizer from the Hugging Face model repository
+tokenizer = AutoTokenizer.from_pretrained(model_dir)
+# Initialize the ONNX runtime session for inference (CPU execution provider only).
+ort_session = onnxruntime.InferenceSession(
+    onnx_model_path, providers=['CPUExecutionProvider']
+)
+# Define the fixed sampling rate (adjust if your model uses a different rate)
+sampling_rate = 16000
+def tts_inference(text: str):
+    """
+    Convert input text to a speech waveform using the ONNX model.
+
+    Parameters:
+        text (str): Input text to synthesize.
+
+    Returns:
+        tuple: ``(sampling_rate, waveform)`` - the sampling rate (int) followed
+        by the synthesized waveform (np.ndarray). The rate comes first because
+        that is the tuple order a ``gr.Audio(type="numpy")`` output expects.
+    """
+    # Tokenize the input text.
+    inputs = tokenizer(text, return_tensors="pt")
+    # The session is fed a NumPy array, so convert the token ids (cast to long).
+    input_ids = inputs.input_ids.cpu().to(torch.long).numpy()
+    # Run inference; passing None as the output list requests all model outputs.
+    onnx_outputs = ort_session.run(None, {"input_ids": input_ids})
+    # The first output is taken as the waveform; squeeze drops singleton axes.
+    waveform = np.squeeze(onnx_outputs[0])
+    # Sample rate first, then audio data: returning them the other way round
+    # makes the audio output treat the waveform as the sample rate.
+    return sampling_rate, waveform
+# Build a Gradio interface.
+# NOTE(review): a gr.Audio(type="numpy") output presumably expects fn to return
+# (sample_rate, data) in that order - confirm against tts_inference's return value.
 iface = gr.Interface(
+    fn=tts_inference,
+    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
+    outputs=gr.Audio(type="numpy"),
+    title="ONNX TTS Demo",
+    description="Text-to-Speech synthesis using an ONNX model from the Athspi/Gg repository on Hugging Face."
 )
 if __name__ == "__main__":