Athspi committed on
Commit
6417dc9
·
verified ·
1 Parent(s): 49089da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -30
app.py CHANGED
@@ -1,45 +1,64 @@
 
1
  import gradio as gr
 
 
2
  from transformers import AutoTokenizer
3
  import onnxruntime
4
  import scipy.io.wavfile
5
- import numpy as np
6
- import torch # Import torch - might be needed for tokenizer output
7
 
8
- # --- Load tokenizer and ONNX model from Hugging Face Hub ---
9
- repo_id = "Athspi/Gg" # Correct repo ID
 
10
 
11
- tokenizer = AutoTokenizer.from_pretrained(repo_id)
12
- onnx_model_path = f"{repo_id}/mms_tts_eng.onnx" # Path to ONNX model in repo root
13
- ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
14
 
 
 
15
 
16
- # --- Speech generation function ---
17
- def generate_speech(text):
18
- """Generates speech from text using the loaded ONNX model."""
19
- inputs = tokenizer(text, return_tensors="pt")
20
- input_ids = inputs.input_ids.cpu().to(torch.long) # Ensure LongTensor for ONNX
21
-
22
- # Run inference with ONNX Runtime
23
- onnx_outputs = ort_session.run(None, {"input_ids": input_ids.numpy()})
24
- waveform = onnx_outputs[0] # Output waveform
25
-
26
- sampling_rate = 16000 # Assuming 16kHz, adjust if your model uses different rate
27
 
28
- return sampling_rate, waveform.squeeze() # Return sample rate and waveform
 
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- # --- Gradio Interface ---
32
  iface = gr.Interface(
33
- fn=generate_speech,
34
- inputs=gr.Textbox(lines=2, placeholder="Enter text to synthesize..."),
35
- outputs=gr.Audio(label="Generated Speech"),
36
- title="Fast MMS-TTS-ENG Text-to-Speech (CPU)",
37
- description="Real-time Text-to-Speech using the optimized facebook/mms-tts-eng model with ONNX Runtime for fast CPU inference. Model and tokenizer loaded from Hugging Face Hub (Athspi/Gg).",
38
- examples=[
39
- ["Hello, this is a demonstration of fast text-to-speech on CPU."],
40
- ["This is another example sentence."],
41
- ["How does this sound to you?"]
42
- ]
43
  )
44
 
45
  if __name__ == "__main__":
 
1
+ import os
2
  import gradio as gr
3
+ import torch
4
+ import numpy as np
5
  from transformers import AutoTokenizer
6
  import onnxruntime
7
  import scipy.io.wavfile
 
 
8
 
9
# Hugging Face repository id (or local directory) that holds both the
# tokenizer files and the ONNX model file.
model_dir = "Athspi/Gg"

# Name of the ONNX model file inside `model_dir`. Adjust if it is renamed.
onnx_model_filename = "model_quantized.onnx"

# onnxruntime can only open files on the local filesystem; it does not know
# how to fetch from the Hub. Joining the repo id with the filename therefore
# only works when the repository is checked out next to this script. Keep
# that path when it exists, otherwise resolve (download + cache) the file
# through the transformers cache helper.
onnx_model_path = os.path.join(model_dir, onnx_model_filename)
if not os.path.isfile(onnx_model_path):
    # Imported here so the fallback is self-contained; transformers is
    # already a dependency of this script (AutoTokenizer).
    from transformers.utils import cached_file

    onnx_model_path = cached_file(model_dir, onnx_model_filename)

# Load the tokenizer from the same repository / directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Create the ONNX Runtime session once at import time; inference runs on CPU.
ort_session = onnxruntime.InferenceSession(
    onnx_model_path, providers=["CPUExecutionProvider"]
)

# Output sampling rate in Hz reported to the audio player.
# NOTE(review): assumed to be 16 kHz; confirm against the exported model.
sampling_rate = 16000
27
 
28
def tts_inference(text: str):
    """
    Convert input text to a speech waveform using the ONNX model.

    Parameters:
        text (str): Input text to synthesize.

    Returns:
        tuple: ``(sampling_rate, waveform)`` where ``sampling_rate`` is the
        module-level rate in Hz (int) and ``waveform`` is the squeezed model
        output (np.ndarray). This is the order ``gr.Audio(type="numpy")``
        expects; returning ``(waveform, sampling_rate)`` makes Gradio treat
        the array as the sample rate and fail.
    """
    # Tokenize the input text into PyTorch tensors.
    inputs = tokenizer(text, return_tensors="pt")

    # The ONNX graph is fed int64 ids as a NumPy array.
    input_ids = inputs.input_ids.cpu().to(torch.long).numpy()

    # Run inference; the first output is taken as the waveform.
    onnx_outputs = ort_session.run(None, {"input_ids": input_ids})
    waveform = np.squeeze(onnx_outputs[0])  # drop singleton dimensions

    # Gradio's numpy audio format is (sample_rate, data), in that order.
    return sampling_rate, waveform
54
 
55
+ # Build a Gradio interface.
56
# UI components: a two-line text box in, a NumPy-backed audio player out.
_text_input = gr.Textbox(lines=2, placeholder="Enter text here...")
_audio_output = gr.Audio(type="numpy")

# Wire the synthesis function to the components.
iface = gr.Interface(
    fn=tts_inference,
    inputs=_text_input,
    outputs=_audio_output,
    title="ONNX TTS Demo",
    description="Text-to-Speech synthesis using an ONNX model from the Athspi/Gg repository on Hugging Face.",
)
63
 
64
  if __name__ == "__main__":