Spaces:

Athspi
/

Tg

Sleeping

App Files Files Community

Athspi committed on Mar 19

Commit

f07e098

verified ·

1 Parent(s): 745e842

Create app.py

Browse files

Files changed (1) hide show

app.py +159 -0

app.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import gradio as gr
+import torch
+import numpy as np
+from transformers import AutoTokenizer
+import onnxruntime
+from huggingface_hub import hf_hub_download
+# --- Configuration ---
+repo_id = "Athspi/Gg"  # Your Hugging Face Hub repository ID
+onnx_filename = "mms_tts_eng.onnx"  # Name of the ONNX file in the repository
+sampling_rate = 16000  # Sampling rate of the model (adjust if needed)
+# --- Load Model and Tokenizer ---
+# Download the ONNX model (using hf_hub_download for caching)
+onnx_model_path = hf_hub_download(repo_id=repo_id, filename=onnx_filename)
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained(repo_id)
+# --- ONNX Runtime Session Setup with Optimization ---
+session_options = onnxruntime.SessionOptions()
+# Optimization level: Use all available optimizations
+session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+# Threading:  Set intra_op_num_threads to the number of *physical* cores
+#              (You'll need to determine this for your system).  Here's a
+#              way to get it programmatically (but it might not be 100%
+#              reliable on all systems).
+try:
+    import psutil
+    num_physical_cores = psutil.cpu_count(logical=False)
+except ImportError:
+    print("psutil not installed.  You can install it with: pip install psutil")
+    num_physical_cores = 4  # Set a reasonable default (e.g., 4)
+    print(f"Using default number of physical cores: {num_physical_cores}")
+session_options.intra_op_num_threads = num_physical_cores
+session_options.inter_op_num_threads = 1 # Usually best for TTS to be 1 or 2
+# Create the ONNX Runtime inference session
+ort_session = onnxruntime.InferenceSession(
+    onnx_model_path,
+    providers=['CPUExecutionProvider'],  # You can try other providers if available
+    sess_options=session_options,
+)
+# --- IO Binding Setup ---
+io_binding = ort_session.io_binding()
+# Get input/output metadata
+input_meta = ort_session.get_inputs()[0]
+output_meta = ort_session.get_outputs()[0]
+# Dummy input for shape/type
+dummy_input = tokenizer("a", return_tensors="pt")["input_ids"].to(torch.long)
+input_shape = tuple(dummy_input.shape)
+input_type = dummy_input.numpy().dtype
+# Pre-allocate input tensor (CPU, contiguous)
+input_tensor = torch.empty(input_shape, dtype=torch.int64, device="cpu").contiguous()
+# Pre-allocate output tensor (CPU, contiguous) - estimate max size
+max_output_length = input_shape[1] * 10  # Adjust factor as needed
+output_shape = (1, 1, max_output_length)
+output_tensor = torch.empty(output_shape, dtype=torch.float32, device="cpu").contiguous()
+# Bind the pre-allocated tensors
+io_binding.bind_input(
+    name=input_meta.name,
+    device_type="cpu",
+    device_id=0,
+    element_type=input_type,
+    shape=input_shape,
+    buffer_ptr=input_tensor.data_ptr(),
+)
+io_binding.bind_output(
+    name=output_meta.name,
+    device_type="cpu",
+    device_id=0,
+    element_type=np.float32,
+    shape=output_shape,
+    buffer_ptr=output_tensor.data_ptr(),
+)
+# --- Inference Function (with IO Binding) ---
+def tts_inference_io_binding(text: str):
+    """TTS inference with IO Binding."""
+    global input_tensor, output_tensor, io_binding
+    inputs = tokenizer(text, return_tensors="pt")
+    input_ids = inputs.input_ids.to(torch.long)
+    current_input_shape = tuple(input_ids.shape)
+    # Resize input tensor if necessary
+    if current_input_shape[1] > input_tensor.shape[1]:
+        input_tensor = torch.empty(current_input_shape, dtype=torch.int64, device="cpu").contiguous()
+        io_binding.bind_input(
+            name=input_meta.name,
+            device_type="cpu",
+            device_id=0,
+            element_type=input_type,
+            shape=current_input_shape,
+            buffer_ptr=input_tensor.data_ptr(),
+        )
+    # Copy input data
+    input_tensor[:current_input_shape[0], :current_input_shape[1]].copy_(input_ids)
+    # Resize output tensor if necessary
+    required_output_length = current_input_shape[1] * 10
+    if required_output_length > output_tensor.shape[2]:
+        output_shape = (1, 1, required_output_length)
+        output_tensor = torch.empty(output_shape, dtype=torch.float32, device="cpu").contiguous()
+        io_binding.bind_output(
+            name=output_meta.name,
+            device_type="cpu",
+            device_id=0,
+            element_type=np.float32,
+            shape=output_shape,
+            buffer_ptr=output_tensor.data_ptr(),
+        )
+    # Clear binding
+    io_binding.clear_binding_outputs()
+    # Run inference
+    ort_session.run_with_iobinding(io_binding)
+    # Get output
+    ort_outputs = io_binding.get_outputs()
+    output_data = ort_outputs[0].numpy()
+    return (sampling_rate, output_data.squeeze())
+# --- Gradio Interface ---
+iface = gr.Interface(
+    fn=tts_inference_io_binding,
+    inputs=gr.Textbox(lines=3, placeholder="Enter text here..."),  # Slightly larger textbox
+    outputs=gr.Audio(type="numpy", label="Generated Speech"),
+    title="Optimized MMS-TTS (English) with ONNX Runtime",
+    description="Fast Text-to-Speech using the facebook/mms-tts-eng model, optimized with ONNX Runtime and IO Binding.  Model loaded from Hugging Face Hub.",
+    examples=[
+        ["Hello, this is a demonstration of optimized text-to-speech."],
+        ["This model uses ONNX Runtime and IO Binding for fast CPU inference."],
+        ["The quick brown fox jumps over the lazy dog."],
+        ["Try entering your own text to hear how it sounds!"]
+    ],
+    cache_examples=False,  # Disable example caching (important for dynamic TTS)
+)
+if __name__ == "__main__":
+    iface.launch()