Athspi committed on
Commit 4368215 · verified · 1 Parent(s): aa37cb9

Update app.py

Files changed (1)
  1. app.py +35 -63
app.py CHANGED
@@ -1,74 +1,46 @@
  import gradio as gr
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- # Load model and tokenizer
- model_id = "suayptalha/FastLlama-3.2-3B-Instruct"
- tokenizer = AutoTokenizer.from_pretrained(model_id)
- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     torch_dtype=torch.float16,
-     device_map="auto"
- )
-
- # Explicitly set padding token
- if tokenizer.pad_token is None:
-     tokenizer.pad_token = tokenizer.eos_token
-
- # System prompt
- system_prompt = "You are a friendly assistant named FastLlama."
-
- def format_prompt(message: str, history: list):
-     prompt = f"<|system|>\n{system_prompt}</s>\n"
-     for user_msg, bot_msg in history:
-         prompt += f"<|user|>\n{user_msg}</s>\n<|assistant|>\n{bot_msg}</s>\n"
-     prompt += f"<|user|>\n{message}</s>\n<|assistant|>\n"
-     return prompt
-
- def respond(message: str, history: list):
-     # Format the prompt with chat history
-     full_prompt = format_prompt(message, history)
-
-     # Tokenize input with attention mask
-     inputs = tokenizer(
-         full_prompt,
-         return_tensors="pt",
-         padding=True,
-         truncation=True
-     ).to(model.device)
-
-     # Generate response with attention mask
-     output = model.generate(
-         inputs.input_ids,
-         attention_mask=inputs.attention_mask,
-         max_new_tokens=256,
-         temperature=0.7,
-         top_p=0.9,
-         repetition_penalty=1.1,
-         do_sample=True,
-         pad_token_id=tokenizer.pad_token_id
-     )
-
-     # Decode response while skipping special tokens
-     response = tokenizer.decode(
-         output[0][inputs.input_ids.shape[-1]:],
-         skip_special_tokens=True
-     )
-
-     return response
-
- # Create chat interface
- chat = gr.ChatInterface(
-     fn=respond,
-     title="FastLlama-3.2B Chat",
-     description="Chat with FastLlama-3.2-3B-Instruct AI assistant",
+ from transformers import AutoTokenizer
+ import onnxruntime
+ import scipy.io.wavfile
+ import numpy as np
+ import torch  # Import torch - might be needed for tokenizer output
+
+ # --- Load tokenizer and ONNX model from Hugging Face Hub ---
+ repo_id = "Athspi/Gg"  # Replace with your actual repo ID if different
+
+ tokenizer = AutoTokenizer.from_pretrained(repo_id)
+ onnx_model_path = f"{repo_id}/mms_tts_eng/model_quantized.onnx"  # Path to quantized ONNX model inside the repo
+ ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
+
+
+ # --- Speech generation function ---
+ def generate_speech(text):
+     """Generates speech from text using the loaded ONNX model."""
+     inputs = tokenizer(text, return_tensors="pt")
+     input_ids = inputs.input_ids.cpu().to(torch.long)  # Ensure LongTensor for ONNX
+
+     # Run inference with ONNX Runtime
+     onnx_outputs = ort_session.run(None, {"input_ids": input_ids.numpy()})
+     waveform = onnx_outputs[0]  # Output waveform
+
+     sampling_rate = 16000  # Assuming 16kHz, adjust if your model uses different rate
+
+     return sampling_rate, waveform.squeeze()  # Return sample rate and waveform
+
+
+ # --- Gradio Interface ---
+ iface = gr.Interface(
+     fn=generate_speech,
+     inputs=gr.Textbox(lines=2, placeholder="Enter text to synthesize..."),
+     outputs=gr.Audio(label="Generated Speech"),
+     title="Fast MMS-TTS-ENG Text-to-Speech (CPU)",
+     description="Real-time Text-to-Speech using the optimized facebook/mms-tts-eng model with ONNX Runtime for fast CPU inference. Model and tokenizer loaded from Hugging Face Hub (Athspi/Gg).",
      examples=[
-         ["Explain quantum computing in simple terms"],
-         ["Write a poem about artificial intelligence"],
-         ["What's the meaning of life?"]
-     ],
-     cache_examples=False
+         ["Hello, this is a demonstration of fast text-to-speech on CPU."],
+         ["This is another example sentence."],
+         ["How does this sound to you?"]
+     ]
  )

  if __name__ == "__main__":
-     chat.launch(server_name="0.0.0.0")
+     iface.launch()
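
A note on the new loading code, not part of the commit itself: onnxruntime.InferenceSession expects a local file path, so the f-string path "Athspi/Gg/mms_tts_eng/model_quantized.onnx" only resolves if that directory already exists next to app.py. If the quantized ONNX file instead lives in the Athspi/Gg Hub repo, a minimal sketch of fetching it first with huggingface_hub.hf_hub_download could look like this (the mms_tts_eng subfolder and model_quantized.onnx filename are assumptions taken from the path used in the diff):

# Sketch only (not part of this commit): resolve the quantized ONNX file to a
# local path via the Hub cache before creating the InferenceSession.
# Subfolder and filename below are assumed from the path in the diff above.
import onnxruntime
from huggingface_hub import hf_hub_download

onnx_model_path = hf_hub_download(
    repo_id="Athspi/Gg",
    subfolder="mms_tts_eng",          # assumed repo layout
    filename="model_quantized.onnx",  # assumed filename
)
ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])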
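
The diff also imports scipy.io.wavfile and numpy without using them, presumably with saving the generated audio in mind. A small usage sketch under that assumption, reusing generate_speech from the new app.py:

# Hypothetical use of the otherwise-unused scipy.io.wavfile import:
# synthesize a sentence and write the waveform to a 32-bit float WAV file.
# Assumes generate_speech from the new app.py is defined in this scope.
import numpy as np
import scipy.io.wavfile

sampling_rate, waveform = generate_speech("Hello, this is a demonstration of fast text-to-speech on CPU.")
scipy.io.wavfile.write("output.wav", sampling_rate, np.asarray(waveform, dtype=np.float32))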