Athspi committed on
Commit 4368215 · verified · 1 Parent(s): aa37cb9

Update app.py

Files changed (1)
  1. app.py +35 -63
app.py CHANGED
@@ -1,74 +1,46 @@
  import gradio as gr
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- # Load model and tokenizer
- model_id = "suayptalha/FastLlama-3.2-3B-Instruct"
- tokenizer = AutoTokenizer.from_pretrained(model_id)
- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     torch_dtype=torch.float16,
-     device_map="auto"
- )
-
- # Explicitly set padding token
- if tokenizer.pad_token is None:
-     tokenizer.pad_token = tokenizer.eos_token
-
- # System prompt
- system_prompt = "You are a friendly assistant named FastLlama."
-
- def format_prompt(message: str, history: list):
-     prompt = f"<|system|>\n{system_prompt}</s>\n"
-     for user_msg, bot_msg in history:
-         prompt += f"<|user|>\n{user_msg}</s>\n<|assistant|>\n{bot_msg}</s>\n"
-     prompt += f"<|user|>\n{message}</s>\n<|assistant|>\n"
-     return prompt
-
- def respond(message: str, history: list):
-     # Format the prompt with chat history
-     full_prompt = format_prompt(message, history)
-
-     # Tokenize input with attention mask
-     inputs = tokenizer(
-         full_prompt,
-         return_tensors="pt",
-         padding=True,
-         truncation=True
-     ).to(model.device)
-
-     # Generate response with attention mask
-     output = model.generate(
-         inputs.input_ids,
-         attention_mask=inputs.attention_mask,
-         max_new_tokens=256,
-         temperature=0.7,
-         top_p=0.9,
-         repetition_penalty=1.1,
-         do_sample=True,
-         pad_token_id=tokenizer.pad_token_id
-     )
-
-     # Decode response while skipping special tokens
-     response = tokenizer.decode(
-         output[0][inputs.input_ids.shape[-1]:],
-         skip_special_tokens=True
-     )
-
-     return response
-
- # Create chat interface
- chat = gr.ChatInterface(
-     fn=respond,
-     title="FastLlama-3.2B Chat",
-     description="Chat with FastLlama-3.2-3B-Instruct AI assistant",
+ from transformers import AutoTokenizer
+ import onnxruntime
+ import scipy.io.wavfile
+ import numpy as np
+ import torch  # Import torch - might be needed for tokenizer output
+
+ # --- Load tokenizer and ONNX model from Hugging Face Hub ---
+ repo_id = "Athspi/Gg"  # Replace with your actual repo ID if different
+
+ tokenizer = AutoTokenizer.from_pretrained(repo_id)
+ onnx_model_path = f"{repo_id}/mms_tts_eng/model_quantized.onnx"  # Path to quantized ONNX model inside the repo
+ ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
+
+
+ # --- Speech generation function ---
+ def generate_speech(text):
+     """Generates speech from text using the loaded ONNX model."""
+     inputs = tokenizer(text, return_tensors="pt")
+     input_ids = inputs.input_ids.cpu().to(torch.long)  # Ensure LongTensor for ONNX
+
+     # Run inference with ONNX Runtime
+     onnx_outputs = ort_session.run(None, {"input_ids": input_ids.numpy()})
+     waveform = onnx_outputs[0]  # Output waveform
+
+     sampling_rate = 16000  # Assuming 16kHz, adjust if your model uses different rate
+
+     return sampling_rate, waveform.squeeze()  # Return sample rate and waveform
+
+
+ # --- Gradio Interface ---
+ iface = gr.Interface(
+     fn=generate_speech,
+     inputs=gr.Textbox(lines=2, placeholder="Enter text to synthesize..."),
+     outputs=gr.Audio(label="Generated Speech"),
+     title="Fast MMS-TTS-ENG Text-to-Speech (CPU)",
+     description="Real-time Text-to-Speech using the optimized facebook/mms-tts-eng model with ONNX Runtime for fast CPU inference. Model and tokenizer loaded from Hugging Face Hub (Athspi/Gg).",
      examples=[
-         ["Explain quantum computing in simple terms"],
-         ["Write a poem about artificial intelligence"],
-         ["What's the meaning of life?"]
-     ],
-     cache_examples=False
+         ["Hello, this is a demonstration of fast text-to-speech on CPU."],
+         ["This is another example sentence."],
+         ["How does this sound to you?"]
+     ]
  )

  if __name__ == "__main__":
-     chat.launch(server_name="0.0.0.0")
+     iface.launch()
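
A note on the new loading code, not part of the commit itself: onnxruntime.InferenceSession expects a local file path, so the f-string path "Athspi/Gg/mms_tts_eng/model_quantized.onnx" only resolves if that directory already exists next to app.py. If the quantized ONNX file instead lives in the Athspi/Gg Hub repo, a minimal sketch of fetching it first with huggingface_hub.hf_hub_download could look like this (the mms_tts_eng subfolder and model_quantized.onnx filename are assumptions taken from the path used in the diff):

# Sketch only (not part of this commit): resolve the quantized ONNX file to a
# local path via the Hub cache before creating the InferenceSession.
# Subfolder and filename below are assumed from the path in the diff above.
import onnxruntime
from huggingface_hub import hf_hub_download

onnx_model_path = hf_hub_download(
    repo_id="Athspi/Gg",
    subfolder="mms_tts_eng",          # assumed repo layout
    filename="model_quantized.onnx",  # assumed filename
)
ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])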
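
The diff also imports scipy.io.wavfile and numpy without using them, presumably with saving the generated audio in mind. A small usage sketch under that assumption, reusing generate_speech from the new app.py:

# Hypothetical use of the otherwise-unused scipy.io.wavfile import:
# synthesize a sentence and write the waveform to a 32-bit float WAV file.
# Assumes generate_speech from the new app.py is defined in this scope.
import numpy as np
import scipy.io.wavfile

sampling_rate, waveform = generate_speech("Hello, this is a demonstration of fast text-to-speech on CPU.")
scipy.io.wavfile.write("output.wav", sampling_rate, np.asarray(waveform, dtype=np.float32))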