EYEDOL committed
Commit aeacff6 · verified · Parent: ca24ea7

Update app.py

Files changed (1): app.py (+65, -150)
app.py CHANGED
@@ -13,184 +13,118 @@ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer
 from scipy.io.wavfile import write as write_wav
 import os
 import re
-
-
-import os
 from huggingface_hub import login
 
-# Read the secret from environment variables
-hf_token = os.environ.get("hugface") # Replace HF_TOKEN with the name of your secret in HF settings
-
-# Login using the token
+# --- Login to Hugging Face using secret ---
+# Make sure HF_TOKEN is set in your Hugging Face Space > Settings > Repository secrets
+hf_token = os.environ.get("HF_TOKEN")
+if not hf_token:
+    raise ValueError("HF_TOKEN not found. Please set it in Hugging Face Space repository secrets.")
 login(token=hf_token)
-
 print("Successfully logged into Hugging Face Hub!")
 
 # --- Configuration ---
-
 STT_MODEL_ID = "EYEDOL/SALAMA_C3"
-
-#Swahili LLM.
-LLM_MODEL_ID = "google/gemma-3-1b-it"
-
-# This is the tokenizer for your ONNX TTS model.
+LLM_MODEL_ID = "google/gemma-3-1b-it"
 TTS_TOKENIZER_ID = "facebook/mms-tts-swh"
 TTS_ONNX_MODEL_PATH = "swahili_tts.onnx"
 
-# Ensure the temporary directory for audio files exists
 TEMP_DIR = "temp"
 os.makedirs(TEMP_DIR, exist_ok=True)
 
 
 class WeeboAssistant:
     def __init__(self):
-        # Audio settings
         self.STT_SAMPLE_RATE = 16000
         self.TTS_SAMPLE_RATE = 16000
-
-        # System prompt for the LLM
-        self.SYSTEM_PROMPT = "Wewe ni msaidizi mwenye akili, jibu swali lililoulizwa kwa UFUPI na kwa usahihi. Jibu kwa lugha ya Kiswahili pekee. Hakuna jibu refu."
-
+        self.SYSTEM_PROMPT = (
+            "Wewe ni msaidizi mwenye akili, jibu swali lililoulizwa kwa UFUPI na kwa usahihi. "
+            "Jibu kwa lugha ya Kiswahili pekee. Hakuna jibu refu."
+        )
         self._init_models()
 
     def _init_models(self):
-        """Initializes all models required for the pipeline."""
         print("Initializing models...")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.torch_dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
         print(f"Using device: {self.device}")
 
-        # --- 1. Initialize Swahili Speech-to-Text (STT/ASR) ---
+        # STT
         print(f"Loading STT model: {STT_MODEL_ID}")
-        try:
-            self.stt_processor = AutoProcessor.from_pretrained(STT_MODEL_ID)
-            self.stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                STT_MODEL_ID,
-                torch_dtype=self.torch_dtype,
-                low_cpu_mem_usage=True,
-                use_safetensors=True
-            )
-            self.stt_model.to(self.device)
-            print("STT model loaded successfully.")
-        except Exception as e:
-            print(f"FATAL: Could not load STT model. Please check the model ID and ensure you have access. Error: {e}")
-            # In a real app, you might want to handle this more gracefully
-            raise
-
-        # --- 2. Initialize Language Model (LLM) ---
+        self.stt_processor = AutoProcessor.from_pretrained(STT_MODEL_ID)
+        self.stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            STT_MODEL_ID,
+            torch_dtype=self.torch_dtype,
+            low_cpu_mem_usage=True,
+            use_safetensors=True
+        ).to(self.device)
+        print("STT model loaded successfully.")
+
+        # LLM
         print(f"Loading LLM: {LLM_MODEL_ID}")
-        try:
-            # We don't need a separate tokenizer for the pipeline
-            self.llm_pipeline = pipeline(
-                "text-generation",
-                model=LLM_MODEL_ID,
-                model_kwargs={"torch_dtype": self.torch_dtype},
-                device=self.device,
-            )
-            print("LLM pipeline loaded successfully.")
-        except Exception as e:
-            print(f"FATAL: Could not load LLM. Error: {e}")
-            raise
+        self.llm_pipeline = pipeline(
+            "text-generation",
+            model=LLM_MODEL_ID,
+            model_kwargs={"torch_dtype": self.torch_dtype},
+            device=self.device,
+        )
+        print("LLM pipeline loaded successfully.")
 
-        # --- 3. Initialize Swahili Text-to-Speech (TTS) ---
+        # TTS
         print(f"Loading TTS model: {TTS_ONNX_MODEL_PATH}")
-        try:
-            # The ONNX model should be in the same repository as app.py
-            self.tts_session = onnxruntime.InferenceSession(
-                TTS_ONNX_MODEL_PATH,
-                providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
-            )
-            self.tts_tokenizer = AutoTokenizer.from_pretrained(TTS_TOKENIZER_ID)
-            print("TTS model and tokenizer loaded successfully.")
-        except Exception as e:
-            print(f"FATAL: Could not load TTS model. Make sure '{TTS_ONNX_MODEL_PATH}' is in the repository. Error: {e}")
-            raise
-
+        self.tts_session = onnxruntime.InferenceSession(
+            TTS_ONNX_MODEL_PATH,
+            providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
+        )
+        self.tts_tokenizer = AutoTokenizer.from_pretrained(TTS_TOKENIZER_ID)
+        print("TTS model and tokenizer loaded successfully.")
+
         print("-" * 30)
         print("All models initialized successfully! ✅")
 
-    def transcribe_audio(self, audio_tuple: tuple) -> str:
-        """
-        Transcribes audio from Gradio's audio component.
-        The input is a tuple (sample_rate, numpy_array).
-        """
+    def transcribe_audio(self, audio_tuple):
        if audio_tuple is None:
            return ""
-
        sample_rate, audio_data = audio_tuple
-
-        # Convert to mono float32
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        if audio_data.dtype != np.float32:
            audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
-
-        # Resample if necessary
        if sample_rate != self.STT_SAMPLE_RATE:
            audio_data = librosa.resample(y=audio_data, orig_sr=sample_rate, target_sr=self.STT_SAMPLE_RATE)
-
-        if len(audio_data) < 1000: # Ignore very short audio clips
+        if len(audio_data) < 1000:
            return "(Audio too short to transcribe)"
-
-        # Process and transcribe
        inputs = self.stt_processor(audio_data, sampling_rate=self.STT_SAMPLE_RATE, return_tensors="pt")
-        inputs = {key: val.to(self.device) for key, val in inputs.items()}
-
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            generated_ids = self.stt_model.generate(**inputs, max_new_tokens=128)
-
        transcription = self.stt_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return transcription.strip()
 
-    def generate_speech(self, text: str) -> str:
-        """
-        Generates audio from text and saves it to a temporary file.
-        Returns the path to the audio file.
-        """
+    def generate_speech(self, text):
        if not text:
            return None
-
-        # Clean text
        text = text.strip()
-
-        try:
-            inputs = self.tts_tokenizer(text, return_tensors="np")
-            input_ids = inputs.input_ids
-            ort_inputs = {self.tts_session.get_inputs()[0].name: input_ids}
-            audio_waveform = self.tts_session.run(None, ort_inputs)[0].flatten()
-
-            # Save to a temporary WAV file
-            output_path = os.path.join(TEMP_DIR, f"{os.urandom(8).hex()}.wav")
-            write_wav(output_path, self.TTS_SAMPLE_RATE, audio_waveform)
-            return output_path
-        except Exception as e:
-            print(f"Error during audio generation: {e}")
-            return None
-
-    def get_llm_response(self, chat_history: list):
-        """
-        Gets a streaming response from the LLM.
-        Yields the updated full response at each step.
-        """
-        # Format messages for the pipeline
-        # The Gemma-2 instruction-tuned model uses a specific turn-based format
+        inputs = self.tts_tokenizer(text, return_tensors="np")
+        ort_inputs = {self.tts_session.get_inputs()[0].name: inputs.input_ids}
+        audio_waveform = self.tts_session.run(None, ort_inputs)[0].flatten()
+        output_path = os.path.join(TEMP_DIR, f"{os.urandom(8).hex()}.wav")
+        write_wav(output_path, self.TTS_SAMPLE_RATE, audio_waveform)
+        return output_path
+
+    def get_llm_response(self, chat_history):
        messages = [{'role': 'system', 'content': self.SYSTEM_PROMPT}]
        for turn in chat_history:
-            messages.append({'role': 'user', 'content': turn[0]})
-            if turn[1] is not None:
+            messages.append({'role': 'user', 'content': turn[0]})
+            if turn[1] is not None:
                messages.append({'role': 'assistant', 'content': turn[1]})
-
        prompt = self.llm_pipeline.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
+            messages, tokenize=False, add_generation_prompt=True
        )
-
        terminators = [
            self.llm_pipeline.tokenizer.eos_token_id,
            self.llm_pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]
-
        streamer = self.llm_pipeline(
            prompt,
            max_new_tokens=512,
@@ -202,59 +136,48 @@ class WeeboAssistant:
        )
        return streamer
 
-# --- Gradio Interface Logic ---
 
-# Instantiate the assistant
 assistant = WeeboAssistant()
 
+
 def s2s_pipeline(audio_input, chat_history):
-    """The main function for the Speech-to-Speech tab."""
-    # 1. Transcribe user's speech
    user_text = assistant.transcribe_audio(audio_input)
    if not user_text or user_text.startswith("("):
        chat_history.append((user_text or "(No valid speech detected)", None))
        yield chat_history, None, "Please record your voice again."
        return
-
    chat_history.append((user_text, None))
-    yield chat_history, None, "..." # Show user text and a thinking indicator
-
-    # 2. Get LLM response as a stream
+    yield chat_history, None, "..."
    response_stream = assistant.get_llm_response(chat_history)
-
-    # Stream the response text to the UI
    llm_response_text = ""
    for text_chunk in response_stream:
        llm_response_text = text_chunk
        chat_history[-1] = (user_text, llm_response_text)
        yield chat_history, None, llm_response_text
-
-    # 3. Synthesize the final LLM response to speech
    final_audio_path = assistant.generate_speech(llm_response_text)
-
-    # 4. Final update to the UI
    yield chat_history, final_audio_path, llm_response_text
 
+
 def t2t_pipeline(text_input, chat_history):
-    """The main function for the Text-to-Text tab."""
    chat_history.append((text_input, None))
    yield chat_history, "..."
-
    response_stream = assistant.get_llm_response(chat_history)
-
    llm_response_text = ""
    for text_chunk in response_stream:
        llm_response_text = text_chunk
        chat_history[-1] = (text_input, llm_response_text)
        yield chat_history, llm_response_text
 
-# --- Build Gradio UI ---
+
+def clear_textbox():
+    return ""
+
+
 with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
    gr.Markdown("# 🤖 Msaidizi wa Sauti wa Kiswahili (Swahili Voice Assistant)")
    gr.Markdown("Ongea na msaidizi kwa Kiswahili. Toa sauti, andika maandishi, na upate majibu kwa sauti au maandishi.")
 
    with gr.Tabs():
-        # Tab 1: Speech-to-Speech
        with gr.TabItem("🎙️ Sauti-kwa-Sauti (Speech-to-Speech)"):
            with gr.Row():
                with gr.Column(scale=2):
@@ -265,32 +188,25 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
                s2s_audio_out = gr.Audio(type="filepath", label="Jibu la Sauti (Audio Response)", autoplay=True)
                s2s_text_out = gr.Textbox(label="Jibu la Maandishi (Text Response)", interactive=False)
 
-        # Tab 2: Text-to-Text
        with gr.TabItem("⌨️ Maandishi-kwa-Maandishi (Text-to-Text)"):
            t2t_chatbot = gr.Chatbot(label="Mazungumzo (Conversation)", bubble_full_width=False, height=500)
            with gr.Row():
                t2t_text_in = gr.Textbox(label="Andika Hapa (Write Here)", placeholder="Habari yako...", scale=4)
                t2t_submit_btn = gr.Button("Tuma (Submit)", variant="primary", scale=1)
 
-        # Tab 3: Direct Tools
        with gr.TabItem("🛠️ Zana (Tools)"):
            with gr.Row():
-                # Speech to Text Tool
                with gr.Column():
                    gr.Markdown("### Unukuzi wa Sauti (Speech Transcription)")
                    tool_s2t_audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Sauti ya Kuingiza (Input Audio)")
                    tool_s2t_text_out = gr.Textbox(label="Maandishi Yaliyonukuliwa (Transcribed Text)", interactive=False)
                    tool_s2t_btn = gr.Button("Nukuu (Transcribe)")
-                # Text to Speech Tool
                with gr.Column():
                    gr.Markdown("### Utengenezaji wa Sauti (Speech Synthesis)")
                    tool_t2s_text_in = gr.Textbox(label="Maandishi ya Kuingiza (Input Text)", placeholder="Andika Kiswahili hapa...")
                    tool_t2s_audio_out = gr.Audio(type="filepath", label="Sauti Iliyotengenezwa (Synthesized Audio)", autoplay=False)
                    tool_t2s_btn = gr.Button("Tengeneza Sauti (Synthesize)")
 
-    # --- Event Handlers ---
-
-    # Speech-to-Speech handler
    s2s_submit_btn.click(
        fn=s2s_pipeline,
        inputs=[s2s_audio_in, s2s_chatbot],
@@ -298,17 +214,17 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
        queue=True
    )
 
-    # Text-to-Text handler
    t2t_submit_btn.click(
        fn=t2t_pipeline,
        inputs=[t2t_text_in, t2t_chatbot],
-        outputs=[t2t_chatbot, t2t_text_in.change(value="")], # Clear input box on submit
+        outputs=[t2t_chatbot, t2t_text_in],
        queue=True
    ).then(
-        lambda x: x, t2t_chatbot, t2t_text_in
-    ) # The text response is streamed directly to the chatbot UI
+        fn=clear_textbox,
+        inputs=None,
+        outputs=t2t_text_in
+    )
 
-    # Tool handlers
    tool_s2t_btn.click(
        fn=assistant.transcribe_audio,
        inputs=tool_s2t_audio_in,
@@ -320,5 +236,4 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
        outputs=tool_t2s_audio_out
    )
 
-    # Launch the Gradio app
-    demo.queue().launch(debug=True)
+demo.queue().launch(debug=True)
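Review note (not part of the commit): the new login block reads the secret from HF_TOKEN and fails fast when it is missing, replacing the old hard-coded "hugface" secret name. A minimal sketch of a slightly more forgiving variant, assuming the same HF_TOKEN secret name, that still lets the app start locally when no token is configured:

import os
from huggingface_hub import login

hf_token = os.environ.get("HF_TOKEN")  # same secret name the commit assumes
if hf_token:
    login(token=hf_token)
    print("Successfully logged into Hugging Face Hub!")
else:
    # Local runs without the secret can still bring up the UI; gated models
    # (e.g. google/gemma-3-1b-it) will simply fail to download later.
    print("HF_TOKEN not set; skipping Hugging Face login.")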
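Review note (not part of the commit): transcribe_audio normalizes integer PCM with np.iinfo(audio_data.dtype).max, which raises if the incoming samples are ever a float dtype other than float32, because np.iinfo only accepts integer dtypes. A hedged sketch of a more defensive conversion, keeping the function's existing behavior for the usual int16 case:

import numpy as np

def to_float32_mono(audio_data: np.ndarray) -> np.ndarray:
    # Collapse stereo to mono first, as app.py already does.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    # Only integer dtypes have an iinfo; float inputs just need a cast.
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    else:
        audio_data = audio_data.astype(np.float32)
    return audio_data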
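Review note (not part of the commit): get_llm_response still names its return value streamer, but a plain pipeline("text-generation") call returns the finished generations, so the "for text_chunk in response_stream" loops in s2s_pipeline and t2t_pipeline iterate over a completed output list rather than over tokens as they are produced. It is also worth double-checking the "<|eot_id|>" terminator, which is the Llama 3 end-of-turn token and may not exist in the Gemma tokenizer. If incremental streaming is actually wanted, a sketch using transformers' TextIteratorStreamer (the helper name stream_llm_response is illustrative only):

from threading import Thread
from transformers import TextIteratorStreamer

def stream_llm_response(llm_pipeline, messages, max_new_tokens=512):
    tokenizer = llm_pipeline.tokenizer
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(llm_pipeline.model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Run generation in a background thread; the streamer yields decoded text
    # chunks as they are produced.
    thread = Thread(target=llm_pipeline.model.generate,
                    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens))
    thread.start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial
    thread.join()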
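Review note (not part of the commit): the old outputs=[t2t_chatbot, t2t_text_in.change(value="")] never cleared anything, since .change() registers an event listener at build time rather than setting a value (and may even error), so chaining a clear_textbox step via .then() is the right direction. With outputs=[t2t_chatbot, t2t_text_in], though, the streamed reply is also written into the input textbox until the .then() step wipes it. A common alternative wiring, sketched here with illustrative helper names (add_user_message, stream_bot_reply), clears the textbox up front and streams only to the chatbot:

def add_user_message(user_text, chat_history):
    # Append the user turn and clear the textbox in a single step.
    return "", chat_history + [(user_text, None)]

def stream_bot_reply(chat_history):
    # Reuse the assistant's existing generator, writing only to the chatbot.
    for chunk in assistant.get_llm_response(chat_history):
        chat_history[-1] = (chat_history[-1][0], chunk)
        yield chat_history

t2t_submit_btn.click(
    fn=add_user_message,
    inputs=[t2t_text_in, t2t_chatbot],
    outputs=[t2t_text_in, t2t_chatbot],
    queue=False,
).then(
    fn=stream_bot_reply,
    inputs=t2t_chatbot,
    outputs=t2t_chatbot,
)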