Jaward committed (verified)
Commit c20b5a9 · 1 Parent(s): 430796b

Update app.py

Files changed (1)
  1. app.py +19 -101
app.py CHANGED
@@ -10,12 +10,6 @@ import torch
 import random
 from openai import OpenAI
 import subprocess
-import threading
-import queue
-import sounddevice as sd
-import numpy as np
-import wave
-import sys
 
 default_lang = "en"
 
@@ -118,90 +112,18 @@ def models(text, model="Llama 3 8B Service", seed=42):
 
     return output
 
-# New global variables for audio processing
-RATE = 16000
-CHUNK = int(RATE / 10) # 100ms
-audio_queue = queue.Queue()
-is_listening = False
-
-def audio_callback(indata, frames, time, status):
-    if status:
-        print(status, file=sys.stderr)
-    audio_queue.put(indata.copy())
-
-def process_audio_stream(model, seed):
-    global is_listening
-    audio_buffer = []
-    silence_threshold = 0.01
-    silence_duration = 0
-    max_silence = 2 # seconds
-
-    while True:
-        if not is_listening:
-            audio_buffer.clear()
-            silence_duration = 0
-            audio_queue.queue.clear()
-            continue
-
-        try:
-            chunk = audio_queue.get(timeout=1)
-            audio_buffer.append(chunk)
-
-            # Check for silence
-            if np.abs(chunk).mean() < silence_threshold:
-                silence_duration += CHUNK / RATE
-            else:
-                silence_duration = 0
-
-            if silence_duration > max_silence:
-                # Process the buffered audio
-                audio_data = np.concatenate(audio_buffer)
-                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-                    tmp_path = tmp_file.name
-                with wave.open(tmp_path, 'wb') as wf:
-                    wf.setnchannels(1)
-                    wf.setsampwidth(2)
-                    wf.setframerate(RATE)
-                    wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
-
-                # Transcribe and process
-                user_input = transcribe(tmp_path)
-                if user_input:
-                    is_listening = False
-                    reply = models(user_input, model, seed)
-                    asyncio.run(respond_and_play(reply))
-                    is_listening = True
-
-                # Clear the buffer
-                audio_buffer.clear()
-                silence_duration = 0
-
-        except queue.Empty:
-            pass
-
-async def respond_and_play(text):
-    communicate = edge_tts.Communicate(text, voice="en-US-ChristopherNeural")
+async def respond(audio, model, seed):
+    if audio is None:
+        return None
+    user = transcribe(audio)
+    if not user:
+        return None
+    reply = models(user, model, seed)
+    communicate = edge_tts.Communicate(reply, voice="en-US-ChristopherNeural")
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
         tmp_path = tmp_file.name
     await communicate.save(tmp_path)
-
-    # Play the audio
-    with wave.open(tmp_path, 'rb') as wf:
-        data = wf.readframes(wf.getnframes())
-        sd.play(np.frombuffer(data, dtype=np.int16), wf.getframerate())
-        sd.wait()
-
-def start_listening(model, seed):
-    global is_listening
-    is_listening = True
-    threading.Thread(target=process_audio_stream, args=(model, seed), daemon=True).start()
-    with sd.InputStream(callback=audio_callback, channels=1, samplerate=RATE, blocksize=CHUNK):
-        while is_listening:
-            sd.sleep(100)
-
-def stop_listening():
-    global is_listening
-    is_listening = False
+    return tmp_path
 
 # Supported languages for seamless-expressive
 LANGUAGE_CODES = {
@@ -276,21 +198,17 @@ with gr.Blocks(css="style.css") as demo:
             value=0,
             visible=False
         )
-        start_button = gr.Button("Start Listening")
-        stop_button = gr.Button("Stop Listening")
-        status = gr.Markdown("Status: Not listening")
+        input = gr.Audio(label="User", sources=["microphone"], type="filepath")
+        output = gr.Audio(label="AI", type="filepath",
+                          interactive=False,
+                          autoplay=True,
+                          elem_classes="audio")
 
-        start_button.click(
-            fn=lambda model, seed: start_listening(model, seed),
-            inputs=[select, seed],
-            outputs=[status],
-            _js="() => {document.getElementById('status').textContent = 'Status: Listening'}"
-        )
-        stop_button.click(
-            fn=stop_listening,
-            inputs=[],
-            outputs=[status],
-            _js="() => {document.getElementById('status').textContent = 'Status: Not listening'}"
+        gr.Interface(
+            fn=respond,
+            inputs=[input, select, seed],
+            outputs=[output],
+            live=True
         )
 
     with gr.TabItem("Speech Translation") as speech_translation:
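
For context, the commit drops the threaded sounddevice capture loop and instead lets Gradio drive a single async handler, respond(), from a microphone gr.Audio component, returning an edge-tts wav that autoplays. Below is a minimal, self-contained sketch of that flow under stated assumptions: gradio (4.x) and edge-tts are installed, transcribe() and models() are stubbed here (in app.py they are the app's speech-to-text and LLM helpers), and the model/seed inputs from the diff are omitted for brevity.

# Hypothetical standalone sketch, not the exact app.py code.
import tempfile

import edge_tts
import gradio as gr


def transcribe(audio_path: str) -> str:
    # Stub standing in for app.py's speech-to-text helper.
    return "hello there"


def models(text: str, model: str = "Llama 3 8B Service", seed: int = 42) -> str:
    # Stub standing in for app.py's LLM call.
    return f"You said: {text}"


async def respond(audio):
    # Gradio passes the recorded clip as a file path (type="filepath").
    if audio is None:
        return None
    user = transcribe(audio)
    if not user:
        return None
    reply = models(user)
    # Synthesize the reply with edge-tts and return a wav path for autoplay.
    communicate = edge_tts.Communicate(reply, voice="en-US-ChristopherNeural")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path


demo = gr.Interface(
    fn=respond,
    inputs=gr.Audio(label="User", sources=["microphone"], type="filepath"),
    outputs=gr.Audio(label="AI", type="filepath", interactive=False, autoplay=True),
    live=True,
)

if __name__ == "__main__":
    demo.launch()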