Spaces:
Sleeping
Sleeping
Commit
·
0a928fe
1
Parent(s):
51c343c
Enhance audio processing by ensuring correct numpy array conversion and adding error handling during transcription
Browse files
app.py
CHANGED
@@ -29,8 +29,12 @@ def process_audio_chunk(audio_chunk):
|
|
29 |
"""Process a single audio chunk and update buffer."""
|
30 |
global audio_buffer, buffer_duration
|
31 |
|
32 |
-
# Convert audio chunk to numpy array
|
33 |
-
|
|
|
|
|
|
|
|
|
34 |
audio_buffer.append(audio_array)
|
35 |
buffer_duration += len(audio_array) / RATE
|
36 |
|
@@ -48,13 +52,18 @@ def transcribe_audio():
|
|
48 |
# Concatenate buffer into a window
|
49 |
audio_window = np.concatenate(list(audio_buffer))
|
50 |
audio_window = audio_window[:window_samples] # Trim to window size
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
# Yield transcription if different from the last one
|
60 |
if transcription and transcription != last_transcription:
|
@@ -79,6 +88,9 @@ def audio_stream(audio):
|
|
79 |
# Audio is a tuple (sample_rate, data) from Gradio
|
80 |
sample_rate, audio_data = audio
|
81 |
|
|
|
|
|
|
|
82 |
# Resample audio to 16kHz if needed
|
83 |
if sample_rate != RATE:
|
84 |
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=RATE)
|
|
|
29 |
"""Process a single audio chunk and update buffer."""
|
30 |
global audio_buffer, buffer_duration
|
31 |
|
32 |
+
# Convert audio chunk to numpy array if not already
|
33 |
+
if not isinstance(audio_chunk, np.ndarray):
|
34 |
+
audio_array = np.array(audio_chunk, dtype=np.float32)
|
35 |
+
else:
|
36 |
+
audio_array = audio_chunk # Already a numpy array with correct type
|
37 |
+
|
38 |
audio_buffer.append(audio_array)
|
39 |
buffer_duration += len(audio_array) / RATE
|
40 |
|
|
|
52 |
# Concatenate buffer into a window
|
53 |
audio_window = np.concatenate(list(audio_buffer))
|
54 |
audio_window = audio_window[:window_samples] # Trim to window size
|
55 |
+
# Process audio with Whisper
|
56 |
+
try:
|
57 |
+
# Ensure audio is in the correct format for librosa
|
58 |
+
audio_window = audio_window.astype(np.float32)
|
59 |
+
audio_input, _ = librosa.load(audio_window, sr=RATE, mono=True)
|
60 |
+
inputs = processor(audio_input, sampling_rate=RATE, return_tensors="pt").to(DEVICE)
|
61 |
+
with torch.no_grad():
|
62 |
+
predicted_ids = model.generate(inputs["input_features"])
|
63 |
+
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
|
64 |
+
except Exception as e:
|
65 |
+
print(f"Error during transcription: {e}")
|
66 |
+
continue
|
67 |
|
68 |
# Yield transcription if different from the last one
|
69 |
if transcription and transcription != last_transcription:
|
|
|
88 |
# Audio is a tuple (sample_rate, data) from Gradio
|
89 |
sample_rate, audio_data = audio
|
90 |
|
91 |
+
# Ensure audio data is floating-point for librosa
|
92 |
+
audio_data = np.array(audio_data, dtype=np.float32)
|
93 |
+
|
94 |
# Resample audio to 16kHz if needed
|
95 |
if sample_rate != RATE:
|
96 |
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=RATE)
|