GavinHuang committed on
Commit
0a928fe
·
1 Parent(s): 51c343c

Enhance audio processing by ensuring correct numpy array conversion and adding error handling during transcription

Browse files
Files changed (1) hide show
  1. app.py +21 -9
app.py CHANGED
@@ -29,8 +29,12 @@ def process_audio_chunk(audio_chunk):
29
  """Process a single audio chunk and update buffer."""
30
  global audio_buffer, buffer_duration
31
 
32
- # Convert audio chunk to numpy array
33
- audio_array = np.array(audio_chunk, dtype=np.float32)
 
 
 
 
34
  audio_buffer.append(audio_array)
35
  buffer_duration += len(audio_array) / RATE
36
 
@@ -48,13 +52,18 @@ def transcribe_audio():
48
  # Concatenate buffer into a window
49
  audio_window = np.concatenate(list(audio_buffer))
50
  audio_window = audio_window[:window_samples] # Trim to window size
51
-
52
- # Process audio with Whisper
53
- audio_window, _ = librosa.load(audio_window, sr=RATE, mono=True)
54
- inputs = processor(audio_window, sampling_rate=RATE, return_tensors="pt").to(DEVICE)
55
- with torch.no_grad():
56
- predicted_ids = model.generate(inputs["input_features"])
57
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
 
 
 
 
 
58
 
59
  # Yield transcription if different from the last one
60
  if transcription and transcription != last_transcription:
@@ -79,6 +88,9 @@ def audio_stream(audio):
79
  # Audio is a tuple (sample_rate, data) from Gradio
80
  sample_rate, audio_data = audio
81
 
 
 
 
82
  # Resample audio to 16kHz if needed
83
  if sample_rate != RATE:
84
  audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=RATE)
 
29
  """Process a single audio chunk and update buffer."""
30
  global audio_buffer, buffer_duration
31
 
32
+ # Convert audio chunk to numpy array if not already
33
+ if not isinstance(audio_chunk, np.ndarray):
34
+ audio_array = np.array(audio_chunk, dtype=np.float32)
35
+ else:
36
+ audio_array = audio_chunk # Already a numpy array with correct type
37
+
38
  audio_buffer.append(audio_array)
39
  buffer_duration += len(audio_array) / RATE
40
 
 
52
  # Concatenate buffer into a window
53
  audio_window = np.concatenate(list(audio_buffer))
54
  audio_window = audio_window[:window_samples] # Trim to window size
55
+ # Process audio with Whisper
56
+ try:
57
+ # Ensure audio is in the correct format for librosa
58
+ audio_window = audio_window.astype(np.float32)
59
+ audio_input, _ = librosa.load(audio_window, sr=RATE, mono=True)
60
+ inputs = processor(audio_input, sampling_rate=RATE, return_tensors="pt").to(DEVICE)
61
+ with torch.no_grad():
62
+ predicted_ids = model.generate(inputs["input_features"])
63
+ transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
64
+ except Exception as e:
65
+ print(f"Error during transcription: {e}")
66
+ continue
67
 
68
  # Yield transcription if different from the last one
69
  if transcription and transcription != last_transcription:
 
88
  # Audio is a tuple (sample_rate, data) from Gradio
89
  sample_rate, audio_data = audio
90
 
91
+ # Ensure audio data is floating-point for librosa
92
+ audio_data = np.array(audio_data, dtype=np.float32)
93
+
94
  # Resample audio to 16kHz if needed
95
  if sample_rate != RATE:
96
  audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=RATE)