Athspi committed (verified)
Commit 380d6cf · 1 Parent(s): 2eebdd2

Update app.py

Files changed (1):
  1. app.py  +75 -140

app.py CHANGED
@@ -3,9 +3,10 @@ import whisper
 import torch
 import os
 from pydub import AudioSegment, silence
-from faster_whisper import WhisperModel # Import faster-whisper
+from faster_whisper import WhisperModel
 import numpy as np
 from scipy.io import wavfile
+from scipy.signal import correlate

 # Mapping of model names to Whisper model sizes
 MODELS = {
@@ -187,141 +188,74 @@ def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):

     return output_path

-def convert_to_wav(audio_file):
+def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
     """
-    Convert the input audio file to WAV format.
+    Detect the target audio in the main audio and trim the main audio to include only the detected segments.

     Args:
-        audio_file (str): Path to the input audio file.
+        main_audio (str): Path to the main audio file.
+        target_audio (str): Path to the target audio file.
+        threshold (float): Detection threshold (0 to 1). Higher values mean stricter detection.

     Returns:
-        str: Path to the converted WAV file.
-    """
-    audio = AudioSegment.from_file(audio_file)
-    wav_path = "converted_audio.wav"
-    audio.export(wav_path, format="wav")
-    return wav_path
-
-def detect_voice_activity(audio_file, threshold=0.02):
+        str: Path to the trimmed audio file.
+        str: Detected timestamps in the format "start-end (in seconds)".
     """
-    Detect voice activity in the audio file and trim the audio to include only voice segments.
-
-    Args:
-        audio_file (str): Path to the input audio file.
-        threshold (float): Amplitude threshold for voice detection. Default is 0.02.
-
-    Returns:
-        str: Path to the output audio file with only voice segments.
-    """
-    # Convert the input audio to WAV format
-    wav_path = convert_to_wav(audio_file)
-
-    # Load the WAV file
-    sample_rate, data = wavfile.read(wav_path)
-
-    # If the audio is stereo, convert it to mono by averaging the channels
-    if len(data.shape) > 1:
-        data = np.mean(data, axis=1)
-
-    # Normalize the audio data to the range [-1, 1]
-    if data.dtype != np.float32:
-        data = data.astype(np.float32) / np.iinfo(data.dtype).max
-
-    # Detect voice activity
-    voice_segments = []
-    is_voice = False
-    start = 0
-    for i, sample in enumerate(data):
-        if abs(sample) > threshold and not is_voice:
-            is_voice = True
-            start = i
-        elif abs(sample) <= threshold and is_voice:
-            is_voice = False
-            voice_segments.append((start, i))
-
-    # If the last segment is voice, add it
-    if is_voice:
-        voice_segments.append((start, len(data)))
-
-    # Trim the audio to include only voice segments
-    trimmed_audio = np.array([], dtype=np.float32)
-    for segment in voice_segments:
-        trimmed_audio = np.concatenate((trimmed_audio, data[segment[0]:segment[1]]))
-
-    # Convert the trimmed audio back to 16-bit integer format
-    trimmed_audio_int16 = np.int16(trimmed_audio * 32767)
-
-    # Export the trimmed audio
-    output_path = "voice_trimmed_audio.wav"
-    wavfile.write(output_path, sample_rate, trimmed_audio_int16)
-
-    # Clean up the converted WAV file
-    os.remove(wav_path)
-
-    return output_path
-
-def detect_and_trim_audio(audio_file, threshold=0.02):
-    """
-    Detect voice activity in the audio file, trim the audio to include only voice segments,
-    and return the timestamps of the detected segments.
-
-    Args:
-        audio_file (str): Path to the input audio file.
-        threshold (float): Amplitude threshold for voice detection. Default is 0.02.
-
-    Returns:
-        str: Path to the output audio file with only voice segments.
-        list: List of timestamps (start, end) for the detected segments.
-    """
-    # Convert the input audio to WAV format
-    wav_path = convert_to_wav(audio_file)
-
-    # Load the WAV file
-    sample_rate, data = wavfile.read(wav_path)
-
-    # If the audio is stereo, convert it to mono by averaging the channels
-    if len(data.shape) > 1:
-        data = np.mean(data, axis=1)
-
-    # Normalize the audio data to the range [-1, 1]
-    if data.dtype != np.float32:
-        data = data.astype(np.float32) / np.iinfo(data.dtype).max
-
-    # Detect voice activity
-    voice_segments = []
-    is_voice = False
-    start = 0
-    for i, sample in enumerate(data):
-        if abs(sample) > threshold and not is_voice:
-            is_voice = True
-            start = i
-        elif abs(sample) <= threshold and is_voice:
-            is_voice = False
-            voice_segments.append((start, i))
-
-    # If the last segment is voice, add it
-    if is_voice:
-        voice_segments.append((start, len(data)))
-
-    # Trim the audio to include only voice segments
-    trimmed_audio = np.array([], dtype=np.float32)
-    for segment in voice_segments:
-        trimmed_audio = np.concatenate((trimmed_audio, data[segment[0]:segment[1]]))
-
-    # Convert the trimmed audio back to 16-bit integer format
-    trimmed_audio_int16 = np.int16(trimmed_audio * 32767)
+    # Load audio files
+    main_rate, main_data = wavfile.read(main_audio)
+    target_rate, target_data = wavfile.read(target_audio)
+
+    # Ensure both audio files have the same sample rate
+    if main_rate != target_rate:
+        raise ValueError("Sample rates of the main audio and target audio must match.")
+
+    # Normalize audio data
+    main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
+    target_data = target_data.astype(np.float32) / np.iinfo(target_data.dtype).max
+
+    # Perform cross-correlation to detect the target audio in the main audio
+    correlation = correlate(main_data, target_data, mode='valid')
+    correlation = np.abs(correlation)
+    max_corr = np.max(correlation)
+
+    # Detect segments where the target audio is present
+    detected_segments = []
+    for i, corr_value in enumerate(correlation):
+        if corr_value >= threshold * max_corr:
+            start_time = i / main_rate
+            end_time = (i + len(target_data)) / main_rate
+            detected_segments.append((start_time, end_time))
+
+    # Merge overlapping or nearby segments
+    merged_segments = []
+    for segment in detected_segments:
+        if not merged_segments:
+            merged_segments.append(segment)
+        else:
+            last_segment = merged_segments[-1]
+            if segment[0] <= last_segment[1] + 1.0:  # Merge if within 1 second
+                merged_segments[-1] = (last_segment[0], max(last_segment[1], segment[1]))
+            else:
+                merged_segments.append(segment)
+
+    # Trim the main audio to include only the detected segments
+    main_audio_segment = AudioSegment.from_file(main_audio)
+    trimmed_audio = AudioSegment.empty()
+    timestamps = []
+    for segment in merged_segments:
+        start_ms = int(segment[0] * 1000)
+        end_ms = int(segment[1] * 1000)
+        trimmed_audio += main_audio_segment[start_ms:end_ms]
+        timestamps.append(f"{segment[0]:.2f}-{segment[1]:.2f}")

     # Export the trimmed audio
-    output_path = "voice_trimmed_audio.wav"
-    wavfile.write(output_path, sample_rate, trimmed_audio_int16)
-
-    # Calculate timestamps in seconds
-    timestamps = [(start / sample_rate, end / sample_rate) for start, end in voice_segments]
+    output_path = "trimmed_audio.wav"
+    trimmed_audio.export(output_path, format="wav")

-    # Clean up the converted WAV file
-    os.remove(wav_path)
+    # Format timestamps
+    timestamps_str = "\n".join(timestamps)

-    return output_path, timestamps
+    return output_path, timestamps_str

 def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
     """Transcribe the audio file."""
@@ -373,7 +307,7 @@ def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):

 # Define the Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Audio Transcription and Language Detection")
+    gr.Markdown("# Audio Processing Tool")

     with gr.Tab("Detect Language"):
         gr.Markdown("Upload an audio file to detect its language.")
@@ -414,17 +348,18 @@ with gr.Blocks() as demo:
         silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
         silence_button = gr.Button("Remove Silence")

-    with gr.Tab("Voice Detection and Trimming"):
-        gr.Markdown("Upload an audio file to detect voice activity and trim the audio.")
-        voice_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
-        voice_threshold_slider = gr.Slider(
-            minimum=0.01, maximum=0.1, value=0.02, step=0.01,
-            label="Voice Detection Threshold",
-            info="Higher values detect louder sounds as voice."
+    with gr.Tab("Detect and Trim Audio"):
+        gr.Markdown("Upload a main audio file and a target audio file. The app will detect the target audio in the main audio and trim it.")
+        main_audio_input = gr.Audio(type="filepath", label="Upload Main Audio File")
+        target_audio_input = gr.Audio(type="filepath", label="Upload Target Audio File")
+        threshold_slider = gr.Slider(
+            minimum=0.1, maximum=1.0, value=0.5, step=0.1,
+            label="Detection Threshold",
+            info="Higher values mean stricter detection."
         )
-        voice_output = gr.Audio(label="Trimmed Audio", type="filepath")
-        timestamps_output = gr.Textbox(label="Detected Timestamps (seconds)")
-        voice_button = gr.Button("Detect and Trim Voice")
+        trimmed_audio_output = gr.Audio(label="Trimmed Audio", type="filepath")
+        timestamps_output = gr.Textbox(label="Detected Timestamps (in seconds)")
+        detect_button = gr.Button("Detect and Trim")

     # Link buttons to functions
     detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
@@ -438,10 +373,10 @@ with gr.Blocks() as demo:
         inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
         outputs=silence_output
     )
-    voice_button.click(
+    detect_button.click(
         detect_and_trim_audio,
-        inputs=[voice_audio_input, voice_threshold_slider],
-        outputs=[voice_output, timestamps_output]
+        inputs=[main_audio_input, target_audio_input, threshold_slider],
+        outputs=[trimmed_audio_output, timestamps_output]
     )

 # Launch the Gradio interface
 
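Not part of the commit, but as a quick sanity check of the cross-correlation matching that the new `detect_and_trim_audio` relies on, here is a self-contained sketch: it embeds a short synthetic clip in a longer noisy signal and locates it with `scipy.signal.correlate` using the same peak-relative threshold idea. The sample rate, signal lengths, and threshold below are illustrative assumptions, not values taken from app.py.

```python
import numpy as np
from scipy.signal import correlate

# Illustrative parameters; none of these values come from app.py.
rate = 16000
rng = np.random.default_rng(0)

# Build a 3 s noisy "main" signal and embed a 0.25 s "target" burst at t = 1 s.
target = rng.standard_normal(rate // 4).astype(np.float32)
main = (0.05 * rng.standard_normal(3 * rate)).astype(np.float32)
main[rate:rate + len(target)] += target

# Same idea as the committed detect_and_trim_audio: cross-correlate, then keep
# offsets whose absolute correlation is close to the global peak.
corr = np.abs(correlate(main, target, mode="valid"))
threshold = 0.5  # fraction of the peak correlation treated as a match
hits = np.flatnonzero(corr >= threshold * corr.max())

start = hits[0] / rate
end = (hits[-1] + len(target)) / rate
print(f"detected segment roughly {start:.2f}-{end:.2f} s")  # expect about 1.00-1.25 s
```

The committed function applies the same correlation to the two uploaded WAV files, raising a ValueError when their sample rates differ, then merges hits that fall within one second of each other before cutting the segments out with pydub.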