qqwjq1981 committed
Commit 5e11de1 · verified · 1 Parent(s): 2af32a6

Update app.py

Files changed (1):
  1. app.py +32 -79
app.py CHANGED
@@ -279,87 +279,42 @@ def transcribe_video_with_speakers(video_path):
 
     return transcript_with_speakers, detected_language
 
-def segment_audio_from_video(video_path, frame_duration_ms=30):
-    """
-    Extracts audio from a video and segments it into speech chunks using WebRTC VAD.
-    Returns a list of dictionaries, each with 'start' and 'end' timestamps for speech segments.
-
-    Args:
-        video_path (str): The path to the input video file.
-        frame_duration_ms (int): The duration of a frame in milliseconds for VAD (10, 20, or 30).
-            Lower values are more precise but computationally intensive.
-
-    Returns:
-        tuple: A tuple containing:
-            - audio_path (str): Path to the extracted temporary audio file.
-            - speech_segments (list): A list of dictionaries, where each dictionary
-              represents a speech segment with 'start' and 'end' timestamps in seconds.
-            - error_message (str, optional): An error message if processing fails.
-    """
-    audio_path = "temp_extracted_audio.wav"
-    speech_segments = []
-    error_message = None
+def segment_audio_from_video(video_path):
+    # Extract audio from the video
+    video = VideoFileClip(video_path)
+    audio_path = "audio.wav"
+    video.audio.write_audiofile(audio_path)
+    logger.info(f"Audio extracted from video: {audio_path}")
+
+    # Split speech from background; the background track is saved locally
+    segment_result, speech_audio_path = segment_background_audio(audio_path)
+    logger.info("Saved non-speech (background) audio locally")
+
+    # Set up device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    logger.info(f"Using device: {device}")
 
     try:
-        # 1. Extract audio from video
-        logger.info(f"Extracting audio from video: {video_path}")
-        video = VideoFileClip(video_path)
-        # Ensure audio is saved in a compatible format for WebRTC VAD (16-bit, 1 channel, 8000/16000/32000 Hz)
-        # We will resample to 16kHz for VAD as it's a good balance.
-        video.audio.write_audiofile(audio_path)
-        video.close()
-        logger.info(f"Audio extracted to: {audio_path}")
-
-        # 2. Load audio for VAD
-        audio = AudioSegment.from_wav(audio_path)
-        sample_rate = audio.frame_rate
-        audio_data = np.array(audio.get_array_of_samples())
-
-        # WebRTC VAD operates on 16-bit mono audio at 8kHz, 16kHz, or 32kHz.
-        # We already saved at 16kHz, so we can proceed.
-        if sample_rate not in [8000, 16000, 32000]:
-            error_message = f"Unsupported sample rate for VAD: {sample_rate} Hz. Must be 8kHz, 16kHz, or 32kHz."
-            logger.error(error_message)
-            return audio_path, [], error_message
-
-        vad = webrtcvad.Vad(3)  # Aggressiveness mode (0-3, 3 is most aggressive)
-        frames = []
-        offset = 0
-        while offset + frame_duration_ms <= len(audio):
-            frame_start = offset
-            frame_end = offset + frame_duration_ms
-            frame = audio[frame_start:frame_end]
-            frames.append(frame)
-            offset += frame_duration_ms
-
-        logger.info(f"Running WebRTC VAD on {len(frames)} frames...")
-
-        current_segment_start = None
-        for i, frame in enumerate(frames):
-            is_speech = vad.is_speech(frame.raw_data, sample_rate)
-
-            frame_start_time = (i * frame_duration_ms) / 1000.0
-            frame_end_time = ((i + 1) * frame_duration_ms) / 1000.0
-
-            if is_speech:
-                if current_segment_start is None:
-                    current_segment_start = frame_start_time
-            else:
-                if current_segment_start is not None:
-                    speech_segments.append({"start": current_segment_start, "end": frame_end_time})
-                    current_segment_start = None
-
-        # Add the last segment if it ended with speech
-        if current_segment_start is not None:
-            speech_segments.append({"start": current_segment_start, "end": len(audio) / 1000.0})
-
-        logger.info(f"VAD completed. Found {len(speech_segments)} speech segments.")
-
-    except Exception as e:
-        error_message = f"An error occurred during audio segmentation: {e}"
-        logger.error(error_message)
-
-    return audio_path, speech_segments, error_message
+        # Load the large-v3 model with float32 for broader compatibility
+        model = whisperx.load_model("large-v3", device=device, compute_type="float32")
+        logger.info("WhisperX model loaded")
+
+        # Transcribe
+        result = model.transcribe(speech_audio_path, chunk_size=4, print_progress=True)
+        logger.info("Audio transcription completed")
+    except Exception as e:
+        logger.error(f"❌ WhisperX pipeline failed: {e}")
+        raise  # re-raise so we never fall through to an undefined `result`
+
+    # Keep the start/end timestamps of each transcribed segment
+    transcript_with_speakers = [
+        {
+            "start": segment["start"],
+            "end": segment["end"]
+        }
+        for segment in result["segments"]
+    ]
+
+    return audio_path, transcript_with_speakers
 
 def transcribe_segments_with_scribe(full_audio_path, segments):
     """
@@ -1373,9 +1328,7 @@ def upload_and_manage(file, target_language, process_mode):
 
     # Step 1: Segment audio from the uploaded video/audio file
     logger.info("Segmenting audio...")
-    temp_audio_for_vad, speech_segments, seg_error = segment_audio_from_video(file.name)
-    if seg_error:
-        raise Exception(f"Audio segmentation failed: {seg_error}")
+    temp_audio_for_vad, speech_segments = segment_audio_from_video(file.name)
     if not speech_segments:
         raise Exception("No speech segments detected in the audio.")
     logger.info(f"Audio segmentation completed. Found {len(speech_segments)} segments.")
 