Update app.py
app.py
CHANGED
@@ -279,87 +279,42 @@ def transcribe_video_with_speakers(video_path):
 
     return transcript_with_speakers, detected_language
 
-def segment_audio_from_video(video_path, frame_duration_ms=30):
-    """
-    Extracts audio from a video file and detects speech segments using WebRTC VAD.
-
-
-    Args:
-        video_path (str): The path to the input video file.
-        frame_duration_ms (int): The duration of a frame in milliseconds for VAD (10, 20, or 30).
-                                 Lower values are more precise but computationally intensive.
-
-    Returns:
-        tuple: A tuple containing:
-            - audio_path (str): Path to the extracted temporary audio file.
-            - speech_segments (list): A list of dictionaries, where each dictionary
-              represents a speech segment with 'start' and 'end' timestamps in seconds.
-            - error_message (str, optional): An error message if processing fails.
-    """
-    audio_path = "temp_extracted_audio.wav"
-    speech_segments = []
-    error_message = None
+def segment_audio_from_video(video_path):
+    # Extract audio from video
+    video = VideoFileClip(video_path)
+    audio_path = "audio.wav"
+    video.audio.write_audiofile(audio_path)
+    logger.info(f"Audio extracted from video: {audio_path}")
 
+    segment_result, speech_audio_path = segment_background_audio(audio_path)
+    print(f"Saved non-speech (background) audio to local")
+
+    # Set up device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    logger.info(f"Using device: {device}")
+
     try:
-        # 1. Extract audio from video
-
-
-
-        #
-
-
-        logger.info(f"Audio extracted to: {audio_path}")
-
-        # 2. Load audio for VAD
-        audio = AudioSegment.from_wav(audio_path)
-        sample_rate = audio.frame_rate
-        audio_data = np.array(audio.get_array_of_samples())
-
-        # WebRTC VAD operates on 16-bit mono audio at 8kHz, 16kHz, or 32kHz.
-        # We already saved at 16kHz, so we can proceed.
-        if sample_rate not in [8000, 16000, 32000]:
-            error_message = f"Unsupported sample rate for VAD: {sample_rate} Hz. Must be 8kHz, 16kHz, or 32kHz."
-            logger.error(error_message)
-            return audio_path, [], error_message
-
-        vad = webrtcvad.Vad(3) # Aggressiveness mode (0-3, 3 is most aggressive)
-        frames = []
-        offset = 0
-        while offset + frame_duration_ms <= len(audio):
-            frame_start = offset
-            frame_end = offset + frame_duration_ms
-            frame = audio[frame_start:frame_end]
-            frames.append(frame)
-            offset += frame_duration_ms
-
-        logger.info(f"Running WebRTC VAD on {len(frames)} frames...")
-
-        current_segment_start = None
-        for i, frame in enumerate(frames):
-            is_speech = vad.is_speech(frame.raw_data, sample_rate)
-
-            frame_start_time = (i * frame_duration_ms) / 1000.0
-            frame_end_time = ((i + 1) * frame_duration_ms) / 1000.0
+        # Load a medium model with float32 for broader compatibility
+        model = whisperx.load_model("large-v3", device=device, compute_type="float32")
+        logger.info("WhisperX model loaded")
+
+        # Transcribe
+        result = model.transcribe(speech_audio_path, chunk_size=4, print_progress = True)
+        logger.info("Audio transcription completed")
 
-            if is_speech:
-                if current_segment_start is None:
-                    current_segment_start = frame_start_time
-            else:
-                if current_segment_start is not None:
-                    speech_segments.append({"start": current_segment_start, "end": frame_end_time})
-                    current_segment_start = None
-
-        # Add the last segment if it ended with speech
-        if current_segment_start is not None:
-            speech_segments.append({"start": current_segment_start, "end": len(audio) / 1000.0})
+    except Exception as e:
+        logger.error(f"❌ WhisperX pipeline failed: {e}")
 
-
+    # Extract timestamps, text, and speaker IDs
+    transcript_with_speakers = [
+        {
+            "start": segment["start"],
+            "end": segment["end"]
+        }
+        for segment in result["segments"]
+    ]
 
-    except Exception as e:
-        error_message = f"An error occurred during audio segmentation: {e}"
-        logger.error(error_message)
-
-    return audio_path, speech_segments, error_message
+    return audio_path, transcript_with_speakers
 
 def transcribe_segments_with_scribe(full_audio_path, segments):
     """
@@ -1373,9 +1328,7 @@ def upload_and_manage(file, target_language, process_mode):
 
     # Step 1: Segment audio from the uploaded video/audio file
     logger.info("Segmenting audio...")
-    temp_audio_for_vad, speech_segments, seg_error = segment_audio_from_video(file.name)
-    if seg_error:
-        raise Exception(f"Audio segmentation failed: {seg_error}")
+    temp_audio_for_vad, speech_segments = segment_audio_from_video(file.name)
     if not speech_segments:
         raise Exception("No speech segments detected in the audio.")
     logger.info(f"Audio segmentation completed. Found {len(speech_segments)} segments.")