Update app.py
app.py CHANGED
@@ -125,7 +125,7 @@ def handle_feedback(feedback):
     conn.commit()
     return "Thank you for your feedback!", None
 
-def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
+def segment_background_audio(audio_path, background_audio_path="background_segments.wav", speech_audio_path="speech_segment.wav"):
     """
     Uses Demucs to separate audio and extract background (non-vocal) parts.
     Merges drums, bass, and other stems into a single background track.
@@ -142,6 +142,7 @@ def segment_background_audio(audio_path, background_audio_path="background_segme
     stem_dir = os.path.join("separated", "htdemucs", filename)
 
     # Step 3: Load and merge background stems
+    vocals = AudioSegment.from_wav(os.path.join(stem_dir, "vocals.wav"))
     drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
     bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
     other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))
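The hunk above only touches Step 3 of segment_background_audio; the Demucs run that produces stem_dir (Steps 1–2) sits outside the changed lines. For context, a minimal sketch of the assumed invocation via the Demucs CLI with its default htdemucs model, whose output layout matches the stem_dir path used above (the app may well call Demucs differently):

import os
import subprocess

def run_demucs(audio_path):
    # Hypothetical helper: run Demucs source separation on the input file.
    # With the default "htdemucs" model, stems land in
    # separated/htdemucs/<track name>/{vocals,drums,bass,other}.wav.
    subprocess.run(["demucs", audio_path], check=True)
    filename = os.path.splitext(os.path.basename(audio_path))[0]
    return os.path.join("separated", "htdemucs", filename)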
@@ -150,7 +151,8 @@ def segment_background_audio(audio_path, background_audio_path="background_segme
 
     # Step 4: Export the merged background
     background.export(background_audio_path, format="wav")
-
+    vocals.export(speech_audio_path, format="wav")
+    return background_audio_path, speech_audio_path
 
 # def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
 # pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
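The merge of drums, bass, and other into background happens in unchanged lines between the two hunks above, so it does not appear in the diff. A sketch of how Steps 3–4 plausibly fit together with pydub's overlay(), wrapped as a standalone helper (merge_stems is a hypothetical name, not from the app), including the dual export and return that this commit introduces:

import os
from pydub import AudioSegment

def merge_stems(stem_dir, background_audio_path="background_segments.wav",
                speech_audio_path="speech_segment.wav"):
    # Load the four Demucs stems from the directory used above.
    vocals = AudioSegment.from_wav(os.path.join(stem_dir, "vocals.wav"))
    drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
    bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
    other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))

    # overlay() mixes two segments; Demucs stems share the same length,
    # so chaining overlays yields the full-length background mix.
    background = drums.overlay(bass).overlay(other)

    # Mirror the commit's new behaviour: export both tracks, return both paths.
    background.export(background_audio_path, format="wav")
    vocals.export(speech_audio_path, format="wav")
    return background_audio_path, speech_audio_path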
@@ -186,7 +188,7 @@ def transcribe_video_with_speakers(video_path):
     video.audio.write_audiofile(audio_path)
     logger.info(f"Audio extracted from video: {audio_path}")
 
-    segment_result = segment_background_audio(audio_path)
+    segment_result, speech_audio_path = segment_background_audio(audio_path)
     print(f"Saved non-speech (background) audio to local")
 
     # Set up device
@@ -199,7 +201,7 @@ def transcribe_video_with_speakers(video_path):
     logger.info("WhisperX model loaded")
 
     # Transcribe
-    result = model.transcribe(
+    result = model.transcribe(speech_audio_path, chunk_size=6, print_progress = True)
     logger.info("Audio transcription completed")
 
     # Get the detected language
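This commit points transcription at the vocals-only track instead of the raw extracted audio. A self-contained sketch of that step; the model name, precision, and device choice are assumptions, while chunk_size=6 and print_progress=True come from the commit itself:

import whisperx

device = "cuda"  # assumption: the app selects cuda/cpu a few lines earlier
model = whisperx.load_model("large-v3", device, compute_type="float16")  # model/precision assumed

# As in the commit: transcribe the speech-only file in 6-second chunks,
# printing decoding progress.
result = model.transcribe("speech_segment.wav", chunk_size=6, print_progress=True)
detected_language = result["language"]
print(detected_language, len(result["segments"]))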
@@ -207,12 +209,12 @@ def transcribe_video_with_speakers(video_path):
     logger.debug(f"Detected language: {detected_language}")
     # Alignment
     model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result = whisperx.align(result["segments"], model_a, metadata,
+    result = whisperx.align(result["segments"], model_a, metadata, speech_audio_path, device)
     logger.info("Transcription alignment completed")
 
     # Diarization (works independently of Whisper model size)
     diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
-    diarize_segments = diarize_model(
+    diarize_segments = diarize_model(speech_audio_path)
     logger.info("Speaker diarization completed")
 
     # Assign speakers
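Alignment and diarization now also read the speech-only file. A sketch continuing from the transcription sketch above (reusing result and device), assuming hf_api_key is a Hugging Face token with access to the pyannote models; the final assign_word_speakers call is an assumption about what the "# Assign speakers" step does:

import whisperx

speech_audio_path = "speech_segment.wav"
hf_api_key = "hf_..."  # assumption: Hugging Face token with pyannote access

# Word-level alignment against the speech-only audio, as in the commit.
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, speech_audio_path, device)

# Speaker diarization on the same file, then merge speaker labels into the transcript.
diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
diarize_segments = diarize_model(speech_audio_path)
result = whisperx.assign_word_speakers(diarize_segments, result)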
@@ -243,7 +245,7 @@ def transcribe_video_with_speakers(video_path):
 
     # Collapse and truncate speaker audio
     speaker_sample_paths = {}
-    audio_clip = AudioFileClip(
+    audio_clip = AudioFileClip(speech_audio_path)
     for speaker, segments in speaker_audio.items():
         speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
         combined_clip = concatenate_audioclips(speaker_clips)
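The speaker-sample collection now reads from the speech-only track as well. A standalone sketch of the collapse-and-export loop around AudioFileClip; the speaker_audio dict and the sample file names are illustrative, not taken from the app:

from moviepy.editor import AudioFileClip, concatenate_audioclips

# Illustrative input: diarized (start, end) spans per speaker, in seconds.
speaker_audio = {
    "SPEAKER_00": [(0.0, 2.5), (7.1, 9.0)],
    "SPEAKER_01": [(3.0, 6.4)],
}

audio_clip = AudioFileClip("speech_segment.wav")  # speech-only track, as in the commit
speaker_sample_paths = {}
for speaker, segments in speaker_audio.items():
    clips = [audio_clip.subclip(start, end) for start, end in segments]
    sample = concatenate_audioclips(clips)
    sample_path = f"{speaker}_sample.wav"  # hypothetical naming scheme
    sample.write_audiofile(sample_path, logger=None)
    speaker_sample_paths[speaker] = sample_path
audio_clip.close()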
@@ -256,7 +258,7 @@ def transcribe_video_with_speakers(video_path):
     # Clean up
     video.close()
     audio_clip.close()
-    os.remove(
+    os.remove(speech_audio_path)
 
     return transcript_with_speakers, detected_language
 