Update app.py
app.py CHANGED
@@ -125,7 +125,7 @@ def handle_feedback(feedback):
     conn.commit()
     return "Thank you for your feedback!", None
 
-def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
+def segment_background_audio(audio_path, background_audio_path="background_segments.wav", speech_audio_path="speech_segment.wav"):
     """
     Uses Demucs to separate audio and extract background (non-vocal) parts.
     Merges drums, bass, and other stems into a single background track.
@@ -142,6 +142,7 @@ def segment_background_audio(audio_path, background_audio_path="background_segme
     stem_dir = os.path.join("separated", "htdemucs", filename)
 
     # Step 3: Load and merge background stems
+    vocals = AudioSegment.from_wav(os.path.join(stem_dir, "vocals.wav"))
     drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
     bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
     other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))
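The hunk above only touches Step 3 of segment_background_audio; the Demucs run that produces stem_dir (Steps 1–2) sits outside the changed lines. For context, a minimal sketch of the assumed invocation via the Demucs CLI with its default htdemucs model, whose output layout matches the stem_dir path used above (the app may well call Demucs differently):

import os
import subprocess

def run_demucs(audio_path):
    # Hypothetical helper: run Demucs source separation on the input file.
    # With the default "htdemucs" model, stems land in
    # separated/htdemucs/<track name>/{vocals,drums,bass,other}.wav.
    subprocess.run(["demucs", audio_path], check=True)
    filename = os.path.splitext(os.path.basename(audio_path))[0]
    return os.path.join("separated", "htdemucs", filename)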
@@ -150,7 +151,8 @@ def segment_background_audio(audio_path, background_audio_path="background_segme
 
     # Step 4: Export the merged background
     background.export(background_audio_path, format="wav")
-
+    vocals.export(speech_audio_path, format="wav")
+    return background_audio_path, speech_audio_path
 
 # def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
 # pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
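The merge of drums, bass, and other into background happens in unchanged lines between the two hunks above, so it does not appear in the diff. A sketch of how Steps 3–4 plausibly fit together with pydub's overlay(), wrapped as a standalone helper (merge_stems is a hypothetical name, not from the app), including the dual export and return that this commit introduces:

import os
from pydub import AudioSegment

def merge_stems(stem_dir, background_audio_path="background_segments.wav",
                speech_audio_path="speech_segment.wav"):
    # Load the four Demucs stems from the directory used above.
    vocals = AudioSegment.from_wav(os.path.join(stem_dir, "vocals.wav"))
    drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
    bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
    other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))

    # overlay() mixes two segments; Demucs stems share the same length,
    # so chaining overlays yields the full-length background mix.
    background = drums.overlay(bass).overlay(other)

    # Mirror the commit's new behaviour: export both tracks, return both paths.
    background.export(background_audio_path, format="wav")
    vocals.export(speech_audio_path, format="wav")
    return background_audio_path, speech_audio_path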
@@ -186,7 +188,7 @@ def transcribe_video_with_speakers(video_path):
     video.audio.write_audiofile(audio_path)
     logger.info(f"Audio extracted from video: {audio_path}")
 
-    segment_result = segment_background_audio(audio_path)
+    segment_result, speech_audio_path = segment_background_audio(audio_path)
     print(f"Saved non-speech (background) audio to local")
 
     # Set up device
@@ -199,7 +201,7 @@ def transcribe_video_with_speakers(video_path):
     logger.info("WhisperX model loaded")
 
     # Transcribe
-    result = model.transcribe(
+    result = model.transcribe(speech_audio_path, chunk_size=6, print_progress = True)
     logger.info("Audio transcription completed")
 
     # Get the detected language
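This commit points transcription at the vocals-only track instead of the raw extracted audio. A self-contained sketch of that step; the model name, precision, and device choice are assumptions, while chunk_size=6 and print_progress=True come from the commit itself:

import whisperx

device = "cuda"  # assumption: the app selects cuda/cpu a few lines earlier
model = whisperx.load_model("large-v3", device, compute_type="float16")  # model/precision assumed

# As in the commit: transcribe the speech-only file in 6-second chunks,
# printing decoding progress.
result = model.transcribe("speech_segment.wav", chunk_size=6, print_progress=True)
detected_language = result["language"]
print(detected_language, len(result["segments"]))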
@@ -207,12 +209,12 @@ def transcribe_video_with_speakers(video_path):
     logger.debug(f"Detected language: {detected_language}")
     # Alignment
     model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result = whisperx.align(result["segments"], model_a, metadata,
+    result = whisperx.align(result["segments"], model_a, metadata, speech_audio_path, device)
     logger.info("Transcription alignment completed")
 
     # Diarization (works independently of Whisper model size)
     diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
-    diarize_segments = diarize_model(
+    diarize_segments = diarize_model(speech_audio_path)
     logger.info("Speaker diarization completed")
 
     # Assign speakers
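Alignment and diarization now also read the speech-only file. A sketch continuing from the transcription sketch above (reusing result and device), assuming hf_api_key is a Hugging Face token with access to the pyannote models; the final assign_word_speakers call is an assumption about what the "# Assign speakers" step does:

import whisperx

speech_audio_path = "speech_segment.wav"
hf_api_key = "hf_..."  # assumption: Hugging Face token with pyannote access

# Word-level alignment against the speech-only audio, as in the commit.
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, speech_audio_path, device)

# Speaker diarization on the same file, then merge speaker labels into the transcript.
diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
diarize_segments = diarize_model(speech_audio_path)
result = whisperx.assign_word_speakers(diarize_segments, result)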
@@ -243,7 +245,7 @@ def transcribe_video_with_speakers(video_path):
 
     # Collapse and truncate speaker audio
     speaker_sample_paths = {}
-    audio_clip = AudioFileClip(
+    audio_clip = AudioFileClip(speech_audio_path)
     for speaker, segments in speaker_audio.items():
         speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
         combined_clip = concatenate_audioclips(speaker_clips)
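The speaker-sample collection now reads from the speech-only track as well. A standalone sketch of the collapse-and-export loop around AudioFileClip; the speaker_audio dict and the sample file names are illustrative, not taken from the app:

from moviepy.editor import AudioFileClip, concatenate_audioclips

# Illustrative input: diarized (start, end) spans per speaker, in seconds.
speaker_audio = {
    "SPEAKER_00": [(0.0, 2.5), (7.1, 9.0)],
    "SPEAKER_01": [(3.0, 6.4)],
}

audio_clip = AudioFileClip("speech_segment.wav")  # speech-only track, as in the commit
speaker_sample_paths = {}
for speaker, segments in speaker_audio.items():
    clips = [audio_clip.subclip(start, end) for start, end in segments]
    sample = concatenate_audioclips(clips)
    sample_path = f"{speaker}_sample.wav"  # hypothetical naming scheme
    sample.write_audiofile(sample_path, logger=None)
    speaker_sample_paths[speaker] = sample_path
audio_clip.close()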
@@ -256,7 +258,7 @@ def transcribe_video_with_speakers(video_path):
     # Clean up
     video.close()
     audio_clip.close()
-    os.remove(
+    os.remove(speech_audio_path)
 
     return transcript_with_speakers, detected_language
 