qqwjq1981 committed on
Commit e57296f · verified · 1 Parent(s): 0489670

Update app.py

Files changed (1)
  1. app.py +10 -8
app.py CHANGED
@@ -125,7 +125,7 @@ def handle_feedback(feedback):
     conn.commit()
     return "Thank you for your feedback!", None
 
-def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
+def segment_background_audio(audio_path, background_audio_path="background_segments.wav", speech_audio_path="speech_segment.wav"):
     """
     Uses Demucs to separate audio and extract background (non-vocal) parts.
     Merges drums, bass, and other stems into a single background track.
@@ -142,6 +142,7 @@ def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
     stem_dir = os.path.join("separated", "htdemucs", filename)
 
     # Step 3: Load and merge background stems
+    vocals = AudioSegment.from_wav(os.path.join(stem_dir, "vocals.wav"))
     drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
     bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
     other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))
@@ -150,7 +151,8 @@ def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
 
     # Step 4: Export the merged background
     background.export(background_audio_path, format="wav")
-    return background_audio_path
+    vocals.export(speech_audio_path, format="wav")
+    return background_audio_path, speech_audio_path
 
 # def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
 #     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
@@ -186,7 +188,7 @@ def transcribe_video_with_speakers(video_path):
     video.audio.write_audiofile(audio_path)
     logger.info(f"Audio extracted from video: {audio_path}")
 
-    segment_result = segment_background_audio(audio_path)
+    segment_result, speech_audio_path = segment_background_audio(audio_path)
     print(f"Saved non-speech (background) audio to local")
 
     # Set up device
@@ -199,7 +201,7 @@ def transcribe_video_with_speakers(video_path):
     logger.info("WhisperX model loaded")
 
     # Transcribe
-    result = model.transcribe(audio_path, chunk_size=6, print_progress = True)
+    result = model.transcribe(speech_audio_path, chunk_size=6, print_progress = True)
     logger.info("Audio transcription completed")
 
     # Get the detected language
@@ -207,12 +209,12 @@ def transcribe_video_with_speakers(video_path):
     logger.debug(f"Detected language: {detected_language}")
     # Alignment
     model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
+    result = whisperx.align(result["segments"], model_a, metadata, speech_audio_path, device)
     logger.info("Transcription alignment completed")
 
     # Diarization (works independently of Whisper model size)
     diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
-    diarize_segments = diarize_model(audio_path)
+    diarize_segments = diarize_model(speech_audio_path)
     logger.info("Speaker diarization completed")
 
     # Assign speakers
@@ -243,7 +245,7 @@ def transcribe_video_with_speakers(video_path):
 
     # Collapse and truncate speaker audio
     speaker_sample_paths = {}
-    audio_clip = AudioFileClip(audio_path)
+    audio_clip = AudioFileClip(speech_audio_path)
     for speaker, segments in speaker_audio.items():
         speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
         combined_clip = concatenate_audioclips(speaker_clips)
@@ -256,7 +258,7 @@ def transcribe_video_with_speakers(video_path):
     # Clean up
     video.close()
    audio_clip.close()
-    os.remove(audio_path)
+    os.remove(speech_audio_path)
 
     return transcript_with_speakers, detected_language
 
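
For context, a minimal sketch of how the updated segment_background_audio fits together after this commit. Only the new signature, the vocals stem loading, the two exports, and the two-value return appear in the diff; the Demucs CLI call, the filename derivation, and the overlay() merge are assumptions based on Demucs's default output layout (separated/htdemucs/<track name>/) and pydub's mixing API.

import os
import subprocess

from pydub import AudioSegment


def segment_background_audio(audio_path,
                             background_audio_path="background_segments.wav",
                             speech_audio_path="speech_segment.wav"):
    """Separate audio with Demucs, then export a merged background track and the vocals stem."""
    # Steps 1-2 (assumed): run the Demucs CLI and locate its output directory;
    # the default htdemucs model writes separated/htdemucs/<track name>/{vocals,drums,bass,other}.wav
    subprocess.run(["demucs", audio_path], check=True)
    filename = os.path.splitext(os.path.basename(audio_path))[0]
    stem_dir = os.path.join("separated", "htdemucs", filename)

    # Step 3: load the stems; vocals.wav is the new speech-only source (from the commit)
    vocals = AudioSegment.from_wav(os.path.join(stem_dir, "vocals.wav"))
    drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
    bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
    other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))
    background = drums.overlay(bass).overlay(other)  # assumed merge of the non-vocal stems

    # Step 4: export both tracks and return their paths (from the commit)
    background.export(background_audio_path, format="wav")
    vocals.export(speech_audio_path, format="wav")
    return background_audio_path, speech_audio_path

Downstream, transcribe_video_with_speakers now unpacks both returned paths and feeds speech_audio_path (the isolated vocals) to WhisperX transcription, alignment, and diarization, and to AudioFileClip for the speaker samples; cleanup removes speech_audio_path rather than the originally extracted audio_path.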