Update app.py
app.py CHANGED
@@ -125,32 +125,59 @@ def handle_feedback(feedback):
     return "Thank you for your feedback!", None
 
 def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
-    ... (24 removed lines; old function body not captured in this view) ...
+    """
+    Uses Demucs to separate audio and extract background (non-vocal) parts.
+    Merges drums, bass, and other stems into a single background track.
+    """
+    # Step 1: Run Demucs using the 4-stem model
+    subprocess.run([
+        "demucs",
+        "-n", "htdemucs",  # 4-stem model
+        audio_path
+    ], check=True)
+
+    # Step 2: Locate separated stem files
+    filename = os.path.splitext(os.path.basename(audio_path))[0]
+    stem_dir = os.path.join("separated", "htdemucs", filename)
+
+    # Step 3: Load and merge background stems
+    drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
+    bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
+    other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))
+
+    background = drums.overlay(bass).overlay(other)
+
+    # Step 4: Export the merged background
+    background.export(background_audio_path, format="wav")
     return background_audio_path
 
+# def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
+#     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
+#     vad_result = pipeline(audio_path)
+
+#     full_audio = AudioSegment.from_wav(audio_path)
+#     full_duration_sec = len(full_audio) / 1000.0
+
+#     current_time = 0.0
+#     result_audio = AudioSegment.empty()
+
+#     for segment in vad_result.itersegments():
+#         # Background segment before the speech
+#         if current_time < segment.start:
+#             bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
+#             result_audio += bg
+#         # Add silence for the speech duration
+#         silence_duration = segment.end - segment.start
+#         result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
+#         current_time = segment.end
+
+#     # Handle any remaining background after the last speech
+#     if current_time < full_duration_sec:
+#         result_audio += full_audio[int(current_time * 1000):]
+
+#     result_audio.export(background_audio_path, format="wav")
+#     return background_audio_path
+
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
     video = VideoFileClip(video_path)
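As a sanity check, here is a minimal sketch of how the new Demucs-based function could be exercised on its own, assuming the demucs CLI is installed and on PATH, that it runs after the definitions in app.py (which already imports subprocess, os, and pydub's AudioSegment), and that a local sample.wav exists; the file name and the snippet itself are illustrative and not part of the commit:

    # Hypothetical smoke test, not part of app.py.
    # Assumes the demucs CLI is installed and sample.wav exists locally.
    from pydub import AudioSegment

    bg_path = segment_background_audio("sample.wav")
    bg = AudioSegment.from_wav(bg_path)
    print(f"Background track: {len(bg) / 1000.0:.1f}s at {bg.frame_rate} Hz")

The stem_dir lookup matches the Demucs CLI default of writing stems to separated/<model>/<track name>/, and pydub's overlay mixes tracks in place, so the merged background keeps the duration of the drums stem (the stems Demucs emits for one track have equal length).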