Update app.py
Browse files
app.py
CHANGED
|
@@ -120,7 +120,14 @@ def handle_feedback(feedback):
|
|
| 120 |
conn.commit()
|
| 121 |
return "Thank you for your feedback!", None
|
| 122 |
|
| 123 |
-
def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
|
| 125 |
vad_result = pipeline(audio_path)
|
| 126 |
|
|
@@ -131,16 +138,19 @@ def segment_background_audio(audio_path, background_audio_path="background_segme
|
|
| 131 |
result_audio = AudioSegment.empty()
|
| 132 |
|
| 133 |
for segment in vad_result.itersegments():
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
| 137 |
result_audio += bg
|
| 138 |
-
# Add silence for the speech duration
|
| 139 |
-
silence_duration = segment.end - segment.start
|
| 140 |
-
result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
|
| 141 |
-
current_time = segment.end
|
| 142 |
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
if current_time < full_duration_sec:
|
| 145 |
result_audio += full_audio[int(current_time * 1000):]
|
| 146 |
|
|
|
|
| 120 |
conn.commit()
|
| 121 |
return "Thank you for your feedback!", None
|
| 122 |
|
| 123 |
+
def segment_background_audio(audio_path, background_audio_path="background_segments.wav", speech_padding=0.15):
|
| 124 |
+
"""
|
| 125 |
+
Replace detected speech in the audio with silence, keeping only the background.
|
| 126 |
+
Each speech segment is widened by ``speech_padding`` seconds on both sides (clamped to the audio bounds) to reduce overlap/bleed.
|
| 127 |
+
"""
|
| 128 |
+
from pyannote.audio import Pipeline
|
| 129 |
+
from pydub import AudioSegment
|
| 130 |
+
|
| 131 |
pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
|
| 132 |
vad_result = pipeline(audio_path)
|
| 133 |
|
|
|
|
| 138 |
result_audio = AudioSegment.empty()
|
| 139 |
|
| 140 |
for segment in vad_result.itersegments():
|
| 141 |
+
start = max(0.0, segment.start - speech_padding)
|
| 142 |
+
end = min(full_duration_sec, segment.end + speech_padding)
|
| 143 |
+
|
| 144 |
+
# Extract non-speech segment before speech starts
|
| 145 |
+
if current_time < start:
|
| 146 |
+
bg = full_audio[int(current_time * 1000):int(start * 1000)]
|
| 147 |
result_audio += bg
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
+
# Replace speech (plus margin) with silence
|
| 150 |
+
result_audio += AudioSegment.silent(duration=int((end - start) * 1000))
|
| 151 |
+
current_time = end
|
| 152 |
+
|
| 153 |
+
# Add trailing background after the last segment
|
| 154 |
if current_time < full_duration_sec:
|
| 155 |
result_audio += full_audio[int(current_time * 1000):]
|
| 156 |
|