Spaces:

Curify
/

studio_V1

Sleeping

qqwjq1981 committed on Apr 9

Commit

447f785

verified ·

1 Parent(s): f256463

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -120,7 +120,14 @@ def handle_feedback(feedback):
             conn.commit()
         return "Thank you for your feedback!", None
-def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
     vad_result = pipeline(audio_path)
@@ -131,16 +138,19 @@ def segment_background_audio(audio_path, background_audio_path="background_segme
     result_audio = AudioSegment.empty()
     for segment in vad_result.itersegments():
-        # Background segment before the speech
-        if current_time < segment.start:
-            bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
             result_audio += bg
-        # Add silence for the speech duration
-        silence_duration = segment.end - segment.start
-        result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
-        current_time = segment.end
-    # Handle any remaining background after the last speech
     if current_time < full_duration_sec:
         result_audio += full_audio[int(current_time * 1000):]

             conn.commit()
         return "Thank you for your feedback!", None
+def segment_background_audio(audio_path, background_audio_path="background_segments.wav", speech_padding=0.15):
+    """
+    Segments and removes speech from audio, returning only background.
+    Padding is applied around speech segments to reduce overlap/bleed.
+    """
+    from pyannote.audio import Pipeline
+    from pydub import AudioSegment
     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
     vad_result = pipeline(audio_path)
     result_audio = AudioSegment.empty()
     for segment in vad_result.itersegments():
+        start = max(0.0, segment.start - speech_padding)
+        end = min(full_duration_sec, segment.end + speech_padding)
+        # Extract non-speech segment before speech starts
+        if current_time < start:
+            bg = full_audio[int(current_time * 1000):int(start * 1000)]
             result_audio += bg
+        # Replace speech (plus margin) with silence
+        result_audio += AudioSegment.silent(duration=int((end - start) * 1000))
+        current_time = end
+    # Add trailing background after the last segment
     if current_time < full_duration_sec:
         result_audio += full_audio[int(current_time * 1000):]