Spaces:

ZDingman
/

zacks-audio-outpost-denoiser

Sleeping

App Files Community

ZDingman committed 5 days ago

Commit

1430d22

verified ·

1 Parent(s): 2ee63b1

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -24

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # app.py
-# Zack's Audio Outpost — AI Noise Reducer (SpeechBrain MetricGAN)
-# Works on CPU in a Hugging Face Space. No GPU required.
 import os
 from typing import Tuple
@@ -14,7 +14,7 @@ from speechbrain.pretrained import SpectralMaskEnhancement
 # -----------------------------
 # Config
 # -----------------------------
-TARGET_SR = 16_000  # The SpeechBrain mtl-mimic-voicebank model expects 16 kHz mono
 # Wet/dry mix by "strength"
 MIX_BY_STRENGTH = {
@@ -23,8 +23,9 @@ MIX_BY_STRENGTH = {
     "Strong": 1.00,  # 100% wet
 }
-MODEL_SOURCE = "speechbrain/mtl-mimic-voicebank"
-MODEL_DIR = "pretrained_models/mtl-mimic-voicebank"
 # Global enhancer (loaded once)
 _enhancer: SpectralMaskEnhancement | None = None
@@ -34,9 +35,9 @@ def get_enhancer() -> SpectralMaskEnhancement:
     """Lazy-load the SpeechBrain enhancer once."""
     global _enhancer
     if _enhancer is None:
-        # Downloads the small MetricGAN+ checkpoint on first run
         _enhancer = SpectralMaskEnhancement.from_hparams(
-            source=MODEL_SOURCE, savedir=MODEL_DIR
         )
         _enhancer.mods.eval()
         torch.set_grad_enabled(False)
@@ -51,13 +52,12 @@ def to_mono(x: np.ndarray) -> np.ndarray:
     Ensure mono. Accepts shapes:
       - (time,) already mono
       - (time, channels) -> average channels
-      - (channels, time) (rare) -> average channels, return (time,)
     Returns float32 -1..1
     """
     if x.ndim == 1:
         y = x
     elif x.ndim == 2:
-        # pick which axis is channels
         if x.shape[0] < x.shape[1]:
             # (channels, time)
             y = x.mean(axis=0)
@@ -95,14 +95,12 @@ def denoise_numpy(audio: Tuple[int, np.ndarray], strength: str) -> Tuple[Tuple[i
       Both as float32 in [-1, 1]
     """
     if audio is None:
-        # Nothing uploaded
         return None, None
     in_sr, in_wav = audio
     if in_wav is None or in_wav.size == 0:
         return None, None
-    # Normalize types just in case
     in_wav = in_wav.astype(np.float32, copy=False)
     # Prepare input for model (mono, 16k)
@@ -113,22 +111,19 @@ def denoise_numpy(audio: Tuple[int, np.ndarray], strength: str) -> Tuple[Tuple[i
     # Enhance
     enhancer = get_enhancer()
-    enhanced = enhancer.enhance_batch(wav16, lengths=lengths)  # (1, time)
-    enhanced = enhanced.squeeze(0)  # (time,)
     dry = wav16.squeeze(0)
     # Wet/dry mix
     mix = MIX_BY_STRENGTH.get(strength, MIX_BY_STRENGTH["Medium"])
-    out = dry * (1.0 - mix) + enhanced * mix  # (time,)
-    # Clamp just in case, then back to numpy
     y = torch.clamp(out, -1.0, 1.0).cpu().numpy().astype(np.float32)
-    # For "Original", we return the user’s uploaded audio unmodified
-    # (Gradio prefers (sr, waveform) for type="numpy")
-    original = (in_sr, to_mono(in_wav))  # make sure it plays as mono
     processed = (TARGET_SR, y)
     return original, processed
@@ -136,7 +131,6 @@ def denoise_numpy(audio: Tuple[int, np.ndarray], strength: str) -> Tuple[Tuple[i
 # UI
 # -----------------------------
 CSS = """
-/* simple brand-ish tweaks */
 .gradio-container { max-width: 1100px !important; }
 #title { font-weight: 700; font-size: 1.4rem; margin-bottom: .25rem; }
 #subtitle { opacity: .8; margin-bottom: .75rem; }
@@ -149,9 +143,9 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
     with gr.Row():
         audio_in = gr.Audio(
             sources=["upload"],
-            type="numpy",          # returns (sr, np.ndarray)
             label="Upload Audio",
-            waveform_options=gr.WaveformOptions(show_controls=True),
         )
         strength = gr.Radio(
             choices=["Light", "Medium", "Strong"],
@@ -167,7 +161,5 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
     btn.click(denoise_numpy, inputs=[audio_in, strength], outputs=[out_orig, out_proc])
-# Recommended: SSR is fine on Spaces; leave default
 if __name__ == "__main__":
-    # In Spaces this is ignored; locally it runs on http://0.0.0.0:7860
     demo.launch()

 # app.py
+# Zack's Audio Outpost — AI Noise Reducer (SpeechBrain MetricGAN+)
+# CPU-friendly; provides Light/Medium/Strong wet mix and Original vs Processed.
 import os
 from typing import Tuple
 # -----------------------------
 # Config
 # -----------------------------
+TARGET_SR = 16_000  # MetricGAN+ expects 16 kHz mono
 # Wet/dry mix by "strength"
 MIX_BY_STRENGTH = {
     "Strong": 1.00,  # 100% wet
 }
+# ✅ Correct SpeechBrain model for SpectralMaskEnhancement
+MODEL_SOURCE = "speechbrain/metricgan-plus-voicebank"
+MODEL_DIR = "pretrained_models/metricgan-plus-voicebank"
 # Global enhancer (loaded once)
 _enhancer: SpectralMaskEnhancement | None = None
     """Lazy-load the SpeechBrain enhancer once."""
     global _enhancer
     if _enhancer is None:
         _enhancer = SpectralMaskEnhancement.from_hparams(
+            source=MODEL_SOURCE,
+            savedir=MODEL_DIR,
         )
         _enhancer.mods.eval()
         torch.set_grad_enabled(False)
     Ensure mono. Accepts shapes:
       - (time,) already mono
       - (time, channels) -> average channels
+      - (channels, time) -> average channels, return (time,)
     Returns float32 -1..1
     """
     if x.ndim == 1:
         y = x
     elif x.ndim == 2:
         if x.shape[0] < x.shape[1]:
             # (channels, time)
             y = x.mean(axis=0)
       Both as float32 in [-1, 1]
     """
     if audio is None:
         return None, None
     in_sr, in_wav = audio
     if in_wav is None or in_wav.size == 0:
         return None, None
     in_wav = in_wav.astype(np.float32, copy=False)
     # Prepare input for model (mono, 16k)
     # Enhance
     enhancer = get_enhancer()
+    enhanced = enhancer.enhance_batch(wav16, lengths=lengths).squeeze(0)  # (time,)
     dry = wav16.squeeze(0)
     # Wet/dry mix
     mix = MIX_BY_STRENGTH.get(strength, MIX_BY_STRENGTH["Medium"])
+    out = dry * (1.0 - mix) + enhanced * mix
+    # Clamp & back to numpy
     y = torch.clamp(out, -1.0, 1.0).cpu().numpy().astype(np.float32)
+    # Return original (mono copy for consistent playback) + processed @16k
+    original = (in_sr, to_mono(in_wav))
     processed = (TARGET_SR, y)
     return original, processed
 # UI
 # -----------------------------
 CSS = """
 .gradio-container { max-width: 1100px !important; }
 #title { font-weight: 700; font-size: 1.4rem; margin-bottom: .25rem; }
 #subtitle { opacity: .8; margin-bottom: .75rem; }
     with gr.Row():
         audio_in = gr.Audio(
             sources=["upload"],
+            type="numpy",  # returns (sr, np.ndarray)
             label="Upload Audio",
+            # show_controls is deprecated; we leave default controls on
         )
         strength = gr.Radio(
             choices=["Light", "Medium", "Strong"],
     btn.click(denoise_numpy, inputs=[audio_in, strength], outputs=[out_orig, out_proc])
 if __name__ == "__main__":
     demo.launch()