Spaces:

ZDingman
/

zacks-audio-outpost-denoiser

Sleeping

App Files Files Community

ZDingman committed 8 days ago

Commit

683fcfc

verified ·

1 Parent(s): f7b8d3f

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -69

app.py CHANGED Viewed

@@ -1,116 +1,92 @@
-import os
 import numpy as np
 import gradio as gr
 import soundfile as sf
 import torch
-# Try torchaudio for resampling. If it's not usable, fall back to SciPy.
-USE_TORCHAUDIO = True
-try:
-    import torchaudio
-    import torchaudio.functional as AF
-except Exception:
-    USE_TORCHAUDIO = False
-    from scipy.signal import resample_poly
-# SpeechBrain MetricGAN+ enhancement (CPU)
-from speechbrain.pretrained import SpectralMaskEnhancement
-torch.set_num_threads(1)
 DEVICE = "cpu"
-MODEL_ID = "speechbrain/metricgan-plus-voicebank"
-# Load the enhancer once
-ENHANCER = SpectralMaskEnhancement.from_hparams(
-    source=MODEL_ID,
-    savedir="pretrained_metricganp",
-    run_opts={"device": DEVICE}
-)
-TARGET_SR = 16000  # MetricGAN+ expects 16 kHz
 def _to_mono(x: np.ndarray) -> np.ndarray:
-    # x shape: (samples,) or (samples, channels)
     if x.ndim == 2 and x.shape[1] > 1:
-        return np.mean(x, axis=1, dtype=np.float32)
-    return x.astype(np.float32, copy=False)
 def _resample(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
     if sr_in == sr_out:
-        return x
-    if USE_TORCHAUDIO:
-        with torch.no_grad():
-            t = torch.from_numpy(x).unsqueeze(0)  # (1, time)
-            y = AF.resample(t, orig_freq=sr_in, new_freq=sr_out)
-            return y.squeeze(0).cpu().numpy().astype(np.float32)
-    # SciPy fall-back
     g = np.gcd(sr_in, sr_out)
     up, down = sr_out // g, sr_in // g
     y = resample_poly(x, up, down).astype(np.float32)
     return y
 def _mix(dry: np.ndarray, wet: np.ndarray, strength: str) -> np.ndarray:
-    # Light / Medium / Strong → wet mix amounts
     mix = {"Light": 0.4, "Medium": 0.7, "Strong": 1.0}.get(strength, 0.7)
-    # pad/truncate to the same length
     n = min(len(dry), len(wet))
-    out = dry[:n] * (1.0 - mix) + wet[:n] * mix
-    return out
 def denoise(audio: tuple, strength: str):
-    """
-    Gradio passes (sr, np.ndarray[int16/float32, shape=(n,) or (n, ch)]) when type='numpy'
-    Return the processed audio as (sr, np.ndarray[float32]).
-    """
     if audio is None:
         raise gr.Error("Please upload an audio file.")
     sr, data = audio
-    if isinstance(data, list):
-        data = np.array(data, dtype=np.float32)
-    # To mono, float32 in [-1, 1]
-    x_mono = _to_mono(data)
-    x_mono = np.clip(x_mono, -1.0, 1.0).astype(np.float32)
-    # Resample to 16 kHz for the model
-    x_16k = _resample(x_mono, sr_in=sr, sr_out=TARGET_SR)
-    # Enhance with MetricGAN+
     with torch.no_grad():
-        # Enhance expects torch.Tensor: shape (batch, time)
-        inp = torch.from_numpy(x_16k).unsqueeze(0)
-        enhanced = ENHANCER.enhance_batch(inp, TARGET_SR)
         if isinstance(enhanced, torch.Tensor):
             enhanced = enhanced.squeeze(0).cpu().numpy().astype(np.float32)
-    # Back to original sample rate
-    enhanced_sr = _resample(enhanced, sr_in=TARGET_SR, sr_out=sr)
-    # Mix according to strength (preserve dry transients)
-    y = _mix(dry=x_mono, wet=enhanced_sr, strength=strength)
-    # Return as mono track at original sr
-    return (sr, y.astype(np.float32))
-# ---------- UI ----------
-with gr.Blocks(theme=gr.themes.Soft(), css="footer {visibility:hidden}") as demo:
     gr.Markdown("### Zack’s Audio Outpost — AI Noise Reducer\nUpload a file and compare **Original vs Processed**.")
     with gr.Row():
         audio_in = gr.Audio(type="numpy", label="Upload Audio")
-        strength = gr.Radio(["Light", "Medium", "Strong"], value="Medium", label="Noise Reduction Strength")
     run_btn = gr.Button("Run Noise Reduction", variant="primary")
     with gr.Row():
         orig = gr.Audio(label="Original")
-        clean = gr.Audio(label="Processed")
-    def run(audio, strength):
         if audio is None:
             raise gr.Error("Please upload an audio file.")
-        sr, data = audio
-        processed = denoise((sr, data), strength)
-        return (sr, data), processed
-    run_btn.click(fn=run, inputs=[audio_in, strength], outputs=[orig, clean])
 demo.launch()

 import numpy as np
 import gradio as gr
 import soundfile as sf
+from scipy.signal import resample_poly
 import torch
+# Lazy import to avoid failing at build time
+ENHANCER = None
+TARGET_SR = 16000  # MetricGAN+ expects 16 kHz
 DEVICE = "cpu"
+torch.set_num_threads(1)
+def get_enhancer():
+    global ENHANCER
+    if ENHANCER is None:
+        from speechbrain.pretrained import SpectralMaskEnhancement
+        ENHANCER = SpectralMaskEnhancement.from_hparams(
+            source="speechbrain/metricgan-plus-voicebank",
+            savedir="pretrained_metricganp",
+            run_opts={"device": DEVICE}
+        )
+    return ENHANCER
 def _to_mono(x: np.ndarray) -> np.ndarray:
+    # x shape: (n,) or (n, ch); keep as float32 in [-1,1]
     if x.ndim == 2 and x.shape[1] > 1:
+        x = np.mean(x, axis=1)
+    x = np.asarray(x, dtype=np.float32)
+    return np.clip(x, -1.0, 1.0)
 def _resample(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
     if sr_in == sr_out:
+        return x.astype(np.float32, copy=False)
     g = np.gcd(sr_in, sr_out)
     up, down = sr_out // g, sr_in // g
     y = resample_poly(x, up, down).astype(np.float32)
     return y
 def _mix(dry: np.ndarray, wet: np.ndarray, strength: str) -> np.ndarray:
     mix = {"Light": 0.4, "Medium": 0.7, "Strong": 1.0}.get(strength, 0.7)
     n = min(len(dry), len(wet))
+    return dry[:n] * (1.0 - mix) + wet[:n] * mix
 def denoise(audio: tuple, strength: str):
     if audio is None:
         raise gr.Error("Please upload an audio file.")
     sr, data = audio
+    data = np.asarray(data)  # gradio sometimes gives list
+    # to mono + float32
+    dry_mono = _to_mono(data)
+    # resample to 16k
+    x16 = _resample(dry_mono, sr_in=sr, sr_out=TARGET_SR)
+    # run enhancer (lazy load)
+    enhancer = get_enhancer()
     with torch.no_grad():
+        inp = torch.from_numpy(x16).unsqueeze(0)  # (1, time)
+        enhanced = enhancer.enhance_batch(inp, TARGET_SR)
         if isinstance(enhanced, torch.Tensor):
             enhanced = enhanced.squeeze(0).cpu().numpy().astype(np.float32)
+    # back to original SR
+    enh_sr = _resample(enhanced, sr_in=TARGET_SR, sr_out=sr)
+    # wet/dry
+    out = _mix(dry_mono, enh_sr, strength)
+    return (sr, out.astype(np.float32))
+# -------- UI --------
+with gr.Blocks(theme=gr.themes.Soft(), css="footer{visibility:hidden}") as demo:
     gr.Markdown("### Zack’s Audio Outpost — AI Noise Reducer\nUpload a file and compare **Original vs Processed**.")
     with gr.Row():
         audio_in = gr.Audio(type="numpy", label="Upload Audio")
+        strength = gr.Radio(["Light","Medium","Strong"], value="Medium", label="Noise Reduction Strength")
     run_btn = gr.Button("Run Noise Reduction", variant="primary")
     with gr.Row():
         orig = gr.Audio(label="Original")
+        proc = gr.Audio(label="Processed")
+    def run(audio, s):
         if audio is None:
             raise gr.Error("Please upload an audio file.")
+        sr, x = audio
+        y = denoise(audio, s)
+        return (sr, x), y
+    run_btn.click(run, [audio_in, strength], [orig, proc])
 demo.launch()