Spaces:

ZDingman
/

zacks-audio-outpost-denoiser

Running

App Files Files Community

ZDingman committed 10 days ago

Commit

fc270e2

verified ·

1 Parent(s): 56349a1

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -75

app.py CHANGED Viewed

@@ -1,106 +1,116 @@
-import gradio as gr
 import numpy as np
 import torch
-# Try torchaudio for high-quality resampling; fall back to scipy if unavailable
 try:
     import torchaudio
-    HAS_TORCHAUDIO = True
 except Exception:
-    HAS_TORCHAUDIO = False
     from scipy.signal import resample_poly
 from speechbrain.pretrained import SpectralMaskEnhancement
-# Download once and cache in the Space
 ENHANCER = SpectralMaskEnhancement.from_hparams(
-    source="speechbrain/metricgan-plus-voicebank",
-    savedir="pretrained/metricgan-plus-voicebank",
 )
-TARGET_SR = 16000  # model expects 16 kHz
-def _resample(x: torch.Tensor, in_sr: int, out_sr: int) -> torch.Tensor:
-    if in_sr == out_sr:
-        return x
-    if HAS_TORCHAUDIO:
-        return torchaudio.functional.resample(x, in_sr, out_sr)
-    # fallback (scipy) — expects numpy
-    xn = x.cpu().numpy()
-    g = np.gcd(in_sr, out_sr)
-    up, down = out_sr // g, in_sr // g
-    y = resample_poly(xn, up, down).astype(np.float32)
-    return torch.from_numpy(y)
-def _to_tensor(mono_np: np.ndarray) -> torch.Tensor:
-    t = torch.from_numpy(mono_np.astype(np.float32))
-    peak = t.abs().max().clamp(min=1e-8)
-    return (t / peak)
-def _enhance_channel(wav_np: np.ndarray, in_sr: int, mix: float) -> np.ndarray:
-    """Enhance one channel and wet/dry mix."""
-    x = _to_tensor(wav_np)                # shape [T]
-    x16 = _resample(x, in_sr, TARGET_SR)  # -> 16 kHz
-    with torch.no_grad():
-        # ✅ Correct call: enhance_batch(wavs [, lengths]), NOT sample_rate
-        # expects [B, T]; returns [B, T]
-        est16 = ENHANCER.enhance_batch(x16.unsqueeze(0))[0]
-    # back to original sr and length
-    est = _resample(est16, TARGET_SR, in_sr)
-    if est.shape[0] >= x.shape[0]:
-        est = est[: x.shape[0]]
-    else:
-        est = torch.nn.functional.pad(est, (0, x.shape[0] - est.shape[0]))
-    y = (1.0 - mix) * x + mix * est
-    return y.cpu().numpy()
-def denoise(audio, strength):
     """
-    Gradio (type='numpy') -> (sample_rate:int, data: np.ndarray)
-    data is [T] or [T,2].
     """
-    try:
-        if audio is None:
-            return None, None
-        sr, data = audio
-        chs = [data] if data.ndim == 1 else [data[:, 0], data[:, 1]]
-        mix_map = {"Light": 0.5, "Medium": 0.75, "Strong": 1.0}
-        mix = mix_map.get(strength, 0.75)
-        out_chs = [_enhance_channel(c, sr, mix) for c in chs]
-        processed = (np.stack(out_chs, axis=1) if len(out_chs) == 2
-                     else out_chs[0])
-        # Return (sr, audio) tuples for A/B players
-        return (sr, data), (sr, processed)
-    except Exception as e:
-        # Show a readable error in the UI
-        msg = f"Processing error: {type(e).__name__}: {e}"
-        print(msg)
-        raise RuntimeError(msg)
-# -------- UI --------
-with gr.Blocks(css="footer {visibility: hidden}") as demo:
-    gr.Markdown("## Zack’s Audio Outpost — AI Noise Reducer\nUpload a file and compare **Original** vs **Processed**.")
     with gr.Row():
         audio_in = gr.Audio(type="numpy", label="Upload Audio")
-        strength = gr.Radio(["Light", "Medium", "Strong"], value="Medium",
-                            label="Noise Reduction Strength")
-    run = gr.Button("Run Noise Reduction", variant="primary")
     with gr.Row():
-        out_orig = gr.Audio(label="Original Audio")
-        out_proc = gr.Audio(label="Processed Audio")
-    run.click(denoise, inputs=[audio_in, strength], outputs=[out_orig, out_proc])
 demo.launch()

+import os
 import numpy as np
+import gradio as gr
+import soundfile as sf
 import torch
+# Try torchaudio for resampling. If it's not usable, fall back to SciPy.
+USE_TORCHAUDIO = True
 try:
     import torchaudio
+    import torchaudio.functional as AF
 except Exception:
+    USE_TORCHAUDIO = False
     from scipy.signal import resample_poly
+# SpeechBrain MetricGAN+ enhancement (CPU)
 from speechbrain.pretrained import SpectralMaskEnhancement
# Limit PyTorch to one intra-op thread.
# NOTE(review): presumably chosen to suit a small shared CPU host — confirm.
torch.set_num_threads(1)

# Inference settings: everything runs on the CPU.
DEVICE = "cpu"
MODEL_ID = "speechbrain/metricgan-plus-voicebank"
TARGET_SR = 16000  # MetricGAN+ expects 16 kHz

# Build the enhancer a single time at import; checkpoint files are stored
# under ``savedir`` so later start-ups can reuse them.
ENHANCER = SpectralMaskEnhancement.from_hparams(
    source=MODEL_ID, savedir="pretrained_metricganp", run_opts={"device": DEVICE}
)
+def _to_mono(x: np.ndarray) -> np.ndarray:
+    # x shape: (samples,) or (samples, channels)
+    if x.ndim == 2 and x.shape[1] > 1:
+        return np.mean(x, axis=1, dtype=np.float32)
+    return x.astype(np.float32, copy=False)
+def _resample(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
+    if sr_in == sr_out:
+        return x
+    if USE_TORCHAUDIO:
+        with torch.no_grad():
+            t = torch.from_numpy(x).unsqueeze(0)  # (1, time)
+            y = AF.resample(t, orig_freq=sr_in, new_freq=sr_out)
+            return y.squeeze(0).cpu().numpy().astype(np.float32)
+    # SciPy fall-back
+    g = np.gcd(sr_in, sr_out)
+    up, down = sr_out // g, sr_in // g
+    y = resample_poly(x, up, down).astype(np.float32)
+    return y
+def _mix(dry: np.ndarray, wet: np.ndarray, strength: str) -> np.ndarray:
+    # Light / Medium / Strong → wet mix amounts
+    mix = {"Light": 0.4, "Medium": 0.7, "Strong": 1.0}.get(strength, 0.7)
+    # pad/truncate to the same length
+    n = min(len(dry), len(wet))
+    out = dry[:n] * (1.0 - mix) + wet[:n] * mix
+    return out
def denoise(audio: tuple, strength: str):
    """Denoise an uploaded clip with MetricGAN+ and blend it with the original.

    Args:
        audio: ``(sr, data)`` as delivered by a Gradio ``Audio`` component
            with ``type='numpy'``. ``data`` is an integer or float array
            shaped ``(n,)`` or ``(n, ch)``.
        strength: ``"Light"``, ``"Medium"`` or ``"Strong"``; forwarded to
            ``_mix`` to choose the wet/dry ratio.

    Returns:
        ``(sr, samples)`` where ``samples`` is a mono float32 array at the
        original sample rate.

    Raises:
        gr.Error: If no audio was supplied or the clip contains no samples.
    """
    if audio is None:
        raise gr.Error("Please upload an audio file.")

    sr, data = audio
    if isinstance(data, list):
        data = np.array(data, dtype=np.float32)
    data = np.asarray(data)
    if data.size == 0:
        raise gr.Error("The uploaded audio is empty.")

    # Integer PCM must be scaled into [-1, 1] before clipping. Clipping raw
    # integer samples would squash the waveform to a near-square wave.
    if np.issubdtype(data.dtype, np.integer):
        info = np.iinfo(data.dtype)
        if info.min < 0:
            data = data.astype(np.float32) / (float(info.max) + 1.0)
        else:
            # Unsigned PCM is centred on the midpoint of its range.
            half = (float(info.max) + 1.0) / 2.0
            data = (data.astype(np.float32) - half) / half

    # To mono, float32 in [-1, 1]. reshape(-1) guarantees a 1-D signal.
    x_mono = _to_mono(data).reshape(-1)
    x_mono = np.clip(x_mono, -1.0, 1.0).astype(np.float32)

    # Resample to 16 kHz for the model
    x_16k = _resample(x_mono, sr_in=sr, sr_out=TARGET_SR)

    # Enhance with MetricGAN+
    with torch.no_grad():
        # enhance_batch takes (batch, time) waveforms. Its second argument is
        # the relative length of each item (1.0 = full length), NOT the
        # sample rate.
        inp = torch.from_numpy(x_16k).unsqueeze(0)
        enhanced = ENHANCER.enhance_batch(inp, lengths=torch.tensor([1.0]))
    if isinstance(enhanced, torch.Tensor):
        enhanced = enhanced.squeeze(0).cpu().numpy().astype(np.float32)

    # Back to original sample rate
    enhanced_sr = _resample(enhanced, sr_in=TARGET_SR, sr_out=sr)

    # Mix according to strength (preserve dry transients)
    y = _mix(dry=x_mono, wet=enhanced_sr, strength=strength)

    # Return as mono track at original sr
    return (sr, y.astype(np.float32))
# ---------- UI ----------
# Layout: upload + strength selector on the first row, a run button, then
# side-by-side players for the untouched and the processed audio.
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="footer {visibility:hidden}",
) as demo:
    gr.Markdown(
        "### Zack’s Audio Outpost — AI Noise Reducer\n"
        "Upload a file and compare **Original vs Processed**."
    )

    with gr.Row():
        audio_in = gr.Audio(type="numpy", label="Upload Audio")
        strength = gr.Radio(
            ["Light", "Medium", "Strong"],
            value="Medium",
            label="Noise Reduction Strength",
        )

    run_btn = gr.Button("Run Noise Reduction", variant="primary")

    with gr.Row():
        orig = gr.Audio(label="Original")
        clean = gr.Audio(label="Processed")

    def run(audio, strength):
        """Button callback: return the input clip and its denoised version."""
        if audio is None:
            raise gr.Error("Please upload an audio file.")
        sample_rate, samples = audio
        original_clip = (sample_rate, samples)
        return original_clip, denoise((sample_rate, samples), strength)

    run_btn.click(fn=run, inputs=[audio_in, strength], outputs=[orig, clean])

demo.launch()