Spaces:

ZDingman
/

zacks-audio-outpost-denoiser

Running

App Files Files Community

ZDingman committed 18 days ago

Commit

fbb28c2

verified ·

1 Parent(s): 7dc97a7

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -90

app.py CHANGED Viewed

@@ -1,92 +1,46 @@
-import numpy as np
-import gradio as gr
-import soundfile as sf
-from scipy.signal import resample_poly
 import torch
-# Lazy import to avoid failing at build time
-ENHANCER = None
-TARGET_SR = 16000  # MetricGAN+ expects 16 kHz
-DEVICE = "cpu"
-torch.set_num_threads(1)
-def get_enhancer():
-    global ENHANCER
-    if ENHANCER is None:
-        from speechbrain.pretrained import SpectralMaskEnhancement
-        ENHANCER = SpectralMaskEnhancement.from_hparams(
-            source="speechbrain/metricgan-plus-voicebank",
-            savedir="pretrained_metricganp",
-            run_opts={"device": DEVICE}
-        )
-    return ENHANCER
-def _to_mono(x: np.ndarray) -> np.ndarray:
-    # x shape: (n,) or (n, ch); keep as float32 in [-1,1]
-    if x.ndim == 2 and x.shape[1] > 1:
-        x = np.mean(x, axis=1)
-    x = np.asarray(x, dtype=np.float32)
-    return np.clip(x, -1.0, 1.0)
-def _resample(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
-    if sr_in == sr_out:
-        return x.astype(np.float32, copy=False)
-    g = np.gcd(sr_in, sr_out)
-    up, down = sr_out // g, sr_in // g
-    y = resample_poly(x, up, down).astype(np.float32)
-    return y
-def _mix(dry: np.ndarray, wet: np.ndarray, strength: str) -> np.ndarray:
-    mix = {"Light": 0.4, "Medium": 0.7, "Strong": 1.0}.get(strength, 0.7)
-    n = min(len(dry), len(wet))
-    return dry[:n] * (1.0 - mix) + wet[:n] * mix
-def denoise(audio: tuple, strength: str):
-    if audio is None:
-        raise gr.Error("Please upload an audio file.")
-    sr, data = audio
-    data = np.asarray(data)  # gradio sometimes gives list
-    # to mono + float32
-    dry_mono = _to_mono(data)
-    # resample to 16k
-    x16 = _resample(dry_mono, sr_in=sr, sr_out=TARGET_SR)
-    # run enhancer (lazy load)
-    enhancer = get_enhancer()
-    with torch.no_grad():
-        inp = torch.from_numpy(x16).unsqueeze(0)  # (1, time)
-        enhanced = enhancer.enhance_batch(inp, TARGET_SR)
-        if isinstance(enhanced, torch.Tensor):
-            enhanced = enhanced.squeeze(0).cpu().numpy().astype(np.float32)
-    # back to original SR
-    enh_sr = _resample(enhanced, sr_in=TARGET_SR, sr_out=sr)
-    # wet/dry
-    out = _mix(dry_mono, enh_sr, strength)
-    return (sr, out.astype(np.float32))
-# -------- UI --------
-with gr.Blocks(theme=gr.themes.Soft(), css="footer{visibility:hidden}") as demo:
-    gr.Markdown("### Zack’s Audio Outpost — AI Noise Reducer\nUpload a file and compare **Original vs Processed**.")
-    with gr.Row():
-        audio_in = gr.Audio(type="numpy", label="Upload Audio")
-        strength = gr.Radio(["Light","Medium","Strong"], value="Medium", label="Noise Reduction Strength")
-    run_btn = gr.Button("Run Noise Reduction", variant="primary")
-    with gr.Row():
-        orig = gr.Audio(label="Original")
-        proc = gr.Audio(label="Processed")
-    def run(audio, s):
-        if audio is None:
-            raise gr.Error("Please upload an audio file.")
-        sr, x = audio
-        y = denoise(audio, s)
-        return (sr, x), y
-    run_btn.click(run, [audio_in, strength], [orig, proc])
-demo.launch()

 import torch
+import torchaudio
+import numpy as np
+# Sample rate (Hz) that input audio is resampled to before enhancement; it is
+# also the sample rate denoise() reports for its output.
+TARGET_SR = 16000  # model expects 16 kHz
+# strength -> wet mix: fraction of the enhanced ("wet") signal in the output.
+# The remainder (1 - mix) is the unprocessed ("dry") signal, so "Strong"
+# returns the enhanced signal only.
+MIX_BY_STRENGTH = {
+    "Light": 0.5,
+    "Medium": 0.75,
+    "Strong": 1.0,
+}
+def _to_16k_mono(x: np.ndarray, sr: int) -> torch.Tensor:
+    """x: (time,) or (time, channels) float32 -1..1 -> torch (1, time) @16k"""
+    if x.ndim == 2:  # stereo -> mono average
+        x = x.mean(axis=1)
+    wav = torch.from_numpy(x.astype(np.float32))  # (time,)
+    if sr != TARGET_SR:
+        wav = torchaudio.functional.resample(wav, sr, TARGET_SR)
+    return wav.unsqueeze(0)  # (1, time)
+@torch.no_grad()
+def denoise(audio, strength):
+    # audio comes from gradio as (sr, np.ndarray) or filepath depending on your IO
+    # If you already have (sr, np.ndarray) upstream, keep that. Example below assumes tuple:
+    sr, x = audio  # x shape (time, [channels]) float32 -1..1
+    # to 16k mono
+    wav16 = _to_16k_mono(x, sr)             # (1, time) torch.float32
+    lengths = torch.tensor([1.0])           # full-length (relative) as required
+    # Run SpeechBrain enhancer (already created as `enhancer`)
+    enhanced = enhancer.enhance_batch(wav16, lengths=lengths)  # (1, time)
+    enhanced = enhanced.squeeze(0)          # (time,)
+    dry = wav16.squeeze(0)
+    # Wet/dry mix per UI strength
+    mix = MIX_BY_STRENGTH.get(strength, 0.75)
+    out = dry * (1.0 - mix) + enhanced * mix
+    # back to numpy @16k
+    y = out.cpu().numpy().astype(np.float32)
+    # Return (sr, waveform) to Gradio (or whatever your interface expects)
+    return (TARGET_SR, y)