ZDingman committed on
Commit 0af1c3f · verified · 1 parent: e2a2b71

Update app.py

Files changed (1): app.py (+86 −69)
app.py CHANGED
@@ -1,79 +1,96 @@
  import gradio as gr
  import numpy as np
- import soundfile as sf
-
- # DeepFilterNet2
- from df.enhance import enhance, init_df
-
- APP_TITLE = "Zack’s Audio Outpost — Voice Denoiser (DeepFilterNet2)"
- APP_DESC = (
-     "Upload a voice clip with traffic/hiss/room noise and compare Original vs Processed. "
-     "Choose Light / Medium / Strong (1× / 2× / 3× passes)."
  )

- # Load DFN2 once (first run can take a few minutes while the Space installs packages)
- MODEL_DF, DF_STATE, _ = init_df()
-
- def _ensure_2d(x: np.ndarray) -> np.ndarray:
-     """Make shape (samples, channels)."""
-     if x.ndim == 1:
-         x = x[:, None]
-     return x
-
- def _run_single_pass(stereo: np.ndarray) -> np.ndarray:
-     """Run DFN2 per channel; keep same length/channels."""
-     out = np.zeros_like(stereo, dtype=np.float32)
-     for ch in range(stereo.shape[1]):
-         y = enhance(stereo[:, ch].astype(np.float32),
-                     DF_STATE, model=MODEL_DF, atten_lim_db=12.0)
-         out[:len(y), ch] = y[:stereo.shape[0]]
-     return out
-
- def process(file_obj, strength):
-     if file_obj is None:
-         raise gr.Error("Please upload an audio file first.")
-
-     # Load original audio (mono or stereo)
-     audio, sr = sf.read(file_obj.name, always_2d=False)
-     x = _ensure_2d(audio.astype(np.float32))
-
-     # Map UI strength to number of passes
-     passes = {"Light": 1, "Medium": 2, "Strong": 3}[strength]
-
-     y = x.copy()
-     for _ in range(passes):
-         y = _run_single_pass(y)
-
-     # Avoid clipping if multi-pass pushed levels
-     y = np.clip(y, -1.0, 1.0)
-
-     # Gradio wants (sr, np.array). If mono, squeeze back to 1D
-     return (sr, audio), (sr, y.squeeze())
-
- THEME = gr.themes.Soft(primary_hue="cyan", neutral_hue="slate").set(
-     body_background_fill="#0b1020",
-     body_text_color="#e6ecff",
-     block_background_fill="#121830",
-     block_border_color="#243154",
-     button_primary_background_fill="#3dd6ff",
-     button_primary_text_color="#001018",
-     input_background_fill="#0e1530",
-     input_border_color="#243154",
- )

- with gr.Blocks(title=APP_TITLE, theme=THEME) as demo:
-     gr.Markdown(f"## {APP_TITLE}\n{APP_DESC}")
      with gr.Row():
-         file = gr.File(label="Upload audio", file_types=["audio"])
-         strength = gr.Radio(["Light","Medium","Strong"], value="Medium",
-                             label="Noise reduction strength")
-     run = gr.Button("Process", variant="primary")

      with gr.Row():
-         a_orig = gr.Audio(label="Original (A)", interactive=False)
-         a_proc = gr.Audio(label="Processed (B)", interactive=False)

-     run.click(process, inputs=[file, strength], outputs=[a_orig, a_proc])

- if __name__ == "__main__":
-     demo.launch()
  import gradio as gr
  import numpy as np
+ import torch
+ import torchaudio
+ from speechbrain.pretrained import SpectralMaskEnhancement
+
+ # Download once and cache in the Space
+ ENHANCER = SpectralMaskEnhancement.from_hparams(
+     source="speechbrain/metricgan-plus-voicebank",
+     savedir="pretrained/metricgan-plus-voicebank",
  )

+ TARGET_SR = 16000  # model sample rate
+
+ def _to_tensor(mono_np: np.ndarray) -> torch.Tensor:
+     # Ensure float32 in [-1, 1]
+     t = torch.from_numpy(mono_np.astype(np.float32))
+     peak = t.abs().max().clamp(min=1e-8)
+     return t / peak
+
+ def _enhance_channel(wav_np: np.ndarray, in_sr: int, mix: float) -> np.ndarray:
+     """Enhance one channel and apply a wet/dry mix."""
+     x = _to_tensor(wav_np)
+     if in_sr != TARGET_SR:
+         x16 = torchaudio.functional.resample(x, in_sr, TARGET_SR)
+     else:
+         x16 = x
+
+     with torch.no_grad():
+         # enhance_batch expects a [B, T] batch and per-clip relative lengths
+         est16 = ENHANCER.enhance_batch(x16.unsqueeze(0), lengths=torch.tensor([1.0]))[0]
+
+     # Back to the original sample rate
+     if in_sr != TARGET_SR:
+         est = torchaudio.functional.resample(est16, TARGET_SR, in_sr)
+     else:
+         est = est16
+
+     # Trim/pad to the original length
+     n = x.shape[0]
+     if est.shape[0] >= n:
+         est = est[:n]
+     else:
+         est = torch.nn.functional.pad(est, (0, n - est.shape[0]))
+
+     y = (1.0 - mix) * x + mix * est
+     return y.cpu().numpy()
+
+ def denoise(audio, strength):
+     """
+     Gradio passes (sr, numpy) when type='numpy'.
+     numpy is shape [T] (mono) or [T, 2] (stereo).
+     We process mono or true stereo.
+     """
+     if audio is None:
+         return None, None
+
+     sr, data = audio
+     if data.ndim == 1:  # mono
+         chs = [data]
+     else:  # stereo (T, 2)
+         chs = [data[:, 0], data[:, 1]]
+
+     mix_map = {"Light": 0.5, "Medium": 0.75, "Strong": 1.0}
+     mix = mix_map.get(strength, 0.75)
+
+     out_chs = [_enhance_channel(c, sr, mix) for c in chs]
+
+     if len(out_chs) == 2:
+         processed = np.stack(out_chs, axis=1)  # (T, 2)
+     else:
+         processed = out_chs[0]
+     original = data
+
+     # Return both so users can A/B
+     return (sr, original), (sr, processed)
+
+ # -------- UI --------
+ with gr.Blocks(css="footer {visibility: hidden}") as demo:
+     gr.Markdown("## Zack’s Audio Outpost — AI Noise Reducer\nUpload a file and compare **Original** vs **Processed**.")

      with gr.Row():
+         audio_in = gr.Audio(type="numpy", label="Upload Audio")
+         strength = gr.Radio(["Light", "Medium", "Strong"], value="Medium",
+                             label="Noise Reduction Strength")
+
+     run = gr.Button("Run Noise Reduction", variant="primary")

      with gr.Row():
+         out_orig = gr.Audio(label="Original Audio")
+         out_proc = gr.Audio(label="Processed Audio")

+     run.click(denoise, inputs=[audio_in, strength], outputs=[out_orig, out_proc])

+ demo.launch()
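
For anyone reviewing the change, the new enhancement path can be exercised outside Gradio. The snippet below is a minimal sketch, not part of the commit: it assumes speechbrain and torch are installed, and the one-second random-noise clip (and the enhancer/noisy names) are illustrative only.

import torch
from speechbrain.pretrained import SpectralMaskEnhancement

# Same pretrained model the app loads; cached locally after the first download.
enhancer = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained/metricgan-plus-voicebank",
)

sr = 16000                        # the model's native rate, so no resampling is needed
noisy = 0.1 * torch.randn(1, sr)  # [B, T]: a batch holding one synthetic 1-second clip

with torch.no_grad():
    # lengths gives each clip's relative length in (0, 1]; 1.0 means "use the whole clip"
    clean = enhancer.enhance_batch(noisy, lengths=torch.tensor([1.0]))

# clean is [1, T']; T' can differ slightly from T after the model's STFT/ISTFT
# round trip, which is why _enhance_channel trims or pads back to the input length.
print(noisy.shape, clean.shape)

The wet/dry mix in _enhance_channel then blends this estimate with the input, so "Light" (mix 0.5) keeps half of the original signal rather than asking the model itself to denoise less aggressively.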