Spaces:

ZDingman
/

zacks-audio-outpost-denoiser

Running

File size: 6,277 Bytes

2ee63b1
 
 
 
0af1c3f
fbb28c2
276b212
2ee63b1
56349a1
276b212
 
 
 
 
 
 
 
fbb28c2
276b212
 
2ee63b1
276b212
2ee63b1
 
276b212
 
 
2ee63b1
276b212
 
 
 
2ee63b1
 
 
276b212
 
 
2ee63b1
276b212
2ee63b1
 
 
 
276b212
 
 
2ee63b1
 
276b212
 
 
 
 
 
2ee63b1
 
 
 
276b212
 
 
 
 
 
 
2ee63b1
276b212
2ee63b1
 
276b212
 
 
 
 
 
 
 
 
 
 
 
2ee63b1
 
 
 
276b212
2ee63b1
 
276b212
fbb28c2
276b212
 
 
fbb28c2
 
2ee63b1
276b212
 
 
fbb28c2
276b212
 
 
2ee63b1
276b212
 
 
2ee63b1
 
 
fbb28c2
2ee63b1
 
 
fbb28c2
276b212
 
 
 
 
2ee63b1
276b212
 
 
 
 
2ee63b1
276b212
1430d22
fbb28c2
 
276b212
2ee63b1
1430d22
2ee63b1
276b212
 
2ee63b1
 
1430d22
2ee63b1
 
 
 
276b212
 
 
2ee63b1
276b212
 
 
 
 
2ee63b1
 
276b212
 
 
2ee63b1
 
 
276b212
 
 
 
2ee63b1
 
 
 
276b212
2ee63b1
 
276b212
2ee63b1
 
276b212
 
 
 
 
 
fbb28c2
276b212
 
 
 
 
 
 
fbb28c2
2ee63b1
276b212

import os
from typing import Tuple

import numpy as np
import torch
import torchaudio
import gradio as gr
from speechbrain.pretrained import SpectralMaskEnhancement

# ----------------------------
# Constants / Globals
# ----------------------------
TITLE = "Zack's Audio Outpost — Voice Denoiser"
DESCRIPTION = """
Upload a short audio clip with speech (mono or stereo).  
Choose **Light**, **Medium**, or **Strong** reduction and compare **Original vs Processed**.
"""

TARGET_SR = 16000  # MetricGAN+ VoiceBank expects 16 kHz
MIX_BY_STRENGTH = {"Light": 0.35, "Medium": 0.65, "Strong": 0.9}

_enhancer = None  # lazy-loaded model


# ----------------------------
# Model Loader (cached)
# ----------------------------
def get_enhancer() -> SpectralMaskEnhancement:
    """
    Loads SpeechBrain MetricGAN+ VoiceBank denoiser (once) and caches it.
    Runs on CPU inside Spaces by default.
    """
    global _enhancer
    if _enhancer is None:
        _enhancer = SpectralMaskEnhancement.from_hparams(
            source="speechbrain/metricgan-plus-voicebank",
            savedir="pretrained_models/metricgan-plus-voicebank",
            run_opts={"device": "cpu"},
        )
        # Put underlying nn.Module in eval mode
        _enhancer.mods.eval()
    return _enhancer


# ----------------------------
# Audio utilities
# ----------------------------
def to_mono(x: np.ndarray) -> np.ndarray:
    """
    Normalize shape consistently to mono float32 in [-1, 1].

    Accepts:
      (time,)          -> mono
      (time, channels) -> last dim is channels
      (channels, time) -> first dim is channels
    """
    if x.ndim == 1:
        y = x
    elif x.ndim == 2:
        t, c = x.shape
        # If last dim is 1 or 2, treat as (time, ch)
        if c in (1, 2) and t >= c:
            y = x if c == 1 else x.mean(axis=1)
        # If first dim is 1 or 2, treat as (ch, time)
        elif t in (1, 2) and x.shape[1] > t:
            y = x[0] if t == 1 else x.mean(axis=0)
        else:
            # Fallback: assume (time, ch)
            y = x.mean(axis=1)
    else:
        raise ValueError(f"Unsupported audio shape {x.shape} (need 1D or 2D).")

    # Ensure float32 in [-1, 1], handle int16/24/32 just in case
    if np.issubdtype(y.dtype, np.integer):
        # assume int16-like full scale
        y = y.astype(np.float32) / 32768.0
    else:
        y = y.astype(np.float32, copy=False)

    # Remove NaNs/Infs and hard-clip
    y = np.nan_to_num(y, nan=0.0, posinf=0.0, neginf=0.0)
    return np.clip(y, -1.0, 1.0)


def resample_to_16k_mono(x: np.ndarray, sr: int) -> torch.Tensor:
    """
    Returns torch float32 (1, time) @ 16 kHz mono on CPU.
    """
    mono = to_mono(x)
    wav = torch.from_numpy(mono).to(torch.float32)  # (time,)
    if sr != TARGET_SR:
        wav = torchaudio.functional.resample(
            wav, orig_freq=sr, new_freq=TARGET_SR
        )
    return wav.unsqueeze(0)  # (1, time)


# ----------------------------
# Denoise core
# ----------------------------
@torch.no_grad()
def denoise_numpy(
    audio: Tuple[int, np.ndarray], strength: str
) -> Tuple[Tuple[int, np.ndarray], Tuple[int, np.ndarray]]:
    """
    Gradio callback:
      input:  (sr, np.ndarray) where array is mono or stereo
      output: ((sr_in, mono_orig), (16k, mono_processed))
    """
    if audio is None:
        return None, None

    in_sr, in_wav = audio
    if in_wav is None or in_wav.size == 0:
        return None, None

    # Load model and prepare input
    enhancer = get_enhancer()
    device = next(enhancer.mods.parameters()).device  # CPU
    wav16 = resample_to_16k_mono(in_wav, in_sr).to(device)  # (1, time)
    lengths = torch.tensor([1.0], dtype=torch.float32, device=device)

    # If effectively silent, skip processing
    if wav16.abs().mean().item() < 1e-6:
        original = (in_sr, to_mono(in_wav))
        processed = (TARGET_SR, wav16.squeeze(0).cpu().numpy())
        return original, processed

    # Enhance (MetricGAN+ outputs enhanced waveform directly via mix-mask pipeline)
    enhanced = enhancer.enhance_batch(wav16, lengths=lengths).squeeze(0)  # (time,)
    dry = wav16.squeeze(0)

    # Wet/Dry mix by strength
    mix = MIX_BY_STRENGTH.get(strength, MIX_BY_STRENGTH["Medium"])
    out = dry * (1.0 - mix) + enhanced * mix

    # Clean up output
    out = torch.nan_to_num(out, nan=0.0, posinf=0.0, neginf=0.0)
    y = torch.clamp(out, -1.0, 1.0).cpu().numpy().astype(np.float32)

    original = (in_sr, to_mono(in_wav))
    processed = (TARGET_SR, y)
    return original, processed


# ----------------------------
# Gradio UI
# ----------------------------
CSS = """
:root { --brand: #4b6bfb; } /* tweak to your brand color */
.gradio-container { font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif; }
#title { font-weight: 800; font-size: 1.4rem; }
#footer { opacity: 0.75; font-size: 0.85rem; }
button.primary { background: var(--brand) !important; }
"""

with gr.Blocks(css=CSS, title=TITLE, fill_height=True) as demo:
    gr.Markdown(f"<div id='title'>{TITLE}</div>")
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        audio_in = gr.Audio(
            label="Upload or record (mono or stereo)",
            sources=["upload", "microphone"],
            type="numpy",           # (sr, np.ndarray)
            waveform_options={"show_controls": True},  # keep simple transport
        )
        strength = gr.Radio(
            choices=["Light", "Medium", "Strong"],
            value="Medium",
            label="Reduction Strength",
        )

    run_btn = gr.Button("Process", variant="primary")

    with gr.Row():
        orig_out = gr.Audio(label="Original (mono)", interactive=False)
        proc_out = gr.Audio(label="Processed (16 kHz mono)", interactive=False)

    gr.Markdown(
        "<div id='footer'>Tip: Try Medium first. Strong may sound more 'processed' but removes more traffic/hiss.</div>"
    )

    run_btn.click(
        fn=denoise_numpy,
        inputs=[audio_in, strength],
        outputs=[orig_out, proc_out],
        scroll_to_output=True,
        show_progress=True,
    )

if __name__ == "__main__":
    # On Hugging Face Spaces the host/port are set for you.
    demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=True)