File size: 6,277 Bytes
2ee63b1
 
 
 
0af1c3f
fbb28c2
276b212
2ee63b1
56349a1
276b212
 
 
 
 
 
 
 
fbb28c2
276b212
 
2ee63b1
276b212
2ee63b1
 
276b212
 
 
2ee63b1
276b212
 
 
 
2ee63b1
 
 
276b212
 
 
2ee63b1
276b212
2ee63b1
 
 
 
276b212
 
 
2ee63b1
 
276b212
 
 
 
 
 
2ee63b1
 
 
 
276b212
 
 
 
 
 
 
2ee63b1
276b212
2ee63b1
 
276b212
 
 
 
 
 
 
 
 
 
 
 
2ee63b1
 
 
 
276b212
2ee63b1
 
276b212
fbb28c2
276b212
 
 
fbb28c2
 
2ee63b1
276b212
 
 
fbb28c2
276b212
 
 
2ee63b1
276b212
 
 
2ee63b1
 
 
fbb28c2
2ee63b1
 
 
fbb28c2
276b212
 
 
 
 
2ee63b1
276b212
 
 
 
 
2ee63b1
276b212
1430d22
fbb28c2
 
276b212
2ee63b1
1430d22
2ee63b1
276b212
 
2ee63b1
 
1430d22
2ee63b1
 
 
 
276b212
 
 
2ee63b1
276b212
 
 
 
 
2ee63b1
 
276b212
 
 
2ee63b1
 
 
276b212
 
 
 
2ee63b1
 
 
 
276b212
2ee63b1
 
276b212
2ee63b1
 
276b212
 
 
 
 
 
fbb28c2
276b212
 
 
 
 
 
 
fbb28c2
2ee63b1
276b212
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import os
from typing import Tuple

import numpy as np
import torch
import torchaudio
import gradio as gr
from speechbrain.pretrained import SpectralMaskEnhancement

# ----------------------------
# Constants / Globals
# ----------------------------
# Heading rendered at the top of the page and used as the browser tab title.
TITLE = "Zack's Audio Outpost — Voice Denoiser"

# Markdown blurb shown under the heading. The two spaces that end the first
# sentence are a Markdown hard line break, so they are written out explicitly
# here instead of hiding as trailing whitespace inside a triple-quoted string.
DESCRIPTION = (
    "\n"
    "Upload a short audio clip with speech (mono or stereo).  \n"
    "Choose **Light**, **Medium**, or **Strong** reduction and compare **Original vs Processed**.\n"
)

# Sample rate (Hz) that audio is resampled to before it reaches the model.
TARGET_SR = 16000  # MetricGAN+ VoiceBank expects 16 kHz

# Share of the enhanced ("wet") signal in the final mix for each UI choice;
# the remainder is the untouched ("dry") input.
MIX_BY_STRENGTH = {
    "Light": 0.35,
    "Medium": 0.65,
    "Strong": 0.9,
}

# Module-level cache slot for the denoiser; stays None until first use.
_enhancer = None  # lazy-loaded model


# ----------------------------
# Model Loader (cached)
# ----------------------------
def get_enhancer() -> SpectralMaskEnhancement:
    """
    Return the shared SpeechBrain MetricGAN+ VoiceBank denoiser.

    The model is built on the first call only and stored in the module-level
    ``_enhancer`` slot; every later call hands back that same object.
    It is requested on the CPU through ``run_opts``.
    """
    global _enhancer

    # Fast path: already built by an earlier call.
    if _enhancer is not None:
        return _enhancer

    hub_source = "speechbrain/metricgan-plus-voicebank"
    local_cache_dir = "pretrained_models/metricgan-plus-voicebank"
    _enhancer = SpectralMaskEnhancement.from_hparams(
        source=hub_source,
        savedir=local_cache_dir,
        run_opts={"device": "cpu"},
    )
    # Inference only: switch the wrapped modules to eval mode.
    _enhancer.mods.eval()
    return _enhancer


# ----------------------------
# Audio utilities
# ----------------------------
def to_mono(x: np.ndarray) -> np.ndarray:
    """
    Downmix audio to a 1-D mono float32 array with samples in [-1, 1].

    Accepts:
      (time,)          -> mono
      (time, channels) -> last dim is channels
      (channels, time) -> first dim is channels

    For 2-D input the shorter axis is taken to be the channel axis (a tie is
    read as (time, channels)), so layouts with more than two channels are
    averaged over the right axis as well. The result is always 1-D, including
    for single-channel 2-D input such as (time, 1).

    Integer PCM is rescaled by the full-scale value of its own dtype *before*
    the channels are averaged. Averaging first would promote the samples to
    float, skip the integer rescale, and leave raw PCM magnitudes to be
    hard-clipped to +/-1.

    Args:
        x: 1-D or 2-D array of integer PCM or floating-point samples.

    Returns:
        A 1-D float32 array; NaN/Inf are replaced by 0 and values are clipped
        to [-1, 1].

    Raises:
        ValueError: if ``x`` is neither 1-D nor 2-D.
    """
    x = np.asarray(x)
    if x.ndim not in (1, 2):
        raise ValueError(f"Unsupported audio shape {x.shape} (need 1D or 2D).")

    # Nothing to downmix; also avoids a mean over an empty axis (NaN + warning).
    if x.size == 0:
        return np.zeros(0, dtype=np.float32)

    # Bring integer PCM into [-1, 1) using the range of its actual dtype
    # (int16 -> /32768, int32 -> /2**31, ...), not a fixed int16 scale.
    if np.issubdtype(x.dtype, np.integer):
        info = np.iinfo(x.dtype)
        y = x.astype(np.float32)
        if info.min == 0:
            # Unsigned PCM (e.g. 8-bit WAV) is centred on the type's midpoint.
            half_range = (float(info.max) + 1.0) / 2.0
            y = (y - half_range) / half_range
        else:
            y = y / -float(info.min)
    else:
        y = x

    # Average the channels; the shorter axis is the channel axis.
    if y.ndim == 2:
        channel_axis = 0 if y.shape[0] < y.shape[1] else 1
        y = y.mean(axis=channel_axis)

    y = y.astype(np.float32, copy=False)

    # Remove NaNs/Infs and hard-clip
    y = np.nan_to_num(y, nan=0.0, posinf=0.0, neginf=0.0)
    return np.clip(y, -1.0, 1.0)


def resample_to_16k_mono(x: np.ndarray, sr: int) -> torch.Tensor:
    """
    Downmix ``x`` with ``to_mono``, resample it from ``sr`` to ``TARGET_SR``
    when the rates differ, and return it as a float32 torch tensor with a
    leading batch dimension added.
    """
    samples = torch.from_numpy(to_mono(x)).to(torch.float32)

    # Already at the target rate: only the batch dimension is missing.
    if sr == TARGET_SR:
        return samples.unsqueeze(0)

    resampled = torchaudio.functional.resample(
        samples, orig_freq=sr, new_freq=TARGET_SR
    )
    return resampled.unsqueeze(0)


# ----------------------------
# Denoise core
# ----------------------------
@torch.no_grad()
def denoise_numpy(
    audio: Tuple[int, np.ndarray], strength: str
) -> Tuple[Tuple[int, np.ndarray], Tuple[int, np.ndarray]]:
    """
    Gradio callback that denoises one clip.

    Args:
        audio: ``(sample_rate, samples)`` as delivered by the audio input, or
            ``None`` when nothing was provided.
        strength: key into ``MIX_BY_STRENGTH``; an unknown value falls back to
            the "Medium" mix.

    Returns:
        ``((sr_in, mono_original), (TARGET_SR, mono_processed))``, or
        ``(None, None)`` when there is no audio to work on.
    """
    # Guard clauses: nothing uploaded, or an empty buffer.
    if audio is None:
        return None, None

    in_sr, in_wav = audio
    if in_wav is None or in_wav.size == 0:
        return None, None

    # Model first, then move the prepared input onto the model's device.
    model = get_enhancer()
    device = next(model.mods.parameters()).device
    batch = resample_to_16k_mono(in_wav, in_sr).to(device)
    dry = batch.squeeze(0)

    # Effectively silent input is passed through without running the model.
    if batch.abs().mean().item() < 1e-6:
        return (in_sr, to_mono(in_wav)), (TARGET_SR, dry.cpu().numpy())

    # One entry per batch item; 1.0 presumably marks the clip as full length
    # (SpeechBrain relative-length convention) -- confirm against its docs.
    rel_lengths = torch.tensor([1.0], dtype=torch.float32, device=device)
    wet = model.enhance_batch(batch, lengths=rel_lengths).squeeze(0)

    # Blend enhanced and original signal according to the chosen strength.
    wet_share = MIX_BY_STRENGTH.get(strength, MIX_BY_STRENGTH["Medium"])
    blended = dry * (1.0 - wet_share) + wet * wet_share

    # Sanitise: no NaN/Inf, samples confined to [-1, 1], float32 numpy out.
    blended = torch.nan_to_num(blended, nan=0.0, posinf=0.0, neginf=0.0)
    processed_np = torch.clamp(blended, -1.0, 1.0).cpu().numpy().astype(np.float32)

    return (in_sr, to_mono(in_wav)), (TARGET_SR, processed_np)


# ----------------------------
# Gradio UI
# ----------------------------
# Custom stylesheet passed to gr.Blocks(css=...). It defines a --brand colour
# variable, sets the container font stack, styles the elements with ids
# "title" and "footer", and applies the brand colour to primary buttons.
CSS = """
:root { --brand: #4b6bfb; } /* tweak to your brand color */
.gradio-container { font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif; }
#title { font-weight: 800; font-size: 1.4rem; }
#footer { opacity: 0.75; font-size: 0.85rem; }
button.primary { background: var(--brand) !important; }
"""

# Build the page. Components are laid out in the order they are created inside
# the Blocks / Row contexts, so statement order here is the visual order.
with gr.Blocks(css=CSS, title=TITLE, fill_height=True) as demo:
    # Header: title wrapped in a div the stylesheet targets via #title,
    # followed by the Markdown description.
    gr.Markdown(f"<div id='title'>{TITLE}</div>")
    gr.Markdown(DESCRIPTION)

    # Input row: the clip to clean and the strength selector side by side.
    with gr.Row():
        audio_in = gr.Audio(
            label="Upload or record (mono or stereo)",
            sources=["upload", "microphone"],
            type="numpy",           # (sr, np.ndarray)
            waveform_options={"show_controls": True},  # keep simple transport
        )
        # Choices match the keys of MIX_BY_STRENGTH used by denoise_numpy.
        strength = gr.Radio(
            choices=["Light", "Medium", "Strong"],
            value="Medium",
            label="Reduction Strength",
        )

    run_btn = gr.Button("Process", variant="primary")

    # Output row: read-only players for the A/B comparison.
    with gr.Row():
        orig_out = gr.Audio(label="Original (mono)", interactive=False)
        proc_out = gr.Audio(label="Processed (16 kHz mono)", interactive=False)

    # Footer hint, styled through the #footer rule in CSS.
    gr.Markdown(
        "<div id='footer'>Tip: Try Medium first. Strong may sound more 'processed' but removes more traffic/hiss.</div>"
    )

    # Wire the button: denoise_numpy receives (audio_in, strength) and its two
    # return values fill orig_out and proc_out, in that order.
    run_btn.click(
        fn=denoise_numpy,
        inputs=[audio_in, strength],
        outputs=[orig_out, proc_out],
        scroll_to_output=True,
        show_progress=True,
    )

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    # On Hugging Face Spaces the host/port are set for you.
    # The call below nevertheless passes an explicit host (all interfaces) and
    # port 7860 and requests server-side rendering.
    # NOTE(review): ssr_mode is presumably only accepted by recent Gradio
    # releases -- confirm against the pinned Gradio version.
    demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=True)