File size: 6,277 Bytes
2ee63b1 0af1c3f fbb28c2 276b212 2ee63b1 56349a1 276b212 fbb28c2 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 fbb28c2 276b212 fbb28c2 2ee63b1 276b212 fbb28c2 276b212 2ee63b1 276b212 2ee63b1 fbb28c2 2ee63b1 fbb28c2 276b212 2ee63b1 276b212 2ee63b1 276b212 1430d22 fbb28c2 276b212 2ee63b1 1430d22 2ee63b1 276b212 2ee63b1 1430d22 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 2ee63b1 276b212 fbb28c2 276b212 fbb28c2 2ee63b1 276b212 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
import os
from typing import Tuple
import numpy as np
import torch
import torchaudio
import gradio as gr
from speechbrain.pretrained import SpectralMaskEnhancement
# ----------------------------
# Constants / Globals
# ----------------------------
# App title: passed to gr.Blocks(title=...) and rendered in the header markdown.
TITLE: str = "Zack's Audio Outpost — Voice Denoiser"

# Markdown blurb rendered directly under the title.
DESCRIPTION: str = """
Upload a short audio clip with speech (mono or stereo).
Choose **Light**, **Medium**, or **Strong** reduction and compare **Original vs Processed**.
"""

# Sample rate (Hz) audio is resampled to before enhancement;
# MetricGAN+ VoiceBank expects 16 kHz.
TARGET_SR: int = 16_000

# Fraction of enhanced ("wet") signal in the wet/dry mix, keyed by the
# "Reduction Strength" radio choice.
MIX_BY_STRENGTH: dict = {
    "Light": 0.35,
    "Medium": 0.65,
    "Strong": 0.9,
}

# Lazily created model instance; stays None until the first get_enhancer() call.
_enhancer = None
# ----------------------------
# Model Loader (cached)
# ----------------------------
def get_enhancer() -> SpectralMaskEnhancement:
    """
    Return the shared SpeechBrain MetricGAN+ VoiceBank enhancer.

    The model is created on the first call (pinned to CPU through
    ``run_opts``) and stored in the module-level ``_enhancer`` so that
    later calls reuse the same instance.
    """
    global _enhancer
    if _enhancer is not None:
        return _enhancer

    load_kwargs = {
        "source": "speechbrain/metricgan-plus-voicebank",
        "savedir": "pretrained_models/metricgan-plus-voicebank",
        "run_opts": {"device": "cpu"},
    }
    _enhancer = SpectralMaskEnhancement.from_hparams(**load_kwargs)
    # Inference only: switch the wrapped modules to eval mode.
    _enhancer.mods.eval()
    return _enhancer
# ----------------------------
# Audio utilities
# ----------------------------
def to_mono(x: np.ndarray) -> np.ndarray:
    """
    Collapse audio to a 1-D mono float32 signal in [-1, 1].

    Accepts:
    (time,) -> mono
    (time, channels) -> last dim is channels
    (channels, time) -> first dim is channels

    Integer PCM is scaled by the full-scale value of its own dtype
    (int16 -> / 32768, int32 -> / 2**31, uint8 -> centred on 128) and the
    scaling happens *before* channels are averaged. Averaging first would
    promote the samples to float64, skip the integer scaling, and leave
    stereo int16 input hard-clipped to +/-1.

    Args:
        x: audio samples, 1-D or 2-D, integer or floating point.

    Returns:
        A new 1-D float32 array clipped to [-1, 1] with NaN/Inf replaced
        by 0. The input array is never modified.

    Raises:
        ValueError: if ``x`` is not 1-D or 2-D.
    """
    x = np.asarray(x)
    if x.ndim not in (1, 2):
        raise ValueError(f"Unsupported audio shape {x.shape} (need 1D or 2D).")

    # Scale integer PCM to [-1, 1) first, while the dtype is still known.
    if np.issubdtype(x.dtype, np.integer):
        info = np.iinfo(x.dtype)
        half_range = (float(info.max) - float(info.min) + 1.0) / 2.0
        midpoint = float(info.min) + half_range  # 0 for signed, e.g. 128 for uint8
        sig = (x.astype(np.float64) - midpoint) / half_range
    else:
        sig = x

    # Downmix to one channel; single-channel 2-D input is indexed so the
    # result is always 1-D.
    if sig.ndim == 2:
        rows, cols = sig.shape
        if cols in (1, 2) and rows >= cols:
            # Last dim is 1 or 2: treat as (time, ch)
            sig = sig[:, 0] if cols == 1 else sig.mean(axis=1)
        elif rows in (1, 2) and cols > rows:
            # First dim is 1 or 2: treat as (ch, time)
            sig = sig[0] if rows == 1 else sig.mean(axis=0)
        else:
            # Fallback: assume (time, ch)
            sig = sig.mean(axis=1)

    y = sig.astype(np.float32, copy=False)
    # Remove NaNs/Infs and hard-clip
    y = np.nan_to_num(y, nan=0.0, posinf=0.0, neginf=0.0)
    return np.clip(y, -1.0, 1.0)
def resample_to_16k_mono(x: np.ndarray, sr: int) -> torch.Tensor:
    """
    Downmix ``x`` with ``to_mono`` and resample it from ``sr`` to ``TARGET_SR``.

    Returns a float32 CPU tensor with a leading batch axis added, i.e.
    shape (1, time). Resampling is skipped when ``sr`` already equals
    ``TARGET_SR``.
    """
    signal = torch.from_numpy(to_mono(x)).to(torch.float32)  # (time,)
    if sr == TARGET_SR:
        return signal.unsqueeze(0)  # (1, time)
    resampled = torchaudio.functional.resample(
        signal, orig_freq=sr, new_freq=TARGET_SR
    )
    return resampled.unsqueeze(0)  # (1, time)
# ----------------------------
# Denoise core
# ----------------------------
@torch.no_grad()
def denoise_numpy(
    audio: Tuple[int, np.ndarray], strength: str
) -> Tuple[Tuple[int, np.ndarray], Tuple[int, np.ndarray]]:
    """
    Gradio callback: denoise a clip and return it next to the mono original.

    Args:
        audio: ``(sr, samples)`` from the numpy-typed audio input, where
            samples are mono or stereo; ``None`` when nothing was provided.
        strength: key of ``MIX_BY_STRENGTH``; unknown values fall back to
            the "Medium" mix.

    Returns:
        ``((sr_in, mono_orig), (TARGET_SR, mono_processed))``, or
        ``(None, None)`` when there is no audio to process.
    """
    if audio is None:
        return None, None
    in_sr, in_wav = audio
    if in_wav is None or in_wav.size == 0:
        return None, None

    # Downmix once and reuse the result for both outputs; to_mono() is
    # idempotent, so handing the mono signal to the resampler is equivalent
    # to handing it the raw input.
    mono = to_mono(in_wav)
    original = (in_sr, mono)
    wav16 = resample_to_16k_mono(mono, in_sr)  # (1, time)

    # If effectively silent, skip processing. Checked before get_enhancer()
    # so a silent clip never triggers the model load.
    if wav16.abs().mean().item() < 1e-6:
        return original, (TARGET_SR, wav16.squeeze(0).cpu().numpy())

    # Load model and move the input onto its device
    enhancer = get_enhancer()
    device = next(enhancer.mods.parameters()).device
    wav16 = wav16.to(device)
    # Relative length 1.0: the single batch item is entirely valid samples.
    lengths = torch.tensor([1.0], dtype=torch.float32, device=device)

    enhanced = enhancer.enhance_batch(wav16, lengths=lengths).squeeze(0)  # (time,)
    dry = wav16.squeeze(0)

    # Guard the mix against a sample-count mismatch between the model
    # output and the input by trimming both to the shorter length.
    n = min(dry.shape[-1], enhanced.shape[-1])
    dry, enhanced = dry[..., :n], enhanced[..., :n]

    # Wet/Dry mix by strength
    mix = MIX_BY_STRENGTH.get(strength, MIX_BY_STRENGTH["Medium"])
    out = dry * (1.0 - mix) + enhanced * mix

    # Clean up output
    out = torch.nan_to_num(out, nan=0.0, posinf=0.0, neginf=0.0)
    y = torch.clamp(out, -1.0, 1.0).cpu().numpy().astype(np.float32)
    return original, (TARGET_SR, y)
# ----------------------------
# Gradio UI
# ----------------------------
# Custom stylesheet handed to gr.Blocks(css=...). The #title and #footer
# selectors match the ids of the <div>s emitted by the Markdown blocks below.
CSS = """
:root { --brand: #4b6bfb; } /* tweak to your brand color */
.gradio-container { font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif; }
#title { font-weight: 800; font-size: 1.4rem; }
#footer { opacity: 0.75; font-size: 0.85rem; }
button.primary { background: var(--brand) !important; }
"""
# Build the page. Components are created in display order, so statement
# order here is the layout.
# NOTE(review): the nesting below (inputs in the first Row, button on its own
# line, outputs in the second Row) is assumed — confirm against the deployed app.
with gr.Blocks(css=CSS, title=TITLE, fill_height=True) as demo:
    gr.Markdown(f"<div id='title'>{TITLE}</div>")
    gr.Markdown(DESCRIPTION)
    # Inputs: the clip to clean and how aggressively to clean it.
    with gr.Row():
        audio_in = gr.Audio(
            label="Upload or record (mono or stereo)",
            sources=["upload", "microphone"],
            type="numpy", # (sr, np.ndarray)
            waveform_options={"show_controls": True}, # keep simple transport
        )
        strength = gr.Radio(
            choices=["Light", "Medium", "Strong"],
            value="Medium",
            label="Reduction Strength",
        )
    run_btn = gr.Button("Process", variant="primary")
    # Outputs: read-only players for the A/B comparison.
    with gr.Row():
        orig_out = gr.Audio(label="Original (mono)", interactive=False)
        proc_out = gr.Audio(label="Processed (16 kHz mono)", interactive=False)
    gr.Markdown(
        "<div id='footer'>Tip: Try Medium first. Strong may sound more 'processed' but removes more traffic/hiss.</div>"
    )
    # Wire the button: denoise_numpy receives (audio_in, strength) and its
    # two return values fill (orig_out, proc_out) in that order.
    run_btn.click(
        fn=denoise_numpy,
        inputs=[audio_in, strength],
        outputs=[orig_out, proc_out],
        scroll_to_output=True,
        show_progress=True,
    )
if __name__ == "__main__":
    # On Hugging Face Spaces the host/port are set for you; the values
    # below are what a direct script launch binds to.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": True,
    }
    demo.launch(**launch_options)
|