Update app.py
Browse files
app.py
CHANGED
@@ -1,46 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import torch
|
2 |
import torchaudio
|
3 |
-
|
4 |
|
5 |
-
|
|
|
|
|
|
|
6 |
|
7 |
-
#
|
8 |
MIX_BY_STRENGTH = {
|
9 |
-
"Light": 0.
|
10 |
-
"Medium": 0.75,
|
11 |
-
"Strong": 1.
|
12 |
}
|
13 |
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
if sr != TARGET_SR:
|
20 |
wav = torchaudio.functional.resample(wav, sr, TARGET_SR)
|
21 |
return wav.unsqueeze(0) # (1, time)
|
22 |
|
|
|
|
|
|
|
|
|
23 |
@torch.no_grad()
|
24 |
-
def
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
|
33 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
enhanced = enhancer.enhance_batch(wav16, lengths=lengths) # (1, time)
|
35 |
-
enhanced = enhanced.squeeze(0)
|
36 |
dry = wav16.squeeze(0)
|
37 |
|
38 |
-
# Wet/dry mix
|
39 |
-
mix = MIX_BY_STRENGTH.get(strength,
|
40 |
-
out = dry * (1.0 - mix) + enhanced * mix
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
-
|
43 |
-
y = out.cpu().numpy().astype(np.float32)
|
44 |
|
45 |
-
|
46 |
-
|
|
|
|
|
|
1 |
+
# app.py
|
2 |
+
# Zack's Audio Outpost — AI Noise Reducer (SpeechBrain mtl-mimic-voicebank)
|
3 |
+
# Works on CPU in a Hugging Face Space. No GPU required.
|
4 |
+
|
5 |
+
import os
|
6 |
+
from typing import Tuple
|
7 |
+
|
8 |
+
import gradio as gr
|
9 |
+
import numpy as np
|
10 |
import torch
|
11 |
import torchaudio
|
12 |
+
from speechbrain.pretrained import SpectralMaskEnhancement
|
13 |
|
14 |
+
# -----------------------------
|
15 |
+
# Config
|
16 |
+
# -----------------------------
|
17 |
+
# Sample rate (Hz) that audio is resampled to before it is handed to the
# enhancer; the SpeechBrain mtl-mimic-voicebank model expects 16 kHz mono.
TARGET_SR = 16000

# Fraction of the enhanced ("wet") signal blended into the output for each
# strength label offered in the UI; the remainder is the untouched input.
MIX_BY_STRENGTH = {
    "Light": 0.5,
    "Medium": 0.75,
    "Strong": 1.0,
}

# Passed to from_hparams() as `source` and `savedir` respectively.
MODEL_SOURCE = "speechbrain/mtl-mimic-voicebank"
MODEL_DIR = "pretrained_models/mtl-mimic-voicebank"
|
28 |
+
|
29 |
+
# Shared enhancer instance; stays None until get_enhancer() is first called.
_enhancer: SpectralMaskEnhancement | None = None


def get_enhancer() -> SpectralMaskEnhancement:
    """Return the shared SpeechBrain enhancer, loading it on first use.

    The first call builds the model via ``from_hparams`` using
    ``MODEL_SOURCE`` / ``MODEL_DIR`` and stores it in the module-level
    ``_enhancer``; later calls return that same object.

    Autograd mode is deliberately not changed here. Grad mode is per-thread
    state in PyTorch, so switching it off inside a getter would only affect
    whichever thread happened to trigger the first load and would leak out
    of this function. Inference callers should run under
    ``torch.no_grad()`` instead.

    Returns:
        The loaded ``SpectralMaskEnhancement`` instance with its modules
        switched to eval mode.
    """
    global _enhancer
    if _enhancer is None:
        # Fetches the pretrained checkpoint on the first run.
        loaded = SpectralMaskEnhancement.from_hparams(
            source=MODEL_SOURCE,
            savedir=MODEL_DIR,
        )
        loaded.mods.eval()
        # Publish only once the model is fully configured, so a failure
        # above never leaves a half-initialised instance behind.
        _enhancer = loaded
    return _enhancer
|
44 |
+
|
45 |
+
|
46 |
+
# -----------------------------
|
47 |
+
# Audio helpers
|
48 |
+
# -----------------------------
|
49 |
+
def to_mono(x: np.ndarray) -> np.ndarray:
    """Collapse an audio array to a single float32 channel.

    Accepted layouts:
      - ``(time,)``: already mono.
      - ``(time, channels)``: channels are averaged.
      - ``(channels, time)``: channels are averaged, result is ``(time,)``.

    For 2-D input the shorter axis is taken to be the channel axis; when
    both axes have equal length the array is treated as ``(time, channels)``.

    Integer PCM input is divided by the full-scale value of its dtype so the
    result lies in [-1, 1]. Floating-point input is only cast to float32 and
    is otherwise passed through unscaled.

    Args:
        x: 1-D or 2-D array of audio samples.

    Returns:
        1-D float32 array of length ``time``.

    Raises:
        ValueError: if ``x`` is neither 1-D nor 2-D.
    """
    if x.ndim == 1:
        mono = x
    elif x.ndim == 2:
        # The shorter axis is assumed to hold the channels.
        channel_axis = 0 if x.shape[0] < x.shape[1] else 1
        mono = x.mean(axis=channel_axis)
    else:
        raise ValueError("Unsupported audio shape; expected 1D or 2D ndarray")

    if np.issubdtype(x.dtype, np.integer):
        info = np.iinfo(x.dtype)
        if info.min < 0:
            # Signed PCM: the most negative code maps to exactly -1.0.
            mono = mono / float(-info.min)
        else:
            # Unsigned PCM is offset-binary: re-centre on the midpoint first.
            half = (info.max + 1) / 2.0
            mono = (mono - half) / half

    return mono.astype(np.float32, copy=False)
|
70 |
+
|
71 |
+
|
72 |
+
def resample_to_16k_mono(x: np.ndarray, sr: int) -> torch.Tensor:
    """Turn a numpy waveform into a ``(1, time)`` mono tensor at ``TARGET_SR``.

    The input is first reduced to one float32 channel with ``to_mono`` and
    wrapped as a tensor. It is resampled only when ``sr`` differs from
    ``TARGET_SR``; sample values are not rescaled here.

    Args:
        x: audio samples in any layout ``to_mono`` accepts.
        sr: sample rate of ``x`` in Hz.

    Returns:
        Tensor of shape ``(1, time)`` with a leading batch dimension.
    """
    signal = torch.from_numpy(to_mono(x))  # (time,)
    if sr == TARGET_SR:
        # Already at the target rate: only add the batch dimension.
        return signal.unsqueeze(0)
    resampled = torchaudio.functional.resample(signal, sr, TARGET_SR)
    return resampled.unsqueeze(0)  # (1, time)
|
81 |
|
82 |
+
|
83 |
+
# -----------------------------
|
84 |
+
# Core processing
|
85 |
+
# -----------------------------
|
86 |
def _as_float_audio(wav: np.ndarray) -> np.ndarray:
    """Convert raw samples to float32, rescaling integer PCM into [-1, 1]."""
    if np.issubdtype(wav.dtype, np.integer):
        info = np.iinfo(wav.dtype)
        if info.min < 0:
            # Signed PCM (e.g. int16): divide by |min| so full scale is 1.0.
            return (wav / float(-info.min)).astype(np.float32)
        # Unsigned PCM is offset-binary: re-centre on the midpoint first.
        half = (info.max + 1) / 2.0
        return ((wav - half) / half).astype(np.float32)
    return wav.astype(np.float32, copy=False)


@torch.no_grad()
def denoise_numpy(audio: Tuple[int, np.ndarray], strength: str) -> Tuple[Tuple[int, np.ndarray], Tuple[int, np.ndarray]]:
    """Gradio callback: enhance an uploaded clip and return it next to the original.

    Args:
        audio: ``(sample_rate, waveform)`` as delivered by a numpy-typed
            ``gr.Audio`` input, or ``None`` when nothing was uploaded.
            Integer PCM (Gradio normally hands over int16) is rescaled to
            [-1, 1] before processing; float input is used as-is.
        strength: key into ``MIX_BY_STRENGTH``; unknown values fall back to
            the "Medium" mix.

    Returns:
        ``(original, processed)`` where ``original`` is
        ``(input_sr, mono float32 waveform)`` and ``processed`` is
        ``(TARGET_SR, mono float32 waveform clamped to [-1, 1])``.
        ``(None, None)`` when there is no audio or the waveform is empty.
    """
    if audio is None:
        # Nothing uploaded.
        return None, None

    in_sr, in_wav = audio
    if in_wav is None or in_wav.size == 0:
        return None, None

    # A bare astype(float32) would keep int16 samples at +/-32768, which the
    # final clamp to [-1, 1] would then flatten into clipping. Rescale first.
    in_wav = _as_float_audio(in_wav)

    # Model input: mono, TARGET_SR, shape (1, time).
    wav16 = resample_to_16k_mono(in_wav, in_sr)

    # Relative lengths for a batch of one: the single clip is full length.
    lengths = torch.tensor([1.0])

    enhancer = get_enhancer()
    enhanced = enhancer.enhance_batch(wav16, lengths=lengths).squeeze(0)  # (time,)
    dry = wav16.squeeze(0)

    # Nothing in this file guarantees the enhancer returns exactly the input
    # length, so trim both signals to the common length before mixing.
    n = min(dry.shape[-1], enhanced.shape[-1])
    dry = dry[..., :n]
    enhanced = enhanced[..., :n]

    # Wet/dry mix.
    mix = MIX_BY_STRENGTH.get(strength, MIX_BY_STRENGTH["Medium"])
    out = dry * (1.0 - mix) + enhanced * mix  # (time,)

    # Clamp to the valid float range, then hand back a numpy array.
    y = torch.clamp(out, -1.0, 1.0).cpu().numpy().astype(np.float32)

    # "Original" is the upload at its own sample rate, folded to mono.
    original = (in_sr, to_mono(in_wav))
    processed = (TARGET_SR, y)

    return original, processed
|
133 |
+
|
134 |
+
|
135 |
+
# -----------------------------
|
136 |
+
# UI
|
137 |
+
# -----------------------------
|
138 |
+
# Page-level style overrides handed to gr.Blocks below.
CSS = """
/* simple brand-ish tweaks */
.gradio-container { max-width: 1100px !important; }
#title { font-weight: 700; font-size: 1.4rem; margin-bottom: .25rem; }
#subtitle { opacity: .8; margin-bottom: .75rem; }
"""

with gr.Blocks(theme=gr.themes.Soft(), css=CSS) as demo:
    # Heading and tagline; the element ids match the selectors in CSS.
    gr.HTML('<div id="title">Zack’s Audio Outpost — AI Noise Reducer</div>')
    gr.HTML('<div id="subtitle">Upload a file and compare <b>Original</b> vs <b>Processed</b>.</div>')

    # Inputs: the uploaded clip and the wet/dry strength selector.
    with gr.Row():
        audio_in = gr.Audio(
            label="Upload Audio",
            sources=["upload"],
            type="numpy",  # callback receives (sample_rate, np.ndarray)
            waveform_options=gr.WaveformOptions(show_controls=True),
        )
        strength = gr.Radio(
            label="Noise Reduction Strength",
            choices=["Light", "Medium", "Strong"],
            value="Medium",
        )

    btn = gr.Button("Run Noise Reduction", variant="primary")

    # Outputs: side-by-side players for comparison.
    with gr.Row():
        out_orig = gr.Audio(type="numpy", label="Original")
        out_proc = gr.Audio(type="numpy", label="Processed")

    # Wire the button to the denoiser.
    btn.click(
        denoise_numpy,
        inputs=[audio_in, strength],
        outputs=[out_orig, out_proc],
    )

if __name__ == "__main__":
    # Launch with Gradio's defaults when this file is run as a script.
    demo.launch()
|