Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
# app.py
|
2 |
-
# Zack's Audio Outpost — AI Noise Reducer (SpeechBrain MetricGAN)
|
3 |
-
#
|
4 |
|
5 |
import os
|
6 |
from typing import Tuple
|
@@ -14,7 +14,7 @@ from speechbrain.pretrained import SpectralMaskEnhancement
|
|
14 |
# -----------------------------
|
15 |
# Config
|
16 |
# -----------------------------
|
17 |
-
TARGET_SR = 16_000 #
|
18 |
|
19 |
# Wet/dry mix by "strength"
|
20 |
MIX_BY_STRENGTH = {
|
@@ -23,8 +23,9 @@ MIX_BY_STRENGTH = {
|
|
23 |
"Strong": 1.00, # 100% wet
|
24 |
}
|
25 |
|
26 |
-
|
27 |
-
|
|
|
28 |
|
29 |
# Global enhancer (loaded once)
|
30 |
_enhancer: SpectralMaskEnhancement | None = None
|
@@ -34,9 +35,9 @@ def get_enhancer() -> SpectralMaskEnhancement:
|
|
34 |
"""Lazy-load the SpeechBrain enhancer once."""
|
35 |
global _enhancer
|
36 |
if _enhancer is None:
|
37 |
-
# Downloads the small MetricGAN+ checkpoint on first run
|
38 |
_enhancer = SpectralMaskEnhancement.from_hparams(
|
39 |
-
source=MODEL_SOURCE,
|
|
|
40 |
)
|
41 |
_enhancer.mods.eval()
|
42 |
torch.set_grad_enabled(False)
|
@@ -51,13 +52,12 @@ def to_mono(x: np.ndarray) -> np.ndarray:
|
|
51 |
Ensure mono. Accepts shapes:
|
52 |
- (time,) already mono
|
53 |
- (time, channels) -> average channels
|
54 |
-
- (channels, time)
|
55 |
Returns float32 -1..1
|
56 |
"""
|
57 |
if x.ndim == 1:
|
58 |
y = x
|
59 |
elif x.ndim == 2:
|
60 |
-
# pick which axis is channels
|
61 |
if x.shape[0] < x.shape[1]:
|
62 |
# (channels, time)
|
63 |
y = x.mean(axis=0)
|
@@ -95,14 +95,12 @@ def denoise_numpy(audio: Tuple[int, np.ndarray], strength: str) -> Tuple[Tuple[i
|
|
95 |
Both as float32 in [-1, 1]
|
96 |
"""
|
97 |
if audio is None:
|
98 |
-
# Nothing uploaded
|
99 |
return None, None
|
100 |
|
101 |
in_sr, in_wav = audio
|
102 |
if in_wav is None or in_wav.size == 0:
|
103 |
return None, None
|
104 |
|
105 |
-
# Normalize types just in case
|
106 |
in_wav = in_wav.astype(np.float32, copy=False)
|
107 |
|
108 |
# Prepare input for model (mono, 16k)
|
@@ -113,22 +111,19 @@ def denoise_numpy(audio: Tuple[int, np.ndarray], strength: str) -> Tuple[Tuple[i
|
|
113 |
|
114 |
# Enhance
|
115 |
enhancer = get_enhancer()
|
116 |
-
enhanced = enhancer.enhance_batch(wav16, lengths=lengths) # (
|
117 |
-
enhanced = enhanced.squeeze(0) # (time,)
|
118 |
dry = wav16.squeeze(0)
|
119 |
|
120 |
# Wet/dry mix
|
121 |
mix = MIX_BY_STRENGTH.get(strength, MIX_BY_STRENGTH["Medium"])
|
122 |
-
out = dry * (1.0 - mix) + enhanced * mix
|
123 |
|
124 |
-
# Clamp
|
125 |
y = torch.clamp(out, -1.0, 1.0).cpu().numpy().astype(np.float32)
|
126 |
|
127 |
-
#
|
128 |
-
|
129 |
-
original = (in_sr, to_mono(in_wav)) # make sure it plays as mono
|
130 |
processed = (TARGET_SR, y)
|
131 |
-
|
132 |
return original, processed
|
133 |
|
134 |
|
@@ -136,7 +131,6 @@ def denoise_numpy(audio: Tuple[int, np.ndarray], strength: str) -> Tuple[Tuple[i
|
|
136 |
# UI
|
137 |
# -----------------------------
|
138 |
CSS = """
|
139 |
-
/* simple brand-ish tweaks */
|
140 |
.gradio-container { max-width: 1100px !important; }
|
141 |
#title { font-weight: 700; font-size: 1.4rem; margin-bottom: .25rem; }
|
142 |
#subtitle { opacity: .8; margin-bottom: .75rem; }
|
@@ -149,9 +143,9 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
|
|
149 |
with gr.Row():
|
150 |
audio_in = gr.Audio(
|
151 |
sources=["upload"],
|
152 |
-
type="numpy",
|
153 |
label="Upload Audio",
|
154 |
-
|
155 |
)
|
156 |
strength = gr.Radio(
|
157 |
choices=["Light", "Medium", "Strong"],
|
@@ -167,7 +161,5 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
|
|
167 |
|
168 |
btn.click(denoise_numpy, inputs=[audio_in, strength], outputs=[out_orig, out_proc])
|
169 |
|
170 |
-
# Recommended: SSR is fine on Spaces; leave default
|
171 |
if __name__ == "__main__":
|
172 |
-
# In Spaces this is ignored; locally it runs on http://0.0.0.0:7860
|
173 |
demo.launch()
|
|
|
1 |
# app.py
|
2 |
+
# Zack's Audio Outpost — AI Noise Reducer (SpeechBrain MetricGAN+)
|
3 |
+
# CPU-friendly; provides Light/Medium/Strong wet mix and Original vs Processed.
|
4 |
|
5 |
import os
|
6 |
from typing import Tuple
|
|
|
14 |
# -----------------------------
|
15 |
# Config
|
16 |
# -----------------------------
|
17 |
+
TARGET_SR = 16_000 # MetricGAN+ expects 16 kHz mono
|
18 |
|
19 |
# Wet/dry mix by "strength"
|
20 |
MIX_BY_STRENGTH = {
|
|
|
23 |
"Strong": 1.00, # 100% wet
|
24 |
}
|
25 |
|
26 |
+
# ✅ Correct SpeechBrain model for SpectralMaskEnhancement
|
27 |
+
MODEL_SOURCE = "speechbrain/metricgan-plus-voicebank"
|
28 |
+
MODEL_DIR = "pretrained_models/metricgan-plus-voicebank"
|
29 |
|
30 |
# Global enhancer (loaded once)
|
31 |
_enhancer: SpectralMaskEnhancement | None = None
|
|
|
35 |
"""Lazy-load the SpeechBrain enhancer once."""
|
36 |
global _enhancer
|
37 |
if _enhancer is None:
|
|
|
38 |
_enhancer = SpectralMaskEnhancement.from_hparams(
|
39 |
+
source=MODEL_SOURCE,
|
40 |
+
savedir=MODEL_DIR,
|
41 |
)
|
42 |
_enhancer.mods.eval()
|
43 |
torch.set_grad_enabled(False)
|
|
|
52 |
Ensure mono. Accepts shapes:
|
53 |
- (time,) already mono
|
54 |
- (time, channels) -> average channels
|
55 |
+
- (channels, time) -> average channels, return (time,)
|
56 |
Returns float32 -1..1
|
57 |
"""
|
58 |
if x.ndim == 1:
|
59 |
y = x
|
60 |
elif x.ndim == 2:
|
|
|
61 |
if x.shape[0] < x.shape[1]:
|
62 |
# (channels, time)
|
63 |
y = x.mean(axis=0)
|
|
|
95 |
Both as float32 in [-1, 1]
|
96 |
"""
|
97 |
if audio is None:
|
|
|
98 |
return None, None
|
99 |
|
100 |
in_sr, in_wav = audio
|
101 |
if in_wav is None or in_wav.size == 0:
|
102 |
return None, None
|
103 |
|
|
|
104 |
in_wav = in_wav.astype(np.float32, copy=False)
|
105 |
|
106 |
# Prepare input for model (mono, 16k)
|
|
|
111 |
|
112 |
# Enhance
|
113 |
enhancer = get_enhancer()
|
114 |
+
enhanced = enhancer.enhance_batch(wav16, lengths=lengths).squeeze(0) # (time,)
|
|
|
115 |
dry = wav16.squeeze(0)
|
116 |
|
117 |
# Wet/dry mix
|
118 |
mix = MIX_BY_STRENGTH.get(strength, MIX_BY_STRENGTH["Medium"])
|
119 |
+
out = dry * (1.0 - mix) + enhanced * mix
|
120 |
|
121 |
+
# Clamp & back to numpy
|
122 |
y = torch.clamp(out, -1.0, 1.0).cpu().numpy().astype(np.float32)
|
123 |
|
124 |
+
# Return original (mono copy for consistent playback) + processed @16k
|
125 |
+
original = (in_sr, to_mono(in_wav))
|
|
|
126 |
processed = (TARGET_SR, y)
|
|
|
127 |
return original, processed
|
128 |
|
129 |
|
|
|
131 |
# UI
|
132 |
# -----------------------------
|
133 |
CSS = """
|
|
|
134 |
.gradio-container { max-width: 1100px !important; }
|
135 |
#title { font-weight: 700; font-size: 1.4rem; margin-bottom: .25rem; }
|
136 |
#subtitle { opacity: .8; margin-bottom: .75rem; }
|
|
|
143 |
with gr.Row():
|
144 |
audio_in = gr.Audio(
|
145 |
sources=["upload"],
|
146 |
+
type="numpy", # returns (sr, np.ndarray)
|
147 |
label="Upload Audio",
|
148 |
+
# show_controls is deprecated; we leave default controls on
|
149 |
)
|
150 |
strength = gr.Radio(
|
151 |
choices=["Light", "Medium", "Strong"],
|
|
|
161 |
|
162 |
btn.click(denoise_numpy, inputs=[audio_in, strength], outputs=[out_orig, out_proc])
|
163 |
|
|
|
164 |
if __name__ == "__main__":
|
|
|
165 |
demo.launch()
|