ZDingman committed
Commit 2ee63b1 · verified · 1 Parent(s): fbb28c2

Update app.py

Files changed (1)
  1. app.py +154 -27
app.py CHANGED
@@ -1,46 +1,173 @@
  import torch
  import torchaudio
- import numpy as np

- TARGET_SR = 16000  # model expects 16 kHz

- # strength -> wet mix
  MIX_BY_STRENGTH = {
-     "Light": 0.5,
-     "Medium": 0.75,
-     "Strong": 1.0,
  }

- def _to_16k_mono(x: np.ndarray, sr: int) -> torch.Tensor:
-     """x: (time,) or (time, channels) float32 -1..1 -> torch (1, time) @16k"""
-     if x.ndim == 2:  # stereo -> mono average
-         x = x.mean(axis=1)
-     wav = torch.from_numpy(x.astype(np.float32))  # (time,)
      if sr != TARGET_SR:
          wav = torchaudio.functional.resample(wav, sr, TARGET_SR)
      return wav.unsqueeze(0)  # (1, time)

  @torch.no_grad()
- def denoise(audio, strength):
-     # audio comes from gradio as (sr, np.ndarray) or filepath depending on your IO
-     # If you already have (sr, np.ndarray) upstream, keep that. Example below assumes tuple:
-     sr, x = audio  # x shape (time, [channels]) float32 -1..1

-     # to 16k mono
-     wav16 = _to_16k_mono(x, sr)  # (1, time) torch.float32
-     lengths = torch.tensor([1.0])  # full-length (relative) as required

-     # Run SpeechBrain enhancer (already created as `enhancer`)
      enhanced = enhancer.enhance_batch(wav16, lengths=lengths)  # (1, time)
-     enhanced = enhanced.squeeze(0)  # (time,)
      dry = wav16.squeeze(0)

-     # Wet/dry mix per UI strength
-     mix = MIX_BY_STRENGTH.get(strength, 0.75)
-     out = dry * (1.0 - mix) + enhanced * mix

-     # back to numpy @16k
-     y = out.cpu().numpy().astype(np.float32)

-     # Return (sr, waveform) to Gradio (or whatever your interface expects)
-     return (TARGET_SR, y)
+ # app.py
+ # Zack's Audio Outpost — AI Noise Reducer (SpeechBrain MetricGAN)
+ # Works on CPU in a Hugging Face Space. No GPU required.
+
+ import os
+ from typing import Tuple
+
+ import gradio as gr
+ import numpy as np
  import torch
  import torchaudio
+ from speechbrain.pretrained import SpectralMaskEnhancement

+ # -----------------------------
+ # Config
+ # -----------------------------
+ TARGET_SR = 16_000  # the SpeechBrain MetricGAN+ model expects 16 kHz mono

+ # Wet/dry mix by "strength"
  MIX_BY_STRENGTH = {
+     "Light": 0.50,   # 50% wet
+     "Medium": 0.75,  # 75% wet
+     "Strong": 1.00,  # 100% wet
  }

+ MODEL_SOURCE = "speechbrain/metricgan-plus-voicebank"
+ MODEL_DIR = "pretrained_models/metricgan-plus-voicebank"
+
+ # Global enhancer (loaded once)
+ _enhancer: SpectralMaskEnhancement | None = None
+
+
+ def get_enhancer() -> SpectralMaskEnhancement:
+     """Lazy-load the SpeechBrain enhancer once."""
+     global _enhancer
+     if _enhancer is None:
+         # Downloads the small MetricGAN+ checkpoint on first run
+         _enhancer = SpectralMaskEnhancement.from_hparams(
+             source=MODEL_SOURCE, savedir=MODEL_DIR
+         )
+         _enhancer.mods.eval()
+         torch.set_grad_enabled(False)
+     return _enhancer
+
+
+ # -----------------------------
+ # Audio helpers
+ # -----------------------------
+ def to_mono(x: np.ndarray) -> np.ndarray:
+     """
+     Ensure mono. Accepts shapes:
+       - (time,) already mono
+       - (time, channels) -> average channels
+       - (channels, time) (rare) -> average channels, return (time,)
+     Returns float32 -1..1
+     """
+     if x.ndim == 1:
+         y = x
+     elif x.ndim == 2:
+         # pick which axis is channels
+         if x.shape[0] < x.shape[1]:
+             # (channels, time)
+             y = x.mean(axis=0)
+         else:
+             # (time, channels)
+             y = x.mean(axis=1)
+     else:
+         raise ValueError("Unsupported audio shape; expected 1D or 2D ndarray")
+     return y.astype(np.float32, copy=False)
+
+
+ def resample_to_16k_mono(x: np.ndarray, sr: int) -> torch.Tensor:
+     """
+     Numpy -> torch (1, time) @ 16 kHz mono, float32 in [-1, 1]
+     """
+     mono = to_mono(x)
+     wav = torch.from_numpy(mono)  # (time,)
      if sr != TARGET_SR:
          wav = torchaudio.functional.resample(wav, sr, TARGET_SR)
      return wav.unsqueeze(0)  # (1, time)

+
+ # -----------------------------
+ # Core processing
+ # -----------------------------
  @torch.no_grad()
+ def denoise_numpy(audio: Tuple[int, np.ndarray], strength: str) -> Tuple[Tuple[int, np.ndarray], Tuple[int, np.ndarray]]:
+     """
+     Gradio callback.
+     Input:
+       audio: (sr, numpy waveform)
+       strength: "Light" | "Medium" | "Strong"
+     Output:
+       (original_sr, original_wav), (TARGET_SR, processed_wav)
+     Both as float32 in [-1, 1]
+     """
+     if audio is None:
+         # Nothing uploaded
+         return None, None

+     in_sr, in_wav = audio
+     if in_wav is None or in_wav.size == 0:
+         return None, None

+     # Gradio's type="numpy" usually delivers int16 samples; scale to float32 in [-1, 1]
+     in_wav = in_wav.astype(np.float32) / (np.iinfo(in_wav.dtype).max if np.issubdtype(in_wav.dtype, np.integer) else 1.0)
+
+     # Prepare input for model (mono, 16k)
+     wav16 = resample_to_16k_mono(in_wav, in_sr)  # torch (1, time)
+
+     # SpeechBrain expects relative lengths tensor (batch-size == 1)
+     lengths = torch.tensor([1.0])
+
+     # Enhance
+     enhancer = get_enhancer()
      enhanced = enhancer.enhance_batch(wav16, lengths=lengths)  # (1, time)
+     enhanced = enhanced.squeeze(0)  # (time,)
      dry = wav16.squeeze(0)

+     # Wet/dry mix
+     mix = MIX_BY_STRENGTH.get(strength, MIX_BY_STRENGTH["Medium"])
+     out = dry * (1.0 - mix) + enhanced * mix  # (time,)
+
+     # Clamp just in case, then back to numpy
+     y = torch.clamp(out, -1.0, 1.0).cpu().numpy().astype(np.float32)
+
+     # For "Original", we return the user’s uploaded audio unmodified
+     # (Gradio prefers (sr, waveform) for type="numpy")
+     original = (in_sr, to_mono(in_wav))  # make sure it plays as mono
+     processed = (TARGET_SR, y)
+
+     return original, processed
+
+
+ # -----------------------------
+ # UI
+ # -----------------------------
+ CSS = """
+ /* simple brand-ish tweaks */
+ .gradio-container { max-width: 1100px !important; }
+ #title { font-weight: 700; font-size: 1.4rem; margin-bottom: .25rem; }
+ #subtitle { opacity: .8; margin-bottom: .75rem; }
+ """
+
+ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
+     gr.HTML('<div id="title">Zack’s Audio Outpost — AI Noise Reducer</div>')
+     gr.HTML('<div id="subtitle">Upload a file and compare <b>Original</b> vs <b>Processed</b>.</div>')
+
+     with gr.Row():
+         audio_in = gr.Audio(
+             sources=["upload"],
+             type="numpy",  # returns (sr, np.ndarray)
+             label="Upload Audio",
+             waveform_options=gr.WaveformOptions(show_controls=True),
+         )
+         strength = gr.Radio(
+             choices=["Light", "Medium", "Strong"],
+             value="Medium",
+             label="Noise Reduction Strength",
+         )
+
+     btn = gr.Button("Run Noise Reduction", variant="primary")
+
+     with gr.Row():
+         out_orig = gr.Audio(type="numpy", label="Original")
+         out_proc = gr.Audio(type="numpy", label="Processed")

+     btn.click(denoise_numpy, inputs=[audio_in, strength], outputs=[out_orig, out_proc])

+ # Recommended: SSR is fine on Spaces; leave default
+ if __name__ == "__main__":
+     # In Spaces this is ignored; locally it runs on http://0.0.0.0:7860
+     demo.launch()
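
A quick way to sanity-check the new denoise_numpy path without launching the UI is a small local script. This is a minimal sketch, not part of the commit: it assumes the Space's dependencies (torch, torchaudio, speechbrain, numpy, gradio) are installed and that app.py is in the working directory; the file name smoke_test.py is made up for illustration.

# smoke_test.py (hypothetical name): exercise denoise_numpy without the UI.
import numpy as np

from app import TARGET_SR, denoise_numpy  # importing app builds the Blocks UI but does not launch it

# One second of a 440 Hz tone plus white noise at 44.1 kHz, float32 in [-1, 1],
# so both the dtype-normalization and resampling paths get exercised.
sr = 44_100
t = np.linspace(0.0, 1.0, sr, endpoint=False, dtype=np.float32)
noisy = 0.5 * np.sin(2.0 * np.pi * 440.0 * t) + 0.1 * np.random.randn(sr).astype(np.float32)
noisy = np.clip(noisy, -1.0, 1.0)

# The first call downloads the enhancement checkpoint via get_enhancer()
original, processed = denoise_numpy((sr, noisy), "Medium")
out_sr, out_wav = processed
assert out_sr == TARGET_SR  # processed audio comes back at 16 kHz
print(out_sr, out_wav.shape, float(np.abs(out_wav).max()))  # expect 16000, (~16000,), peak <= 1.0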