ZDingman committed (verified)
Commit: 1430d22
Parent(s): 2ee63b1

Update app.py

Files changed (1):
  1. app.py +16 -24
app.py CHANGED
@@ -1,6 +1,6 @@
 # app.py
-# Zack's Audio Outpost — AI Noise Reducer (SpeechBrain MetricGAN)
-# Works on CPU in a Hugging Face Space. No GPU required.
+# Zack's Audio Outpost — AI Noise Reducer (SpeechBrain MetricGAN+)
+# CPU-friendly; provides Light/Medium/Strong wet mix and Original vs Processed.

 import os
 from typing import Tuple
@@ -14,7 +14,7 @@ from speechbrain.pretrained import SpectralMaskEnhancement
 # -----------------------------
 # Config
 # -----------------------------
-TARGET_SR = 16_000 # The SpeechBrain mtl-mimic-voicebank model expects 16 kHz mono
+TARGET_SR = 16_000 # MetricGAN+ expects 16 kHz mono

 # Wet/dry mix by "strength"
 MIX_BY_STRENGTH = {
@@ -23,8 +23,9 @@ MIX_BY_STRENGTH = {
     "Strong": 1.00, # 100% wet
 }

-MODEL_SOURCE = "speechbrain/mtl-mimic-voicebank"
-MODEL_DIR = "pretrained_models/mtl-mimic-voicebank"
+# Correct SpeechBrain model for SpectralMaskEnhancement
+MODEL_SOURCE = "speechbrain/metricgan-plus-voicebank"
+MODEL_DIR = "pretrained_models/metricgan-plus-voicebank"

 # Global enhancer (loaded once)
 _enhancer: SpectralMaskEnhancement | None = None
@@ -34,9 +35,9 @@ def get_enhancer() -> SpectralMaskEnhancement:
     """Lazy-load the SpeechBrain enhancer once."""
     global _enhancer
     if _enhancer is None:
-        # Downloads the small MetricGAN+ checkpoint on first run
         _enhancer = SpectralMaskEnhancement.from_hparams(
-            source=MODEL_SOURCE, savedir=MODEL_DIR
+            source=MODEL_SOURCE,
+            savedir=MODEL_DIR,
         )
         _enhancer.mods.eval()
         torch.set_grad_enabled(False)
@@ -51,13 +52,12 @@ def to_mono(x: np.ndarray) -> np.ndarray:
     Ensure mono. Accepts shapes:
     - (time,) already mono
     - (time, channels) -> average channels
-    - (channels, time) (rare) -> average channels, return (time,)
+    - (channels, time) -> average channels, return (time,)
     Returns float32 -1..1
     """
     if x.ndim == 1:
         y = x
     elif x.ndim == 2:
-        # pick which axis is channels
         if x.shape[0] < x.shape[1]:
             # (channels, time)
             y = x.mean(axis=0)
@@ -95,14 +95,12 @@ def denoise_numpy(audio: Tuple[int, np.ndarray], strength: str) -> Tuple[Tuple[i
     Both as float32 in [-1, 1]
     """
     if audio is None:
-        # Nothing uploaded
         return None, None

     in_sr, in_wav = audio
     if in_wav is None or in_wav.size == 0:
         return None, None

-    # Normalize types just in case
     in_wav = in_wav.astype(np.float32, copy=False)

     # Prepare input for model (mono, 16k)
@@ -113,22 +111,19 @@ def denoise_numpy(audio: Tuple[int, np.ndarray], strength: str) -> Tuple[Tuple[i

     # Enhance
     enhancer = get_enhancer()
-    enhanced = enhancer.enhance_batch(wav16, lengths=lengths) # (1, time)
-    enhanced = enhanced.squeeze(0) # (time,)
+    enhanced = enhancer.enhance_batch(wav16, lengths=lengths).squeeze(0) # (time,)
     dry = wav16.squeeze(0)

     # Wet/dry mix
     mix = MIX_BY_STRENGTH.get(strength, MIX_BY_STRENGTH["Medium"])
-    out = dry * (1.0 - mix) + enhanced * mix # (time,)
+    out = dry * (1.0 - mix) + enhanced * mix

-    # Clamp just in case, then back to numpy
+    # Clamp & back to numpy
     y = torch.clamp(out, -1.0, 1.0).cpu().numpy().astype(np.float32)

-    # For "Original", we return the user’s uploaded audio unmodified
-    # (Gradio prefers (sr, waveform) for type="numpy")
-    original = (in_sr, to_mono(in_wav)) # make sure it plays as mono
+    # Return original (mono copy for consistent playback) + processed @16k
+    original = (in_sr, to_mono(in_wav))
     processed = (TARGET_SR, y)
-
     return original, processed


@@ -136,7 +131,6 @@ def denoise_numpy(audio: Tuple[int, np.ndarray], strength: str) -> Tuple[Tuple[i
 # UI
 # -----------------------------
 CSS = """
-/* simple brand-ish tweaks */
 .gradio-container { max-width: 1100px !important; }
 #title { font-weight: 700; font-size: 1.4rem; margin-bottom: .25rem; }
 #subtitle { opacity: .8; margin-bottom: .75rem; }
@@ -149,9 +143,9 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
     with gr.Row():
         audio_in = gr.Audio(
             sources=["upload"],
-            type="numpy", # returns (sr, np.ndarray)
+            type="numpy", # returns (sr, np.ndarray)
             label="Upload Audio",
-            waveform_options=gr.WaveformOptions(show_controls=True),
+            # show_controls is deprecated; we leave default controls on
         )
         strength = gr.Radio(
             choices=["Light", "Medium", "Strong"],
@@ -167,7 +161,5 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:

     btn.click(denoise_numpy, inputs=[audio_in, strength], outputs=[out_orig, out_proc])

-# Recommended: SSR is fine on Spaces; leave default
 if __name__ == "__main__":
-    # In Spaces this is ignored; locally it runs on http://0.0.0.0:7860
     demo.launch()
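
For anyone sanity-checking the new checkpoint locally, here is a minimal standalone sketch of the same enhancement path outside Gradio. It is not part of the commit: the torchaudio loading/resampling, the file names noisy.wav and denoised.wav, and the 0.8 wet/dry value are illustrative assumptions; only the SpectralMaskEnhancement calls and the speechbrain/metricgan-plus-voicebank source mirror the updated app.py.

# Standalone sketch (illustrative, not part of the commit).
# Assumes torchaudio is installed and a local "noisy.wav" exists.
import torch
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement

TARGET_SR = 16_000  # MetricGAN+ is trained on 16 kHz mono audio

enhancer = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
)

wav, sr = torchaudio.load("noisy.wav")   # (channels, time)
wav = wav.mean(dim=0, keepdim=True)      # downmix to mono -> (1, time)
if sr != TARGET_SR:
    wav = torchaudio.functional.resample(wav, sr, TARGET_SR)

with torch.no_grad():
    # lengths is relative (1.0 = full length) for this single-item batch
    enhanced = enhancer.enhance_batch(wav, lengths=torch.ones(1)).squeeze(0)

mix = 0.8  # illustrative wet/dry blend; the app maps Light/Medium/Strong to its own values
out = wav.squeeze(0) * (1.0 - mix) + enhanced * mix
torchaudio.save("denoised.wav", out.clamp(-1.0, 1.0).unsqueeze(0), TARGET_SR)

If the savedir matches the app's MODEL_DIR, running this once also pre-downloads the checkpoint that the Space would otherwise fetch on the first user request.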