ZDingman committed on
Commit
683fcfc
·
verified ·
1 Parent(s): f7b8d3f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -69
app.py CHANGED
@@ -1,116 +1,92 @@
1
- import os
2
  import numpy as np
3
  import gradio as gr
4
  import soundfile as sf
 
5
  import torch
6
 
7
# Prefer torchaudio for resampling; when it cannot be imported, pull in
# SciPy's polyphase resampler instead and record that choice in the flag.
try:
    import torchaudio
    import torchaudio.functional as AF
except Exception:
    from scipy.signal import resample_poly
    USE_TORCHAUDIO = False
else:
    USE_TORCHAUDIO = True
15
-
16
- # SpeechBrain MetricGAN+ enhancement (CPU)
17
- from speechbrain.pretrained import SpectralMaskEnhancement
18
-
19
# Runtime configuration: single torch thread, CPU inference.
torch.set_num_threads(1)
TARGET_SR = 16000  # MetricGAN+ expects 16 kHz
DEVICE = "cpu"
MODEL_ID = "speechbrain/metricgan-plus-voicebank"

# Build the enhancer a single time, when the module is imported.
ENHANCER = SpectralMaskEnhancement.from_hparams(
    run_opts={"device": DEVICE},
    savedir="pretrained_metricganp",
    source=MODEL_ID,
)
 
 
 
 
 
 
 
 
 
31
 
32
  def _to_mono(x: np.ndarray) -> np.ndarray:
33
- # x shape: (samples,) or (samples, channels)
34
  if x.ndim == 2 and x.shape[1] > 1:
35
- return np.mean(x, axis=1, dtype=np.float32)
36
- return x.astype(np.float32, copy=False)
 
37
 
38
  def _resample(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
39
  if sr_in == sr_out:
40
- return x
41
- if USE_TORCHAUDIO:
42
- with torch.no_grad():
43
- t = torch.from_numpy(x).unsqueeze(0) # (1, time)
44
- y = AF.resample(t, orig_freq=sr_in, new_freq=sr_out)
45
- return y.squeeze(0).cpu().numpy().astype(np.float32)
46
- # SciPy fall-back
47
  g = np.gcd(sr_in, sr_out)
48
  up, down = sr_out // g, sr_in // g
49
  y = resample_poly(x, up, down).astype(np.float32)
50
  return y
51
 
52
  def _mix(dry: np.ndarray, wet: np.ndarray, strength: str) -> np.ndarray:
53
- # Light / Medium / Strong → wet mix amounts
54
  mix = {"Light": 0.4, "Medium": 0.7, "Strong": 1.0}.get(strength, 0.7)
55
- # pad/truncate to the same length
56
  n = min(len(dry), len(wet))
57
- out = dry[:n] * (1.0 - mix) + wet[:n] * mix
58
- return out
59
 
60
def denoise(audio: tuple, strength: str):
    """Run MetricGAN+ enhancement on an uploaded clip and blend it with the original.

    Args:
        audio: ``(sr, samples)`` as passed by Gradio with ``type='numpy'``;
            samples are int16/float32 shaped ``(n,)`` or ``(n, ch)``.
        strength: "Light", "Medium" or "Strong"; forwarded to ``_mix``.

    Returns:
        ``(sr, samples)`` where samples are mono float32 at the input rate.

    Raises:
        gr.Error: If no audio was supplied or it contains no samples.
    """
    if audio is None:
        raise gr.Error("Please upload an audio file.")

    sr, data = audio
    data = np.asarray(data)  # also covers a plain list
    if data.size == 0:
        raise gr.Error("Please upload an audio file.")

    # Integer PCM must be scaled into [-1, 1] before the clip below;
    # clipping raw int16 values would flatten the whole signal to +/-1.
    if np.issubdtype(data.dtype, np.integer):
        info = np.iinfo(data.dtype)
        full_scale = float(info.max) + 1.0
        data = data.astype(np.float64)
        if info.min < 0:
            data = data / full_scale
        else:
            # Unsigned PCM is centred on mid-scale rather than on zero.
            data = (data - full_scale / 2.0) / (full_scale / 2.0)

    # To mono, float32 in [-1, 1]; ravel guards against a leftover (n, 1) shape.
    x_mono = np.ravel(_to_mono(data))
    x_mono = np.clip(x_mono, -1.0, 1.0).astype(np.float32)

    # Resample to 16 kHz for the model
    x_16k = _resample(x_mono, sr_in=sr, sr_out=TARGET_SR)

    # Enhance with MetricGAN+
    with torch.no_grad():
        inp = torch.from_numpy(x_16k).unsqueeze(0)  # (batch, time)
        # The second parameter of enhance_batch is `lengths` (relative length
        # of each batch item), not a sample rate; one full-length item here.
        enhanced = ENHANCER.enhance_batch(inp, lengths=torch.tensor([1.0]))
    if isinstance(enhanced, torch.Tensor):
        enhanced = enhanced.squeeze(0).cpu().numpy().astype(np.float32)

    # Back to original sample rate
    enhanced_sr = _resample(enhanced, sr_in=TARGET_SR, sr_out=sr)

    # Mix according to strength (preserve dry transients)
    y = _mix(dry=x_mono, wet=enhanced_sr, strength=strength)

    # Return as mono track at original sr
    return (sr, y.astype(np.float32))
94
 
 
 
 
95
 
96
# ---------- UI ----------
# NOTE(review): the block nesting below was reconstructed from a flattened
# diff view; confirm which components belong inside each Row.
def run(uploaded, level):
    """Button handler: return the untouched upload and its denoised version."""
    if uploaded is None:
        raise gr.Error("Please upload an audio file.")
    rate, samples = uploaded
    cleaned = denoise((rate, samples), level)
    return (rate, samples), cleaned


with gr.Blocks(theme=gr.themes.Soft(), css="footer {visibility:hidden}") as demo:
    gr.Markdown("### Zack’s Audio Outpost — AI Noise Reducer\nUpload a file and compare **Original vs Processed**.")
    with gr.Row():
        audio_in = gr.Audio(type="numpy", label="Upload Audio")
        strength = gr.Radio(["Light", "Medium", "Strong"], value="Medium", label="Noise Reduction Strength")
    run_btn = gr.Button("Run Noise Reduction", variant="primary")
    with gr.Row():
        orig = gr.Audio(label="Original")
        clean = gr.Audio(label="Processed")

    # Inputs map positionally onto run(); outputs onto its two return values.
    run_btn.click(run, [audio_in, strength], [orig, clean])

demo.launch()
 
 
1
  import numpy as np
2
  import gradio as gr
3
  import soundfile as sf
4
+ from scipy.signal import resample_poly
5
  import torch
6
 
7
# Runtime configuration: single torch thread, CPU inference.
torch.set_num_threads(1)
DEVICE = "cpu"
TARGET_SR = 16000  # MetricGAN+ expects 16 kHz

# The enhancer starts out unset and is imported lazily, so that the
# import cannot fail at build time.
ENHANCER = None
 
 
 
 
 
 
 
12
 
13
def get_enhancer():
    """Return the shared MetricGAN+ enhancer, creating it on first use.

    The SpeechBrain import and the model load both happen inside this
    function; the instance is cached in the module-level ``ENHANCER``.
    """
    global ENHANCER
    if ENHANCER is not None:
        return ENHANCER

    from speechbrain.pretrained import SpectralMaskEnhancement

    load_options = {
        "source": "speechbrain/metricgan-plus-voicebank",
        "savedir": "pretrained_metricganp",
        "run_opts": {"device": DEVICE},
    }
    ENHANCER = SpectralMaskEnhancement.from_hparams(**load_options)
    return ENHANCER
23
 
24
  def _to_mono(x: np.ndarray) -> np.ndarray:
25
+ # x shape: (n,) or (n, ch); keep as float32 in [-1,1]
26
  if x.ndim == 2 and x.shape[1] > 1:
27
+ x = np.mean(x, axis=1)
28
+ x = np.asarray(x, dtype=np.float32)
29
+ return np.clip(x, -1.0, 1.0)
30
 
31
  def _resample(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
32
  if sr_in == sr_out:
33
+ return x.astype(np.float32, copy=False)
 
 
 
 
 
 
34
  g = np.gcd(sr_in, sr_out)
35
  up, down = sr_out // g, sr_in // g
36
  y = resample_poly(x, up, down).astype(np.float32)
37
  return y
38
 
39
  def _mix(dry: np.ndarray, wet: np.ndarray, strength: str) -> np.ndarray:
 
40
  mix = {"Light": 0.4, "Medium": 0.7, "Strong": 1.0}.get(strength, 0.7)
 
41
  n = min(len(dry), len(wet))
42
+ return dry[:n] * (1.0 - mix) + wet[:n] * mix
 
43
 
44
def denoise(audio: tuple, strength: str):
    """Run MetricGAN+ enhancement on an uploaded clip and blend it with the original.

    Args:
        audio: ``(sr, samples)`` as delivered by ``gr.Audio(type="numpy")``;
            samples may be a list or an array shaped ``(n,)`` or ``(n, ch)``.
        strength: "Light", "Medium" or "Strong"; forwarded to ``_mix``.

    Returns:
        ``(sr, samples)`` where samples are mono float32 at the input rate.

    Raises:
        gr.Error: If no audio was supplied or it contains no samples.
    """
    if audio is None:
        raise gr.Error("Please upload an audio file.")

    sr, data = audio
    data = np.asarray(data)  # gradio sometimes gives list
    if data.size == 0:
        raise gr.Error("Please upload an audio file.")

    # Integer PCM must be scaled into [-1, 1] before _to_mono clips it;
    # clipping raw int16 values would flatten the whole signal to +/-1.
    if np.issubdtype(data.dtype, np.integer):
        info = np.iinfo(data.dtype)
        full_scale = float(info.max) + 1.0
        data = data.astype(np.float64)
        if info.min < 0:
            data = data / full_scale
        else:
            # Unsigned PCM is centred on mid-scale rather than on zero.
            data = (data - full_scale / 2.0) / (full_scale / 2.0)

    # to mono + float32; ravel guards against a leftover (n, 1) shape
    dry_mono = np.ravel(_to_mono(data))

    # resample to 16k
    x16 = _resample(dry_mono, sr_in=sr, sr_out=TARGET_SR)

    # run enhancer (lazy load)
    enhancer = get_enhancer()
    with torch.no_grad():
        inp = torch.from_numpy(x16).unsqueeze(0)  # (1, time)
        # The second parameter of enhance_batch is `lengths` (relative length
        # of each batch item), not a sample rate; one full-length item here.
        enhanced = enhancer.enhance_batch(inp, lengths=torch.tensor([1.0]))
    if isinstance(enhanced, torch.Tensor):
        enhanced = enhanced.squeeze(0).cpu().numpy().astype(np.float32)

    # back to original SR
    enh_sr = _resample(enhanced, sr_in=TARGET_SR, sr_out=sr)

    # wet/dry
    out = _mix(dry_mono, enh_sr, strength)
    return (sr, out.astype(np.float32))
71
 
72
# -------- UI --------
# NOTE(review): the block nesting below was reconstructed from a flattened
# diff view; confirm which components belong inside each Row.
def run(uploaded, level):
    """Button handler: return the untouched upload and its denoised version."""
    if uploaded is None:
        raise gr.Error("Please upload an audio file.")
    rate, samples = uploaded
    cleaned = denoise(uploaded, level)
    return (rate, samples), cleaned


with gr.Blocks(theme=gr.themes.Soft(), css="footer{visibility:hidden}") as demo:
    gr.Markdown("### Zack’s Audio Outpost — AI Noise Reducer\nUpload a file and compare **Original vs Processed**.")
    with gr.Row():
        audio_in = gr.Audio(type="numpy", label="Upload Audio")
        strength = gr.Radio(["Light","Medium","Strong"], value="Medium", label="Noise Reduction Strength")
    run_btn = gr.Button("Run Noise Reduction", variant="primary")
    with gr.Row():
        orig = gr.Audio(label="Original")
        proc = gr.Audio(label="Processed")

    # Inputs map positionally onto run(); outputs onto its two return values.
    run_btn.click(fn=run, inputs=[audio_in, strength], outputs=[orig, proc])

demo.launch()