ZDingman committed on
Commit 0af1c3f · verified · 1 parent: e2a2b71

Update app.py

Files changed (1): app.py (+86 −69)
app.py CHANGED
@@ -1,79 +1,96 @@
  import gradio as gr
  import numpy as np
- import soundfile as sf
-
- # DeepFilterNet2
- from df.enhance import enhance, init_df
-
- APP_TITLE = "Zack’s Audio Outpost — Voice Denoiser (DeepFilterNet2)"
- APP_DESC = (
-     "Upload a voice clip with traffic/hiss/room noise and compare Original vs Processed. "
-     "Choose Light / Medium / Strong (1× / 2× / 3× passes)."
  )

- # Load DFN2 once (first run can take a few minutes while the Space installs packages)
- MODEL_DF, DF_STATE, _ = init_df()
-
- def _ensure_2d(x: np.ndarray) -> np.ndarray:
-     """Make shape (samples, channels)."""
-     if x.ndim == 1:
-         x = x[:, None]
-     return x
-
- def _run_single_pass(stereo: np.ndarray) -> np.ndarray:
-     """Run DFN2 per channel; keep same length/channels."""
-     out = np.zeros_like(stereo, dtype=np.float32)
-     for ch in range(stereo.shape[1]):
-         y = enhance(stereo[:, ch].astype(np.float32),
-                     DF_STATE, model=MODEL_DF, atten_lim_db=12.0)
-         out[:len(y), ch] = y[:stereo.shape[0]]
-     return out
-
- def process(file_obj, strength):
-     if file_obj is None:
-         raise gr.Error("Please upload an audio file first.")
-
-     # Load original audio (mono or stereo)
-     audio, sr = sf.read(file_obj.name, always_2d=False)
-     x = _ensure_2d(audio.astype(np.float32))
-
-     # Map UI strength to number of passes
-     passes = {"Light": 1, "Medium": 2, "Strong": 3}[strength]
-
-     y = x.copy()
-     for _ in range(passes):
-         y = _run_single_pass(y)
-
-     # Avoid clipping if multi-pass pushed levels
-     y = np.clip(y, -1.0, 1.0)
-
-     # Gradio wants (sr, np.array). If mono, squeeze back to 1D
-     return (sr, audio), (sr, y.squeeze())
-
- THEME = gr.themes.Soft(primary_hue="cyan", neutral_hue="slate").set(
-     body_background_fill="#0b1020",
-     body_text_color="#e6ecff",
-     block_background_fill="#121830",
-     block_border_color="#243154",
-     button_primary_background_fill="#3dd6ff",
-     button_primary_text_color="#001018",
-     input_background_fill="#0e1530",
-     input_border_color="#243154",
- )

- with gr.Blocks(title=APP_TITLE, theme=THEME) as demo:
-     gr.Markdown(f"## {APP_TITLE}\n{APP_DESC}")
      with gr.Row():
-         file = gr.File(label="Upload audio", file_types=["audio"])
-         strength = gr.Radio(["Light","Medium","Strong"], value="Medium",
-                             label="Noise reduction strength")
-     run = gr.Button("Process", variant="primary")

      with gr.Row():
-         a_orig = gr.Audio(label="Original (A)", interactive=False)
-         a_proc = gr.Audio(label="Processed (B)", interactive=False)

-     run.click(process, inputs=[file, strength], outputs=[a_orig, a_proc])

- if __name__ == "__main__":
-     demo.launch()
  import gradio as gr
  import numpy as np
+ import torch
+ import torchaudio
+ from speechbrain.pretrained import SpectralMaskEnhancement
+
+ # Download once and cache in the Space
+ ENHANCER = SpectralMaskEnhancement.from_hparams(
+     source="speechbrain/metricgan-plus-voicebank",
+     savedir="pretrained/metricgan-plus-voicebank",
  )

+ TARGET_SR = 16000  # model sample rate
+
+ def _to_tensor(mono_np: np.ndarray) -> torch.Tensor:
+     # Ensure float32 in [-1, 1]
+     t = torch.from_numpy(mono_np.astype(np.float32))
+     peak = t.abs().max().clamp(min=1e-8)
+     return t / peak
+
+ def _enhance_channel(wav_np: np.ndarray, in_sr: int, mix: float) -> np.ndarray:
+     """Enhance one channel and apply a wet/dry mix."""
+     x = _to_tensor(wav_np)
+     if in_sr != TARGET_SR:
+         x16 = torchaudio.functional.resample(x, in_sr, TARGET_SR)
+     else:
+         x16 = x
+
+     with torch.no_grad():
+         # enhance_batch expects a [B, T] batch and per-clip relative lengths
+         est16 = ENHANCER.enhance_batch(x16.unsqueeze(0), lengths=torch.tensor([1.0]))[0]
+
+     # Back to the original sample rate
+     if in_sr != TARGET_SR:
+         est = torchaudio.functional.resample(est16, TARGET_SR, in_sr)
+     else:
+         est = est16
+
+     # Trim/pad to the original length
+     n = x.shape[0]
+     if est.shape[0] >= n:
+         est = est[:n]
+     else:
+         est = torch.nn.functional.pad(est, (0, n - est.shape[0]))
+
+     y = (1.0 - mix) * x + mix * est
+     return y.cpu().numpy()
+
+ def denoise(audio, strength):
+     """
+     Gradio passes (sr, numpy) when type='numpy'.
+     numpy is shape [T] (mono) or [T, 2] (stereo).
+     We process mono or true stereo.
+     """
+     if audio is None:
+         return None, None
+
+     sr, data = audio
+     if data.ndim == 1:  # mono
+         chs = [data]
+     else:  # stereo (T, 2)
+         chs = [data[:, 0], data[:, 1]]
+
+     mix_map = {"Light": 0.5, "Medium": 0.75, "Strong": 1.0}
+     mix = mix_map.get(strength, 0.75)
+
+     out_chs = [_enhance_channel(c, sr, mix) for c in chs]
+
+     if len(out_chs) == 2:
+         processed = np.stack(out_chs, axis=1)  # (T, 2)
+     else:
+         processed = out_chs[0]
+     original = data
+
+     # Return both so users can A/B
+     return (sr, original), (sr, processed)
+
+ # -------- UI --------
+ with gr.Blocks(css="footer {visibility: hidden}") as demo:
+     gr.Markdown("## Zack’s Audio Outpost — AI Noise Reducer\nUpload a file and compare **Original** vs **Processed**.")

      with gr.Row():
+         audio_in = gr.Audio(type="numpy", label="Upload Audio")
+         strength = gr.Radio(["Light", "Medium", "Strong"], value="Medium",
+                             label="Noise Reduction Strength")
+
+     run = gr.Button("Run Noise Reduction", variant="primary")

      with gr.Row():
+         out_orig = gr.Audio(label="Original Audio")
+         out_proc = gr.Audio(label="Processed Audio")

+     run.click(denoise, inputs=[audio_in, strength], outputs=[out_orig, out_proc])

+ demo.launch()
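
For anyone reviewing the change, the new enhancement path can be exercised outside Gradio. The snippet below is a minimal sketch, not part of the commit: it assumes speechbrain and torch are installed, and the one-second random-noise clip (and the enhancer/noisy names) are illustrative only.

import torch
from speechbrain.pretrained import SpectralMaskEnhancement

# Same pretrained model the app loads; cached locally after the first download.
enhancer = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained/metricgan-plus-voicebank",
)

sr = 16000                        # the model's native rate, so no resampling is needed
noisy = 0.1 * torch.randn(1, sr)  # [B, T]: a batch holding one synthetic 1-second clip

with torch.no_grad():
    # lengths gives each clip's relative length in (0, 1]; 1.0 means "use the whole clip"
    clean = enhancer.enhance_batch(noisy, lengths=torch.tensor([1.0]))

# clean is [1, T']; T' can differ slightly from T after the model's STFT/ISTFT
# round trip, which is why _enhance_channel trims or pads back to the input length.
print(noisy.shape, clean.shape)

The wet/dry mix in _enhance_channel then blends this estimate with the input, so "Light" (mix 0.5) keeps half of the original signal rather than asking the model itself to denoise less aggressively.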