Wataru committed on
Commit
c016dba
·
verified ·
1 Parent(s): 01e9d1f

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +69 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch
4
+ import torchaudio
5
+ import transformers
6
+ from huggingface_hub import hf_hub_download
7
+
8
# Hugging Face Hub repository that hosts the TorchScript Sidon checkpoints.
_SIDON_REPO_ID = "sarulab-speech/sidon-v0.1"

# Download (or reuse from the local hub cache) the two TorchScript archives.
fe_path = hf_hub_download(_SIDON_REPO_ID, filename="feature_extractor.pt")
decoder_path = hf_hub_download(_SIDON_REPO_ID, filename="decoder.pt")

# denoise_speech never moves its inputs off the CPU, so pin both scripted
# modules to the CPU explicitly. Without map_location, torch.jit.load restores
# tensors onto the device they were saved from, which breaks on CPU-only hosts
# if the checkpoints were exported from an accelerator.
fe = torch.jit.load(fe_path, map_location="cpu")
decoder = torch.jit.load(decoder_path, map_location="cpu")

# Turns raw 16 kHz audio into the "input_features" consumed by `fe`.
preprocessor = transformers.SeamlessM4TFeatureExtractor.from_pretrained(
    "facebook/w2v-bert-2.0"
)
16
+
17
+
18
def denoise_speech(audio):
    """Restore a noisy speech recording with the Sidon model.

    Args:
        audio: ``None`` or a ``(sample_rate, waveform)`` pair as produced by
            ``gr.Audio(type="numpy")``. ``waveform`` may be a NumPy array or a
            torch tensor, either mono ``(samples,)`` or multi-channel.

    Returns:
        ``None`` when there is nothing to process (no audio, or zero samples).
        Otherwise ``(48_000, pcm)`` where ``pcm`` is an ``int16`` NumPy array
        of shape ``(n, 1)``.
    """
    if audio is None:
        return None

    sample_rate, waveform = audio

    # Work in float32 from the start: integer PCM (e.g. int16) would otherwise
    # overflow in abs() for the most negative value and skew the peak.
    if isinstance(waveform, torch.Tensor):
        waveform = waveform.detach().to(device="cpu", dtype=torch.float32)
    else:
        waveform = torch.from_numpy(np.array(waveform, dtype=np.float32))

    # An empty clip has no peak to normalise by and nothing to restore.
    if waveform.numel() == 0:
        return None

    # Down-mix to mono. Gradio delivers multi-channel audio as
    # (samples, channels); channel-first input is also tolerated by treating
    # the shorter axis as the channel axis.
    if waveform.ndim > 1:
        channel_axis = 0 if waveform.shape[0] < waveform.shape[-1] else -1
        waveform = waveform.mean(dim=channel_axis)

    # Peak-normalise to 0.9 full scale. Skip for all-zero (silent) input so
    # that we do not divide by zero and feed NaNs to the model.
    peak = waveform.abs().max()
    if peak > 0:
        waveform = 0.9 * (waveform / peak)

    # Add a batch dimension, remove content below 50 Hz, and resample to the
    # 16 kHz rate used for feature extraction.
    waveform = waveform.reshape(1, -1)
    wav = torchaudio.functional.highpass_biquad(waveform, sample_rate, 50)
    wav_16k = torchaudio.functional.resample(wav, sample_rate, 16_000)

    chunk_samples = 16000 * 20  # process 20 s of 16 kHz audio at a time
    context_frames = 5  # trailing feature frames carried into the next chunk
    # Decoder output samples dropped from the front of each chunk.
    # NOTE(review): 4800 presumably equals the decoder output produced by the
    # `context_frames` cached frames, and 150 the warm-up of the very first
    # chunk -- confirm against the model's frame hop.
    first_chunk_trim = 50 * 3
    later_chunk_trim = 4800

    restoreds = []
    feature_cache = None
    for chunk in wav_16k.reshape(-1).split(chunk_samples):
        # 40 samples of zero padding are added on each side before extraction.
        inputs = preprocessor(
            torch.nn.functional.pad(chunk, (40, 40)), return_tensors="pt"
        )
        with torch.inference_mode():
            feature = fe(inputs["input_features"])["last_hidden_state"]
            if feature_cache is None:
                trim = first_chunk_trim
            else:
                # Prepend the previous chunk's tail so the decoder sees
                # continuous context across the chunk boundary.
                feature = torch.cat([feature_cache, feature], dim=1)
                trim = later_chunk_trim
            restored_chunk = decoder(feature.transpose(1, 2))[:, :, trim:]
            feature_cache = feature[:, -context_frames:, :]
        restoreds.append(restored_chunk)
    restored_wav = torch.cat(restoreds, dim=-1)

    # Clamp to [-1, 1] before scaling: values outside that range would wrap
    # around when cast to int16 and produce loud clicks.
    pcm = torch.clamp(restored_wav, -1.0, 1.0).reshape(-1, 1).numpy() * 32767
    return 48_000, pcm.astype(np.int16)
57
+
58
+
59
# Gradio UI: a single audio input routed through denoise_speech to a single
# audio output. Both components exchange (sample_rate, ndarray) pairs.
_noisy_input = gr.Audio(type="numpy", label="Noisy Speech")
_restored_output = gr.Audio(type="numpy", label="Restored Speech")

iface = gr.Interface(
    fn=denoise_speech,
    inputs=_noisy_input,
    outputs=_restored_output,
    title="Sidon Speech Restoration",
    description="Upload a noisy audio file and the Sidon model will restore it.",
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ pydantic==2.10.6
4
+ transformers
5
+ gradio