Spaces:

mrfakename
/

SNAC

Paused

App Files Files Community

mrfakename committed 9 days ago

Commit

574dde7

verified ·

1 Parent(s): 55781cc

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -33

app.py CHANGED Viewed

@@ -1,60 +1,50 @@
-# pip install gradio torch torchaudio soundfile snac
 import torch
 import torchaudio
 from snac import SNAC
 import gradio as gr
 # Run on the GPU when torch reports CUDA as available; otherwise use the CPU.
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# load SNAC once
 # Pretrained checkpoint, switched to eval mode and moved onto DEVICE.
 MODEL = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(DEVICE)
 def reconstruct(audio_in):
     """Pass an audio clip through SNAC and return the reconstruction.

     Args:
         audio_in: ``(sample_rate, samples)`` tuple where ``samples`` is a
             NumPy array shaped ``(T,)`` or ``(T, channels)``, or ``None``
             when no audio was provided.

     Returns:
         ``(24000, waveform)`` with the reconstructed waveform clamped to
         ``[-1, 1]`` as a NumPy array, or ``None`` when ``audio_in`` is
         ``None``.
     """
     if audio_in is None:
         return None
     sr, data = audio_in
     x = torch.from_numpy(data)
     # Integer PCM has to be rescaled to [-1, 1); a bare .float() would hand
     # the model samples as large as +/-32768 for int16 input.
     if not x.is_floating_point():
         info = torch.iinfo(x.dtype)
         half_range = (info.max - info.min + 1) / 2
         x = (x.double() - (info.min + half_range)) / half_range
     x = x.float()
     # Average the channel axis of any 2-D input, including (T, 1), so the
     # tensor below is always [1, T].
     if x.dim() == 2:
         x = x.mean(dim=1)
     x = x.unsqueeze(0)  # [1, T]
     if sr != 24000:
         x = torchaudio.functional.resample(x, orig_freq=sr, new_freq=24000)
     x = x.unsqueeze(0).to(DEVICE)  # [1, 1, T]
     with torch.inference_mode():
         out = MODEL(x)
         # The model may return a tuple/list; the first item is used as audio.
         audio_hat = out[0] if isinstance(out, (list, tuple)) else out
     y = audio_hat.squeeze(0).squeeze(0).detach().cpu()
     y = torch.clamp(y, -1.0, 1.0)
     return (24000, y.numpy())
# Two-column page: audio input plus trigger button on the left, the
# reconstructed audio player on the right.
with gr.Blocks(title="SNAC Audio Reconstructor") as demo:
    gr.Markdown("## 🎵 SNAC Audio Reconstructor (24kHz)")
    gr.Markdown(
        "Upload or record audio. It’ll get resampled to 24kHz, mono-ized, "
        "then passed through SNAC for reconstruction."
    )

    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(
                label="Input audio",
                type="numpy",
                sources=["upload", "microphone"],
            )
            btn = gr.Button("Reconstruct")
        with gr.Column():
            audio_out = gr.Audio(label="Reconstructed audio (24kHz)", type="numpy")

    # Clicking the button runs reconstruct on the input and shows the result.
    btn.click(reconstruct, audio_in, audio_out)

 import torch
 import torchaudio
 from snac import SNAC
 import gradio as gr
 # Run on the GPU when torch reports CUDA as available; otherwise use the CPU.
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # Load the pretrained SNAC checkpoint once at import time, switch it to eval
 # mode and move it onto DEVICE so every request reuses the same model.
 MODEL = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(DEVICE)
 def reconstruct(audio_in):
     """Encode and decode an audio clip with SNAC and return the result.

     Args:
         audio_in: ``(sample_rate, samples)`` tuple where ``samples`` is a
             NumPy array shaped ``(T,)`` or ``(T, channels)``, or ``None``
             when no audio was provided.

     Returns:
         ``(24000, waveform)`` where ``waveform`` is the model output as a
         NumPy array with singleton dimensions removed, or ``None`` when
         ``audio_in`` is ``None``.
     """
     if audio_in is None:
         return None
     sr, data = audio_in  # (sr, np.ndarray)
     audio = torch.from_numpy(data)
     # Integer PCM has to be rescaled to [-1, 1); a bare .float() would hand
     # the model samples as large as +/-32768 for int16 input.
     if not audio.is_floating_point():
         info = torch.iinfo(audio.dtype)
         half_range = (info.max - info.min + 1) / 2
         audio = (audio.double() - (info.min + half_range)) / half_range
     audio = audio.float()
     # Bring the samples to [channels, T]. A 1-D array is one channel; taking
     # .T of it would leave it 1-D and the mono mix below would then average
     # over time, collapsing the clip to a single sample.
     if audio.dim() == 1:
         audio = audio.unsqueeze(0)
     else:
         audio = audio.transpose(0, 1).contiguous()
     # stereo -> mono (done before resampling so only one channel is resampled)
     if audio.size(0) > 1:
         audio = audio.mean(dim=0, keepdim=True)
     # resample to 24k if needed
     if sr != 24000:
         resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=24000)
         audio = resampler(audio)
     # expand to [1, 1, T]
     audio = audio.unsqueeze(0).to(DEVICE)
     with torch.inference_mode():
         out = MODEL(audio)
         # The model may return a tuple/list; the first item is used as audio.
         audio_hat = out[0] if isinstance(out, (list, tuple)) else out
     y = audio_hat.squeeze().cpu().numpy()
     return (24000, y)
# Two-column page: audio input plus trigger button on the left, the
# reconstructed audio player on the right.
with gr.Blocks() as demo:
    gr.Markdown("## SNAC Audio Reconstructor (24 kHz)")

    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(
                label="Input Audio",
                type="numpy",
                sources=["upload", "microphone"],
            )
            btn = gr.Button("Reconstruct")
        with gr.Column():
            audio_out = gr.Audio(label="Reconstructed", type="numpy")

    # Clicking the button runs reconstruct on the input and shows the result.
    btn.click(reconstruct, audio_in, audio_out)