mrfakename committed on
Commit
a926d98
·
verified ·
1 Parent(s): 574dde7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -26
app.py CHANGED
@@ -3,50 +3,51 @@ import torchaudio
3
from snac import SNAC
import gradio as gr

# Prefer a GPU when one is present; everything below runs on this device.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Load the 24 kHz SNAC codec once at module import and switch it to
# inference mode (eval() returns the model itself, so MODEL is the codec).
MODEL = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(DEVICE)
MODEL.eval()
8
 
9
def reconstruct(audio_in):
    """Encode and decode the input audio through the SNAC codec.

    Args:
        audio_in: Gradio numpy audio, i.e. a ``(sample_rate, data)`` tuple
            where ``data`` is a numpy array — presumably ``(T,)`` for mono
            or ``(T, C)`` for multi-channel (Gradio's ``type="numpy"``
            convention); or ``None`` when no audio was provided.

    Returns:
        ``(24000, waveform)`` with the reconstructed mono waveform as a
        numpy array, or ``None`` when ``audio_in`` is ``None``.
    """
    if audio_in is None:
        return None

    sr, data = audio_in  # (sr, np.ndarray)

    # to tensor [channels, T].
    # BUG FIX: the original applied data.T unconditionally. For a mono (T,)
    # array, .T is a no-op, so the "stereo -> mono" mean below averaged over
    # *time* and collapsed the whole clip to a single sample. Handle the two
    # ranks explicitly instead.
    if data.ndim == 1:
        data = data[None, :]  # (T,)   -> (1, T)
    else:
        data = data.T         # (T, C) -> (C, T)
    audio = torch.from_numpy(data).float()

    # resample to the model's 24 kHz rate if needed
    if sr != 24000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=24000)
        audio = resampler(audio)

    # stereo -> mono (now guaranteed to average over channels, not time)
    if audio.size(0) > 1:
        audio = audio.mean(dim=0, keepdim=True)

    # expand to [1, 1, T] as the codec expects a batch dimension
    audio = audio.unsqueeze(0).to(DEVICE)

    with torch.inference_mode():
        out = MODEL(audio)
        # some SNAC versions return (audio_hat, codes); keep only the audio
        audio_hat = out[0] if isinstance(out, (list, tuple)) else out

    y = audio_hat.squeeze().cpu().numpy()
    return (24000, y)
35
 
36
# Two-column Gradio front-end: input (upload/mic + button) on the left,
# reconstructed audio player on the right.
with gr.Blocks() as demo:
    gr.Markdown("## SNAC Audio Reconstructor (24 kHz)")

    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(
                sources=["upload", "microphone"],
                type="numpy",
                label="Input Audio",
            )
            reconstruct_btn = gr.Button("Reconstruct")
        with gr.Column():
            output_audio = gr.Audio(type="numpy", label="Reconstructed")

    # Run the codec round-trip when the button is pressed.
    reconstruct_btn.click(fn=reconstruct, inputs=input_audio, outputs=output_audio)

if __name__ == "__main__":
    demo.launch()
 
3
from snac import SNAC
import gradio as gr

# Checkpoint and its sample rate — NOTE: these must stay in sync
# (this checkpoint is the 24 kHz SNAC model).
MODEL_NAME = "hubertsiuzdak/snac_24khz"
SR = 24000

# Run on GPU when available, CPU otherwise.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Instantiate the codec once at import time, in eval mode, on the chosen
# device (eval() returns the module itself, so the chain is equivalent).
model = SNAC.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()
12
 
13
def reconstruct(audio_in):
    """Round-trip the input audio through the SNAC codec.

    Args:
        audio_in: Gradio numpy audio, i.e. a ``(sample_rate, data)`` tuple
            where ``data`` is ``(T,)`` for mono or ``(T, C)`` for
            multi-channel; or ``None`` when no audio was provided.

    Returns:
        ``(SR, waveform)`` with the reconstructed mono waveform as a numpy
        array at the model's 24 kHz rate, or ``None`` for no input.
    """
    if audio_in is None:
        return None

    sr, data = audio_in  # data: (T,) or (T, C)

    # convert stereo -> mono before any tensor work
    if data.ndim == 2:
        data = data.mean(axis=1)

    audio = torch.from_numpy(data).float()

    # BUG FIX: this rewrite dropped the resampling step the previous version
    # had. The model runs at SR (24 kHz); without resampling, any input at a
    # different rate is both mis-encoded and returned mislabeled as SR,
    # producing a pitch/speed shift. Resample whenever sr != SR.
    if sr != SR:
        audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SR)(audio)

    # turn into torch [1, 1, T] as SNAC expects batch and channel dims
    audio = audio.unsqueeze(0).unsqueeze(0).to(DEVICE)

    # run through SNAC; the model returns (reconstruction, codes) and the
    # codes are not needed here
    with torch.inference_mode():
        audio_hat, _codes = model(audio)

    y = audio_hat.squeeze().cpu().numpy()
    return (SR, y)
32
 
33
# Minimal Gradio front-end: upload or record audio, push it through the
# SNAC encode/decode round trip, and play back the reconstruction.
with gr.Blocks(title="SNAC Round-Trip Demo") as demo:
    gr.Markdown("## 🎧 SNAC Audio Reconstructor (minimal!)")

    with gr.Row():
        with gr.Column():
            source_audio = gr.Audio(
                sources=["upload", "microphone"],
                type="numpy",
                label="Input audio",
            )
            run_button = gr.Button("Encode + Decode")
        with gr.Column():
            result_audio = gr.Audio(type="numpy", label="Reconstructed audio")

    # Wire the button to the round-trip function.
    run_button.click(reconstruct, inputs=source_audio, outputs=result_audio)

if __name__ == "__main__":
    demo.launch()