Spaces:

justus-tobias
/

Moshi

Paused

justus-tobias committed on Sep 25, 2024

Commit

60aba0c

1 Parent(s): 3d2b6af

updated spaces

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,14 +1,16 @@
 import gradio as gr
 import torch
 from huggingface_hub import hf_hub_download
 from moshi.models import loaders, LMGen
 import numpy as np
-import spaces
 @spaces.GPU
 def process_wav_new(in_wav):
     """wav = torch.randn(1, 1, 24000 * 10)  # should be [B, C=1, T]"""
     mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
     mimi = loaders.get_mimi(mimi_weight, device='cpu')
@@ -28,7 +30,7 @@ def process_wav_new(in_wav):
                 assert codes.shape[-1] == 1, codes.shape
                 all_codes.append(codes)
-    mimi.cuda()
     moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
     moshi = loaders.get_moshi_lm(moshi_weight, device='cuda')
     lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)  # this handles sampling params etc.
@@ -38,7 +40,7 @@ def process_wav_new(in_wav):
     with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
         for idx, code in enumerate(all_codes):
             print("CODE: ", code.shape)
-            tokens_out = lm_gen.step(code.cuda())
             # tokens_out is [B, 1 + 8, 1], with tokens_out[:, 1] representing the text token.
             if tokens_out is not None:
                 wav_chunk = mimi.decode(tokens_out[:, 1:])

+import spaces
 import gradio as gr
 import torch
 from huggingface_hub import hf_hub_download
 from moshi.models import loaders, LMGen
 import numpy as np
 @spaces.GPU
 def process_wav_new(in_wav):
     """wav = torch.randn(1, 1, 24000 * 10)  # should be [B, C=1, T]"""
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
     mimi = loaders.get_mimi(mimi_weight, device='cpu')
                 assert codes.shape[-1] == 1, codes.shape
                 all_codes.append(codes)
+    mimi.to(device)
     moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
     moshi = loaders.get_moshi_lm(moshi_weight, device='cuda')
     lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)  # this handles sampling params etc.
     with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
         for idx, code in enumerate(all_codes):
             print("CODE: ", code.shape)
+            tokens_out = lm_gen.step(code.to(device))
             # tokens_out is [B, 1 + 8, 1], with tokens_out[:, 1] representing the text token.
             if tokens_out is not None:
                 wav_chunk = mimi.decode(tokens_out[:, 1:])