Spaces:

justus-tobias
/

Moshi

Paused

App Files Files Community

justus-tobias committed on Sep 25, 2024

Commit

2788c71

1 Parent(s): 60aba0c

minor improvements

Browse files

Files changed (1) hide show

app.py +6 -8

app.py CHANGED Viewed

@@ -8,16 +8,13 @@ import numpy as np
 @spaces.GPU
-def process_wav_new(in_wav):
     """wav = torch.randn(1, 1, 24000 * 10)  # should be [B, C=1, T]"""
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
     mimi = loaders.get_mimi(mimi_weight, device='cpu')
     mimi.set_num_codebooks(8)  # up to 32 for mimi, but limited to 8 for moshi.
-    # frame_size = int(mimi.sample_rate / mimi.frame_rate)
-    # wav = select_audio_frame(in_wav, frame_size)
-    wav = in_wav
     with torch.no_grad():
         # Supports streaming too.
@@ -32,23 +29,24 @@ def process_wav_new(in_wav):
     mimi.to(device)
     moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
-    moshi = loaders.get_moshi_lm(moshi_weight, device='cuda')
     lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)  # this handles sampling params etc.
     out_wav_chunks = []
     # Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
     with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
         for idx, code in enumerate(all_codes):
-            print("CODE: ", code.shape)
             tokens_out = lm_gen.step(code.to(device))
             # tokens_out is [B, 1 + 8, 1], with tokens_out[:, 1] representing the text token.
             if tokens_out is not None:
                 wav_chunk = mimi.decode(tokens_out[:, 1:])
                 out_wav_chunks.append(wav_chunk)
             print(idx, end='\r')
-    out_wav = torch.cat(out_wav_chunks, dim=-1)
-    return out_wav
 def convert2wav(audio):
     if audio is None:

 @spaces.GPU
+def process_wav_new(wav):
     """wav = torch.randn(1, 1, 24000 * 10)  # should be [B, C=1, T]"""
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
     mimi = loaders.get_mimi(mimi_weight, device='cpu')
     mimi.set_num_codebooks(8)  # up to 32 for mimi, but limited to 8 for moshi.
     with torch.no_grad():
         # Supports streaming too.
     mimi.to(device)
     moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
+    moshi = loaders.get_moshi_lm(moshi_weight, device='cpu')
+    moshi.to(device)  # Move to GPU after loading
     lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)  # this handles sampling params etc.
     out_wav_chunks = []
     # Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
     with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
         for idx, code in enumerate(all_codes):
+            # print("CODE: ", code.shape)
             tokens_out = lm_gen.step(code.to(device))
             # tokens_out is [B, 1 + 8, 1], with tokens_out[:, 1] representing the text token.
             if tokens_out is not None:
                 wav_chunk = mimi.decode(tokens_out[:, 1:])
                 out_wav_chunks.append(wav_chunk)
             print(idx, end='\r')
+    # out_wav = torch.cat(out_wav_chunks, dim=-1)
+    return torch.cat(out_wav_chunks, dim=-1)
 def convert2wav(audio):
     if audio is None: