Spaces:
Paused
Paused
Commit
·
2788c71
1
Parent(s):
60aba0c
minor improvements
Browse files
app.py
CHANGED
@@ -8,16 +8,13 @@ import numpy as np
|
|
8 |
|
9 |
|
10 |
@spaces.GPU
|
11 |
-
def process_wav_new(in_wav):
|
12 |
"""wav = torch.randn(1, 1, 24000 * 10) # should be [B, C=1, T]"""
|
13 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
14 |
|
15 |
mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
|
16 |
mimi = loaders.get_mimi(mimi_weight, device='cpu')
|
17 |
mimi.set_num_codebooks(8) # up to 32 for mimi, but limited to 8 for moshi.
|
18 |
-
# frame_size = int(mimi.sample_rate / mimi.frame_rate)
|
19 |
-
# wav = select_audio_frame(in_wav, frame_size)
|
20 |
-
wav = in_wav
|
21 |
|
22 |
with torch.no_grad():
|
23 |
# Supports streaming too.
|
@@ -32,23 +29,24 @@ def process_wav_new(in_wav):
|
|
32 |
|
33 |
mimi.to(device)
|
34 |
moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
|
35 |
-
moshi = loaders.get_moshi_lm(moshi_weight, device='cuda')
|
|
|
36 |
lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7) # this handles sampling params etc.
|
37 |
|
38 |
out_wav_chunks = []
|
39 |
# Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
|
40 |
with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
|
41 |
for idx, code in enumerate(all_codes):
|
42 |
-
print("CODE: ", code.shape)
|
43 |
tokens_out = lm_gen.step(code.to(device))
|
44 |
# tokens_out is [B, 1 + 8, 1], with tokens_out[:, 1] representing the text token.
|
45 |
if tokens_out is not None:
|
46 |
wav_chunk = mimi.decode(tokens_out[:, 1:])
|
47 |
out_wav_chunks.append(wav_chunk)
|
48 |
print(idx, end='\r')
|
49 |
-
out_wav = torch.cat(out_wav_chunks, dim=-1)
|
50 |
|
51 |
-
return out_wav
|
52 |
|
53 |
def convert2wav(audio):
|
54 |
if audio is None:
|
|
|
8 |
|
9 |
|
10 |
@spaces.GPU
|
11 |
+
def process_wav_new(wav):
|
12 |
"""wav = torch.randn(1, 1, 24000 * 10) # should be [B, C=1, T]"""
|
13 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
14 |
|
15 |
mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
|
16 |
mimi = loaders.get_mimi(mimi_weight, device='cpu')
|
17 |
mimi.set_num_codebooks(8) # up to 32 for mimi, but limited to 8 for moshi.
|
|
|
|
|
|
|
18 |
|
19 |
with torch.no_grad():
|
20 |
# Supports streaming too.
|
|
|
29 |
|
30 |
mimi.to(device)
|
31 |
moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
|
32 |
+
moshi = loaders.get_moshi_lm(moshi_weight, device='cpu')
|
33 |
+
moshi.to(device) # Move to GPU after loading
|
34 |
lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7) # this handles sampling params etc.
|
35 |
|
36 |
out_wav_chunks = []
|
37 |
# Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
|
38 |
with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
|
39 |
for idx, code in enumerate(all_codes):
|
40 |
+
# print("CODE: ", code.shape)
|
41 |
tokens_out = lm_gen.step(code.to(device))
|
42 |
# tokens_out is [B, 1 + 8, 1], with tokens_out[:, 1] representing the text token.
|
43 |
if tokens_out is not None:
|
44 |
wav_chunk = mimi.decode(tokens_out[:, 1:])
|
45 |
out_wav_chunks.append(wav_chunk)
|
46 |
print(idx, end='\r')
|
47 |
+
# out_wav = torch.cat(out_wav_chunks, dim=-1)
|
48 |
|
49 |
+
return torch.cat(out_wav_chunks, dim=-1)
|
50 |
|
51 |
def convert2wav(audio):
|
52 |
if audio is None:
|