justus-tobias committed on
Commit
2788c71
·
1 Parent(s): 60aba0c

minor improvements

Browse files
Files changed (1) hide show
  1. app.py +6 -8
app.py CHANGED
@@ -8,16 +8,13 @@ import numpy as np
8
 
9
 
10
  @spaces.GPU
11
- def process_wav_new(in_wav):
12
  """wav = torch.randn(1, 1, 24000 * 10) # should be [B, C=1, T]"""
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
 
15
  mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
16
  mimi = loaders.get_mimi(mimi_weight, device='cpu')
17
  mimi.set_num_codebooks(8) # up to 32 for mimi, but limited to 8 for moshi.
18
- # frame_size = int(mimi.sample_rate / mimi.frame_rate)
19
- # wav = select_audio_frame(in_wav, frame_size)
20
- wav = in_wav
21
 
22
  with torch.no_grad():
23
  # Supports streaming too.
@@ -32,23 +29,24 @@ def process_wav_new(in_wav):
32
 
33
  mimi.to(device)
34
  moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
35
- moshi = loaders.get_moshi_lm(moshi_weight, device='cuda')
 
36
  lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7) # this handles sampling params etc.
37
 
38
  out_wav_chunks = []
39
  # Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
40
  with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
41
  for idx, code in enumerate(all_codes):
42
- print("CODE: ", code.shape)
43
  tokens_out = lm_gen.step(code.to(device))
44
  # tokens_out is [B, 1 + 8, 1], with tokens_out[:, 1] representing the text token.
45
  if tokens_out is not None:
46
  wav_chunk = mimi.decode(tokens_out[:, 1:])
47
  out_wav_chunks.append(wav_chunk)
48
  print(idx, end='\r')
49
- out_wav = torch.cat(out_wav_chunks, dim=-1)
50
 
51
- return out_wav
52
 
53
  def convert2wav(audio):
54
  if audio is None:
 
8
 
9
 
10
  @spaces.GPU
11
+ def process_wav_new(wav):
12
  """wav = torch.randn(1, 1, 24000 * 10) # should be [B, C=1, T]"""
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
 
15
  mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
16
  mimi = loaders.get_mimi(mimi_weight, device='cpu')
17
  mimi.set_num_codebooks(8) # up to 32 for mimi, but limited to 8 for moshi.
 
 
 
18
 
19
  with torch.no_grad():
20
  # Supports streaming too.
 
29
 
30
  mimi.to(device)
31
  moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
32
+ moshi = loaders.get_moshi_lm(moshi_weight, device='cpu')
33
+ moshi.to(device) # Move to GPU after loading
34
  lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7) # this handles sampling params etc.
35
 
36
  out_wav_chunks = []
37
  # Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
38
  with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
39
  for idx, code in enumerate(all_codes):
40
+ # print("CODE: ", code.shape)
41
  tokens_out = lm_gen.step(code.to(device))
42
  # tokens_out is [B, 1 + 8, 1], with tokens_out[:, 1] representing the text token.
43
  if tokens_out is not None:
44
  wav_chunk = mimi.decode(tokens_out[:, 1:])
45
  out_wav_chunks.append(wav_chunk)
46
  print(idx, end='\r')
47
+ # out_wav = torch.cat(out_wav_chunks, dim=-1)
48
 
49
+ return torch.cat(out_wav_chunks, dim=-1)
50
 
51
  def convert2wav(audio):
52
  if audio is None: