shukdevdatta123 committed on
Commit
0a51a48
·
verified ·
1 Parent(s): 8ed43f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -30
app.py CHANGED
@@ -1,11 +1,10 @@
1
  import gradio as gr
2
  from bark import SAMPLE_RATE, generate_audio, preload_models
3
- from bark.generation import load_model, generate_text_semantic, _tokenize
4
  from scipy.io.wavfile import write as write_wav
5
  import tempfile
6
- import torch
7
  import librosa
8
  import numpy as np
 
9
 
10
  # Save the original torch.load function
11
  original_load = torch.load
@@ -34,39 +33,26 @@ def preprocess_audio_to_npz(audio_path):
34
  Returns:
35
  str: Path to the generated .npz file.
36
  """
37
- # Set device to CPU
 
 
 
 
 
 
 
 
38
  with torch.device("cpu"):
39
- # Load and resample audio to Bark's SAMPLE_RATE (24kHz)
40
- audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
41
-
42
- # Ensure audio is a float32 array
43
- audio = audio.astype(np.float32)
44
-
45
- # Load HuBERT models for semantic token extraction
46
- hubert_manager = load_model(model_type="hubert")
47
- hubert_tokenizer = load_model(model_type="hubert_tokenizer")
48
-
49
- # Generate semantic tokens
50
- tokens = _tokenize(audio, hubert_manager, hubert_tokenizer)
51
- semantic_tokens = tokens[0] # Extract semantic tokens
52
-
53
- # Load coarse model for coarse tokens
54
- coarse_model = load_model(model_type="coarse")
55
-
56
- # Generate coarse tokens
57
- coarse_tokens = generate_text_semantic(
58
- semantic_tokens=semantic_tokens,
59
- model=coarse_model,
60
- max_gen_len=512
61
- )
62
 
63
- # Create history prompt dictionary
64
  history_prompt = {
65
- "semantic_prompt": semantic_tokens,
66
- "coarse_prompt": coarse_tokens
67
  }
68
 
69
- # Save to temporary .npz file
70
  with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
71
  np.savez(temp_file.name, **history_prompt)
72
  npz_path = temp_file.name
 
1
  import gradio as gr
2
  from bark import SAMPLE_RATE, generate_audio, preload_models
 
3
  from scipy.io.wavfile import write as write_wav
4
  import tempfile
 
5
  import librosa
6
  import numpy as np
7
+ import torch
8
 
9
  # Save the original torch.load function
10
  original_load = torch.load
 
33
  Returns:
34
  str: Path to the generated .npz file.
35
  """
36
+ # Load and resample audio to Bark's SAMPLE_RATE (24kHz)
37
+ audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
38
+
39
+ # Ensure audio is a float32 array
40
+ audio = audio.astype(np.float32)
41
+
42
+ # Run Bark's generate_audio with the input clip passed as history_prompt (no semantic tokens are extracted here)
43
+ # Since HuBERT models are not implemented, we rely on generate_audio's history prompt
44
+ # This is a simplified approach that assumes Bark accepts a raw audio path as history_prompt (unverified; confirm against Bark's history-prompt loader)
45
  with torch.device("cpu"):
46
+ # Generate audio tokens to create a history prompt
47
+ # We use a dummy text to generate a history prompt from the audio
48
+ dummy_text = "Dummy text for history prompt generation."
49
+ audio_array = generate_audio(dummy_text, history_prompt=audio_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
+ # Save the audio array as a temporary .npz file
52
  history_prompt = {
53
+ "audio": audio_array
 
54
  }
55
 
 
56
  with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
57
  np.savez(temp_file.name, **history_prompt)
58
  npz_path = temp_file.name