Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,11 +1,10 @@
|
|
1 |
import gradio as gr
|
2 |
from bark import SAMPLE_RATE, generate_audio, preload_models
|
3 |
-
from bark.generation import load_model, generate_text_semantic, _tokenize
|
4 |
from scipy.io.wavfile import write as write_wav
|
5 |
import tempfile
|
6 |
-
import torch
|
7 |
import librosa
|
8 |
import numpy as np
|
|
|
9 |
|
10 |
# Save the original torch.load function
|
11 |
original_load = torch.load
|
@@ -34,39 +33,26 @@ def preprocess_audio_to_npz(audio_path):
|
|
34 |
Returns:
|
35 |
str: Path to the generated .npz file.
|
36 |
"""
|
37 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
with torch.device("cpu"):
|
39 |
-
#
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
audio = audio.astype(np.float32)
|
44 |
-
|
45 |
-
# Load HuBERT models for semantic token extraction
|
46 |
-
hubert_manager = load_model(model_type="hubert")
|
47 |
-
hubert_tokenizer = load_model(model_type="hubert_tokenizer")
|
48 |
-
|
49 |
-
# Generate semantic tokens
|
50 |
-
tokens = _tokenize(audio, hubert_manager, hubert_tokenizer)
|
51 |
-
semantic_tokens = tokens[0] # Extract semantic tokens
|
52 |
-
|
53 |
-
# Load coarse model for coarse tokens
|
54 |
-
coarse_model = load_model(model_type="coarse")
|
55 |
-
|
56 |
-
# Generate coarse tokens
|
57 |
-
coarse_tokens = generate_text_semantic(
|
58 |
-
semantic_tokens=semantic_tokens,
|
59 |
-
model=coarse_model,
|
60 |
-
max_gen_len=512
|
61 |
-
)
|
62 |
|
63 |
-
#
|
64 |
history_prompt = {
|
65 |
-
"
|
66 |
-
"coarse_prompt": coarse_tokens
|
67 |
}
|
68 |
|
69 |
-
# Save to temporary .npz file
|
70 |
with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
|
71 |
np.savez(temp_file.name, **history_prompt)
|
72 |
npz_path = temp_file.name
|
|
|
1 |
import gradio as gr
|
2 |
from bark import SAMPLE_RATE, generate_audio, preload_models
|
|
|
3 |
from scipy.io.wavfile import write as write_wav
|
4 |
import tempfile
|
|
|
5 |
import librosa
|
6 |
import numpy as np
|
7 |
+
import torch
|
8 |
|
9 |
# Save the original torch.load function
|
10 |
original_load = torch.load
|
|
|
33 |
Returns:
|
34 |
str: Path to the generated .npz file.
|
35 |
"""
|
36 |
+
# Load and resample audio to Bark's SAMPLE_RATE (24kHz)
|
37 |
+
audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
|
38 |
+
|
39 |
+
# Ensure audio is a float32 array
|
40 |
+
audio = audio.astype(np.float32)
|
41 |
+
|
42 |
+
# Generate semantic tokens directly using Bark's internal processing
|
43 |
+
# Since HuBERT models are not implemented, we rely on generate_audio's history prompt
|
44 |
+
# This is a simplified approach assuming Bark can handle raw audio for history prompt
|
45 |
with torch.device("cpu"):
|
46 |
+
# Generate audio tokens to create a history prompt
|
47 |
+
# We use a dummy text to generate a history prompt from the audio
|
48 |
+
dummy_text = "Dummy text for history prompt generation."
|
49 |
+
audio_array = generate_audio(dummy_text, history_prompt=audio_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
+
# Save the audio array as a temporary .npz file
|
52 |
history_prompt = {
|
53 |
+
"audio": audio_array
|
|
|
54 |
}
|
55 |
|
|
|
56 |
with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
|
57 |
np.savez(temp_file.name, **history_prompt)
|
58 |
npz_path = temp_file.name
|