# Hugging Face Space (status: Running)
import os
import tempfile

import gradio as gr
import librosa
import numpy as np
import torch
from bark import SAMPLE_RATE, generate_audio, preload_models
from bark.generation import generate_coarse, generate_fine, generate_text_semantic
from scipy.io.wavfile import write as write_wav
# Keep a handle on the real torch.load so it can be put back afterwards.
original_load = torch.load


def custom_load(*args, **kwargs):
    """
    Call the original torch.load with weights_only forced to False.

    SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary
    objects, so it must only ever be used on checkpoints from a trusted
    source. That is why the patch below is limited to model preloading.
    """
    kwargs['weights_only'] = False
    return original_load(*args, **kwargs)


# Monkey-patch torch.load only while the Bark models are being preloaded.
torch.load = custom_load
try:
    preload_models()
finally:
    # Restore the original even if preloading fails, so the relaxed
    # (unsafe) loader never leaks into the rest of the process.
    torch.load = original_load
def preprocess_audio_to_npz(audio_path):
    """
    Create a .npz history prompt file for Bark and return its path.

    The reference audio is loaded and resampled, which verifies that the file
    is readable and non-empty. The prompt tokens themselves are generated from
    a fixed dummy sentence, not from the recording.
    NOTE(review): because of that, the resulting prompt is presumably not
    conditioned on the uploaded voice; real cloning would need the audio to be
    encoded into semantic/coarse/fine tokens -- confirm the intended behavior.

    Parameters:
    audio_path (str): Path to the input audio file.
    Returns:
    str: Path to the generated .npz file. The file is created with
    delete=False, so the caller is responsible for removing it.
    Raises:
    ValueError: If the audio file decodes to zero samples.
    """
    # Load and resample audio to Bark's SAMPLE_RATE, mixed down to mono.
    audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    audio = audio.astype(np.float32)
    if audio.size == 0:
        raise ValueError("The voice sample contains no audio.")

    with torch.device("cpu"):
        # Generate semantic tokens from placeholder text. The previous
        # max_gen_len keyword is not passed any more: it is not a parameter
        # of generate_text_semantic and made the call fail.
        dummy_text = "Dummy text for history prompt generation."
        semantic_tokens = generate_text_semantic(
            text=dummy_text,
            temp=0.7,
            silent=True,
        )
        # Always hand a 1-D int64 array to the next stage.
        semantic_tokens = np.atleast_1d(
            np.asarray(semantic_tokens, dtype=np.int64)
        )
        # Coarse and fine prompts are per-codebook arrays, so they are
        # derived through Bark's own stages instead of reusing the 1-D
        # semantic tokens.
        coarse_tokens = generate_coarse(semantic_tokens, silent=True)
        fine_tokens = generate_fine(coarse_tokens, silent=True)

    # Create history prompt dictionary with the keys stored in the .npz file.
    history_prompt = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": np.asarray(coarse_tokens, dtype=np.int64),
        "fine_prompt": np.asarray(fine_tokens, dtype=np.int64),
    }

    # Save to a temporary .npz file that outlives this function.
    with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
        np.savez(temp_file.name, **history_prompt)
        npz_path = temp_file.name
    return npz_path
def generate_speech(reference_audio, text):
    """
    Generate speech audio with Bark using a history prompt built from the
    reference audio, and return the path of the resulting WAV file.

    Parameters:
    reference_audio (str): Filepath to the uploaded voice sample.
    text (str): Text to convert to speech.
    Returns:
    str: Path to the generated audio file.
    Raises:
    ValueError: If no voice sample was given, or the text is empty or
    consists only of whitespace.
    """
    if not reference_audio:
        raise ValueError("Please upload a voice sample.")
    # Whitespace-only input has nothing to synthesize; reject it up front
    # with the same message as empty input.
    if not text or not text.strip():
        raise ValueError("Please enter text to convert.")

    # Preprocess audio to create the temporary .npz history prompt.
    history_prompt = preprocess_audio_to_npz(reference_audio)
    try:
        # Generate speech using the processed history prompt.
        audio_array = generate_audio(text, history_prompt=history_prompt)
    finally:
        # The prompt file is only needed for this one call; remove it so
        # requests do not pile up temporary files. Cleanup is best-effort.
        try:
            os.remove(history_prompt)
        except OSError:
            pass

    # Save the audio to a temporary file. It is kept on disk (delete=False)
    # because the returned path is read after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        write_wav(temp_file.name, SAMPLE_RATE, audio_array)
        temp_file_path = temp_file.name
    return temp_file_path
# Assemble the Gradio UI: a heading, the two inputs, a trigger button and the
# audio player for the result.
# NOTE(review): the nesting below is reconstructed from a flattened source;
# the two inputs are assumed to share the row -- confirm against the original.
with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
    gr.Markdown(value="## Voice Cloning Text-to-Speech with Bark")
    gr.Markdown(
        value="Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!"
    )

    with gr.Row():
        audio_input = gr.Audio(
            label="Upload Your Voice Sample (English)",
            type="filepath",
            interactive=True,
        )
        text_input = gr.Textbox(
            placeholder="e.g., I love chocolate",
            label="Enter Text to Convert to Speech",
        )

    generate_btn = gr.Button(value="Generate Speech")
    audio_output = gr.Audio(interactive=False, label="Generated Speech")

    # Wire the button: both inputs go to generate_speech, and the file path it
    # returns is shown in the output player.
    generate_btn.click(
        generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output,
    )

# Start serving the interface.
app.launch()