Spaces:
Running
Running
File size: 4,401 Bytes
4760b00 4999708 523a466 4999708 ba92b2d e25f277 0a51a48 47d7c50 6d7fea6 f8f4a26 e25f277 6d7fea6 e25f277 6d7fea6 e25f277 4999708 4760b00 e25f277 6d7fea6 e25f277 523a466 e25f277 0a51a48 523a466 7666acf 523a466 e25f277 4999708 e25f277 4999708 e25f277 4999708 ba92b2d 4999708 e47cdda 4999708 e25f277 523a466 e25f277 e47cdda ba92b2d e47cdda 4999708 3220f5e 4999708 e25f277 3220f5e e25f277 3220f5e e25f277 4999708 523a466 e25f277 4999708 f74edeb 3220f5e 4999708 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import gradio as gr
from bark import SAMPLE_RATE, generate_audio, preload_models
from bark.generation import generate_text_semantic
from scipy.io.wavfile import write as write_wav
import tempfile
import librosa
import numpy as np
import torch
# Keep a reference to the genuine torch.load so it can be wrapped here and
# restored once the Bark checkpoints have been loaded.
original_load = torch.load


def custom_load(*args, **kwargs):
    """
    Call the original ``torch.load`` with ``weights_only`` forced to ``False``.

    Used as a temporary stand-in for ``torch.load`` so that checkpoints
    containing pickled objects other than plain tensors can still be loaded.

    Parameters:
    *args: Positional arguments forwarded unchanged to ``torch.load``.
    **kwargs: Keyword arguments forwarded to ``torch.load``; any
        ``weights_only`` value supplied by the caller is overridden.
    Returns:
    object: Whatever the original ``torch.load`` returns.
    """
    # SECURITY: weights_only=False lets the unpickler construct arbitrary
    # objects, which can execute code. Only use this on trusted checkpoints.
    kwargs["weights_only"] = False
    return original_load(*args, **kwargs)
# Monkey-patch torch.load only for the duration of model loading.
torch.load = custom_load
try:
    # Preload Bark models
    preload_models()
finally:
    # Always restore the original torch.load, even if preloading fails, so the
    # permissive loader never stays installed for the rest of the process.
    torch.load = original_load
def preprocess_audio_to_npz(audio_path):
    """
    Create a Bark .npz history prompt and return the path to it.

    The audio file is decoded to confirm it is readable and non-empty, then
    semantic, coarse and fine tokens are generated by Bark for a fixed
    placeholder sentence and saved under the keys ``semantic_prompt``,
    ``coarse_prompt`` and ``fine_prompt``.

    NOTE(review): the prompt tokens are NOT derived from the audio samples, so
    the resulting prompt does not carry the speaker's voice. Real voice cloning
    would need the tokens to be extracted from the recording itself.

    Parameters:
    audio_path (str): Path to the input audio file.
    Returns:
    str: Path to the generated .npz file. The file is created with
        ``delete=False``; the caller is responsible for removing it.
    Raises:
    ValueError: If the audio file decodes to zero samples.
    """
    # Local import: only this function needs the coarse/fine generators.
    from bark.generation import generate_coarse, generate_fine

    # Load and resample audio to Bark's SAMPLE_RATE; this doubles as a check
    # that the upload is a decodable audio file.
    audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    if audio.size == 0:
        raise ValueError("The voice sample contains no audio.")

    with torch.device("cpu"):
        dummy_text = "Dummy text for history prompt generation."
        # generate_text_semantic has no max_gen_len parameter; passing one
        # raises TypeError, so only supported arguments are used here.
        semantic_tokens = generate_text_semantic(
            text=dummy_text,
            temp=0.7,
            silent=True,
        )
        # Ensure semantic_tokens is a 1-D integer numpy array.
        semantic_tokens = np.atleast_1d(np.asarray(semantic_tokens, dtype=np.int64))
        # Coarse and fine prompts are multi-codebook 2-D arrays, so they must
        # be produced by Bark's own coarse/fine stages rather than by reusing
        # the 1-D semantic tokens.
        coarse_tokens = generate_coarse(
            semantic_tokens,
            silent=True,
            use_kv_caching=True,
        )
        fine_tokens = generate_fine(coarse_tokens, silent=True)

    # Create history prompt dictionary
    history_prompt = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse_tokens,
        "fine_prompt": fine_tokens,
    }

    # Save to a temporary .npz file. Writing through the open handle avoids
    # re-opening the file by name while it is still open, which is not
    # possible on every platform.
    with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
        np.savez(temp_file, **history_prompt)
        npz_path = temp_file.name
    return npz_path
def generate_speech(reference_audio, text):
    """
    Generate speech audio with Bark for the given text and reference sample.

    Parameters:
    reference_audio (str): Filepath to the uploaded voice sample.
    text (str): Text to convert to speech.
    Returns:
    str: Path to the generated .wav file (created with ``delete=False``).
    Raises:
    ValueError: If no voice sample was provided, or if the text is missing,
        empty, or contains only whitespace.
    """
    # Local import: only needed for removing the intermediate prompt file.
    import os

    if not reference_audio:
        raise ValueError("Please upload a voice sample.")
    # Whitespace-only input carries nothing to synthesize, so reject it too.
    if not text or not text.strip():
        raise ValueError("Please enter text to convert.")

    # Preprocess audio to create .npz history prompt
    history_prompt = preprocess_audio_to_npz(reference_audio)
    try:
        # Generate speech using the processed history prompt
        audio_array = generate_audio(text, history_prompt=history_prompt)
    finally:
        # The prompt file is only needed during generation; remove it so that
        # repeated requests do not accumulate temporary files.
        try:
            os.remove(history_prompt)
        except OSError:
            # Best-effort cleanup: a failed delete must not mask the result
            # (or the original error) of the generation step.
            pass

    # Save the audio to a temporary file, writing through the open handle
    # instead of re-opening the file by name while it is still open.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        write_wav(temp_file, SAMPLE_RATE, audio_array)
        temp_file_path = temp_file.name
    return temp_file_path
# Build the Gradio interface: a voice-sample upload and a text box side by
# side, a button that triggers generation, and a player for the result.
with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
    gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
    gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")

    with gr.Row():
        # type="filepath" makes Gradio pass the upload to the handler as a
        # path string, which is what generate_speech expects.
        audio_input = gr.Audio(
            type="filepath",
            label="Upload Your Voice Sample (English)",
            interactive=True
        )
        text_input = gr.Textbox(
            label="Enter Text to Convert to Speech",
            placeholder="e.g., I love chocolate"
        )

    generate_btn = gr.Button("Generate Speech")
    audio_output = gr.Audio(label="Generated Speech", interactive=False)

    # Connect the button to the generation function; the returned .wav path
    # is handed to the output audio component.
    generate_btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output
    )

# Launch the application
app.launch()