import gradio as gr
from bark import SAMPLE_RATE, generate_audio, preload_models
from bark.generation import generate_text_semantic
from scipy.io.wavfile import write as write_wav
import tempfile
import librosa
import numpy as np
import torch
# Save the original torch.load function
original_load = torch.load
# Define a custom load function to bypass the weights_only=True default
# (used by newer PyTorch releases), which breaks loading Bark's checkpoints
def custom_load(*args, **kwargs):
    kwargs['weights_only'] = False
    return original_load(*args, **kwargs)
# Monkey-patch torch.load
torch.load = custom_load
# Preload Bark models
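# (On the first run this downloads the Bark checkpoints, which can take several minutes.)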
preload_models()
# Restore the original torch.load
torch.load = original_load
def preprocess_audio_to_npz(audio_path):
"""
Preprocess an audio file to create a .npz history prompt for voice cloning.
Parameters:
audio_path (str): Path to the input audio file.
Returns:
str: Path to the generated .npz file.
"""
    # Load and resample audio to Bark's SAMPLE_RATE (24 kHz)
    audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    # Keep the audio as float32 (loaded for potential future use; the simulated
    # token generation below does not read it)
    audio = audio.astype(np.float32)

    with torch.device("cpu"):
        # Generate semantic tokens with generate_text_semantic; generation
        # length and early stopping are left at the library defaults
        dummy_text = "Dummy text for history prompt generation."
        semantic_tokens = generate_text_semantic(
            text=dummy_text,
            temp=0.7,
            silent=True
        )
    # Ensure semantic_tokens is a 1D numpy array of int64
    semantic_tokens = np.array(semantic_tokens, dtype=np.int64)
    if semantic_tokens.ndim != 1:
        semantic_tokens = semantic_tokens.flatten()

    # Simulate coarse tokens. Bark expects "coarse_prompt" to be a 2D array of
    # shape (2, T_coarse) with values in the EnCodec codebook range, where
    # T_coarse is roughly 1.5x the semantic length (75 Hz coarse rate vs
    # ~49.9 Hz semantic rate), so the simulated tokens are sized and clipped
    # to match that layout.
    coarse_len = int(round(len(semantic_tokens) * 75 / 49.9))
    coarse_row = np.resize(semantic_tokens, coarse_len) % 1024
    coarse_tokens = np.vstack([coarse_row, coarse_row]).astype(np.int64)

    # Simulate fine tokens. Bark expects "fine_prompt" to have shape (8, T)
    # with values in the same codebook range; repeating the coarse rows is a
    # simplified assumption, not real codec tokens.
    fine_tokens = np.vstack([coarse_tokens] * 4).astype(np.int64)
    # Create the history prompt dictionary
    history_prompt = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse_tokens,
        "fine_prompt": fine_tokens
    }

    # Save to a temporary .npz file
    with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
        np.savez(temp_file.name, **history_prompt)
        npz_path = temp_file.name
    return npz_path
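
# Example of exercising the preprocessing step on its own (the file name
# below is purely illustrative):
#   npz_path = preprocess_audio_to_npz("my_voice_sample.wav")
#   cloned_audio = generate_audio("Hello there!", history_prompt=npz_path)
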
def generate_speech(reference_audio, text):
"""
Generate speech audio mimicking the voice from the reference audio using Bark.
Parameters:
reference_audio (str): Filepath to the uploaded voice sample.
text (str): Text to convert to speech.
Returns:
str: Path to the generated audio file.
"""
    if not reference_audio:
        raise ValueError("Please upload a voice sample.")
    if not text:
        raise ValueError("Please enter text to convert.")

    # Preprocess audio to create the .npz history prompt
    history_prompt = preprocess_audio_to_npz(reference_audio)

    # Generate speech using the processed history prompt
    audio_array = generate_audio(text, history_prompt=history_prompt)

    # Save the audio to a temporary file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        write_wav(temp_file.name, SAMPLE_RATE, audio_array)
        temp_file_path = temp_file.name
    return temp_file_path

# Build the Gradio interface
with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")
with gr.Row():
audio_input = gr.Audio(
type="filepath",
label="Upload Your Voice Sample (English)",
interactive=True
)
text_input = gr.Textbox(
label="Enter Text to Convert to Speech",
placeholder="e.g., I love chocolate"
)
generate_btn = gr.Button("Generate Speech")
audio_output = gr.Audio(label="Generated Speech", interactive=False)
# Connect the button to the generation function
generate_btn.click(
fn=generate_speech,
inputs=[audio_input, text_input],
outputs=audio_output
)
# Launch the application
app.launch(share=True)