VocalForge-AI / app.py
shukdevdatta123's picture
Update app.py
c120dc7 verified
raw
history blame
4.49 kB
import os
import tempfile

import gradio as gr
import librosa
import numpy as np
import torch
from bark import SAMPLE_RATE, generate_audio, preload_models
from bark.generation import generate_text_semantic
from scipy.io.wavfile import write as write_wav
# Keep a handle on the real torch.load so it can be restored after preloading.
original_load = torch.load


def custom_load(*args, **kwargs):
    """Call the original ``torch.load`` with ``weights_only`` forced to False.

    Any ``weights_only`` value supplied by the caller is overridden.

    SECURITY: ``weights_only=False`` lets the checkpoint unpickle arbitrary
    objects, so this wrapper must only ever be used for trusted model files.
    """
    kwargs['weights_only'] = False
    return original_load(*args, **kwargs)


# Monkey-patch torch.load only while the Bark models are being preloaded.
# The try/finally guarantees the real torch.load is put back even when
# preload_models() raises, so the permissive loader never leaks into the
# rest of the process.
torch.load = custom_load
try:
    preload_models()
finally:
    torch.load = original_load
def preprocess_audio_to_npz(audio_path):
    """
    Build a Bark ``.npz`` history prompt and return the path to it.

    The reference audio is loaded and resampled, which fails early on an
    unreadable file, but the tokens written to the prompt are generated from
    a fixed dummy sentence; they are NOT derived from the audio content.

    Parameters:
        audio_path (str): Path to the input audio file.

    Returns:
        str: Path to the generated .npz file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    # Decode and resample to Bark's SAMPLE_RATE as mono. The samples are not
    # used below; the call is kept so that a bad upload still raises here.
    librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)

    with torch.device("cpu"):
        # Generate semantic tokens from a placeholder sentence.
        semantic_tokens = generate_text_semantic(
            text="Dummy text for history prompt generation.",
            temp=0.7,
            silent=True,
        )

    # Normalise to a 1-D int64 array regardless of what came back.
    semantic_tokens = np.asarray(semantic_tokens, dtype=np.int64).ravel()

    # Stand-ins for the coarse and fine prompts: the first 256 semantic tokens,
    # copied so the three arrays do not share memory.
    # NOTE(review): these are not real codec codes. Bark's coarse/fine stages
    # may expect 2-D per-codebook arrays with values inside the codec's
    # codebook range -- confirm against bark.generation before relying on it.
    coarse_tokens = semantic_tokens[:256].copy()
    fine_tokens = coarse_tokens.copy()

    history_prompt = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse_tokens,
        "fine_prompt": fine_tokens,
    }

    # Reserve a unique path and close the handle *before* NumPy writes to it:
    # reopening a still-open NamedTemporaryFile by name fails on Windows.
    with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
        npz_path = temp_file.name
    np.savez(npz_path, **history_prompt)
    return npz_path
def generate_speech(reference_audio, text):
    """
    Generate speech for ``text`` with Bark, using a history prompt built from
    the reference audio upload.

    Parameters:
        reference_audio (str): Filepath to the uploaded voice sample.
        text (str): Text to convert to speech.

    Returns:
        str: Path to the generated .wav file. The file is created with
        ``delete=False`` and is left on disk for the caller to serve.

    Raises:
        ValueError: If no voice sample was provided, or if the text is empty
            or consists only of whitespace.
    """
    if not reference_audio:
        raise ValueError("Please upload a voice sample.")
    # Whitespace-only input carries nothing to synthesise, so treat it the
    # same as an empty string instead of running the models on it.
    if not text or not text.strip():
        raise ValueError("Please enter text to convert.")

    # Preprocess audio to create the temporary .npz history prompt.
    history_prompt = preprocess_audio_to_npz(reference_audio)
    try:
        # Generate speech using the processed history prompt.
        audio_array = generate_audio(text, history_prompt=history_prompt)
    finally:
        # The prompt file is only needed during generation. Removal is
        # deliberately best-effort: a failed cleanup must not mask the
        # generation result or the original exception.
        try:
            os.remove(history_prompt)
        except OSError:
            pass

    # Reserve a unique path and close the handle before SciPy writes to it:
    # reopening a still-open NamedTemporaryFile by name fails on Windows.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_file_path = temp_file.name
    write_wav(temp_file_path, SAMPLE_RATE, audio_array)
    return temp_file_path
# Assemble the Gradio UI: a row holding the two inputs, then the trigger
# button and the audio player for the result.
# NOTE(review): the nesting below is inferred; confirm which components
# belong inside the Row.
with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
    gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
    gr.Markdown(
        "Upload a short voice sample in English (5-10 seconds recommended), "
        "then enter text to hear it in your voice!"
    )

    with gr.Row():
        # The upload is handed to the callback as a path on disk.
        audio_input = gr.Audio(
            label="Upload Your Voice Sample (English)",
            type="filepath",
            interactive=True,
        )
        text_input = gr.Textbox(
            placeholder="e.g., I love chocolate",
            label="Enter Text to Convert to Speech",
        )

    generate_btn = gr.Button("Generate Speech")
    audio_output = gr.Audio(interactive=False, label="Generated Speech")

    # Clicking the button runs generate_speech on both inputs and routes the
    # returned file path to the output player.
    generate_btn.click(generate_speech, [audio_input, text_input], audio_output)

# Start the server, requesting a public share link.
app.launch(share=True)