Spaces:

shukdevdatta123
/

VocalForge-AI

Running

App Files Files Community

VocalForge-AI / app.py

shukdevdatta123

Update app.py

c120dc7 verified about 1 month ago

raw

history blame

4.49 kB

	import gradio as gr
	from bark import SAMPLE_RATE, generate_audio, preload_models
	from bark.generation import generate_text_semantic
	from scipy.io.wavfile import write as write_wav
	import tempfile
	import librosa
	import numpy as np
	import torch

	# Keep a handle on the real torch.load: the wrapper below delegates to it,
	# and it is what gets reinstated once the Bark checkpoints are loaded.
	original_load = torch.load


	def custom_load(*args, **kwargs):
	    """Call the original ``torch.load`` with ``weights_only`` forced to False.

	    All positional and keyword arguments are forwarded unchanged, except that
	    any caller-supplied ``weights_only`` value is overridden.

	    NOTE: ``weights_only=False`` lets ``torch.load`` unpickle arbitrary
	    objects, so this wrapper must only be used on trusted checkpoint files.

	    Returns:
	    Whatever the original ``torch.load`` returns for the given arguments.
	    """
	    # The parameters must be *args/**kwargs: with plain ``args, *kwargs`` the
	    # second name is a tuple and the assignment below raises TypeError.
	    kwargs['weights_only'] = False
	    return original_load(*args, **kwargs)

	# Temporarily swap torch.load for the permissive wrapper while Bark loads its
	# checkpoints. The try/finally guarantees the original loader is reinstated
	# even when preloading fails, so the relaxed weights_only setting cannot leak
	# into later torch.load calls made by this process.
	torch.load = custom_load
	try:
	    # Preload Bark models
	    preload_models()
	finally:
	    # Restore the original torch.load
	    torch.load = original_load

	def preprocess_audio_to_npz(audio_path):
	    """
	    Build a .npz history prompt for Bark and return the path to it.

	    The file stores three arrays under the keys "semantic_prompt",
	    "coarse_prompt" and "fine_prompt". The semantic tokens come from
	    generate_text_semantic on a fixed dummy sentence; the coarse and fine
	    tokens are then derived from them with Bark's own generate_coarse and
	    generate_fine, so that they have the codebook layout Bark produces itself
	    (the previous 1-D slices of the semantic tokens did not).

	    NOTE(review): the reference audio is decoded (so an unreadable file still
	    raises here) but its content is not used to derive any token. The prompt
	    therefore does not carry the speaker's voice; doing so would need an
	    audio-to-token encoder that this file does not import.

	    Parameters:
	    audio_path (str): Path to the input audio file.

	    Returns:
	    str: Path to the generated .npz file. The file is created with
	    delete=False, so the caller is responsible for removing it.
	    """
	    # Local import keeps this block self-contained; the module is the same one
	    # the file already imports generate_text_semantic from.
	    from bark.generation import generate_coarse, generate_fine

	    # Decode and resample to Bark's SAMPLE_RATE; kept only for its validation
	    # effect (see NOTE above), hence the result is discarded.
	    librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)

	    with torch.device("cpu"):
	        # Semantic tokens for a fixed sentence.
	        semantic_tokens = generate_text_semantic(
	            text="Dummy text for history prompt generation.",
	            temp=0.7,
	            silent=True,
	        )
	        # Normalise to a flat int64 vector before handing it on.
	        semantic_tokens = np.asarray(semantic_tokens, dtype=np.int64).reshape(-1)

	        # Derive the acoustic token layers from the semantic tokens instead
	        # of faking them by slicing the semantic stream.
	        coarse_tokens = generate_coarse(semantic_tokens, temp=0.7, silent=True)
	        fine_tokens = generate_fine(coarse_tokens, temp=0.5, silent=True)

	    # Reserve a unique file name, close the handle, then write by name so the
	    # file is never open twice (which fails on some platforms).
	    with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
	        npz_path = temp_file.name
	    np.savez(
	        npz_path,
	        semantic_prompt=semantic_tokens,
	        coarse_prompt=coarse_tokens,
	        fine_prompt=fine_tokens,
	    )

	    return npz_path

	def generate_speech(reference_audio, text):
	    """
	    Generate speech for ``text`` with Bark, using a history prompt built from
	    the reference audio, and return the path of the resulting .wav file.

	    Parameters:
	    reference_audio (str): Filepath to the uploaded voice sample.
	    text (str): Text to convert to speech.

	    Returns:
	    str: Path to the generated audio file. The file is created with
	    delete=False so that it still exists when the caller reads it.

	    Raises:
	    ValueError: If no reference audio was supplied, or if the text is empty
	    or consists only of whitespace.
	    """
	    # Local imports: only needed for the best-effort cleanup below.
	    import contextlib
	    import os

	    if not reference_audio:
	        raise ValueError("Please upload a voice sample.")
	    # Whitespace-only input is treated the same as no input at all.
	    if not text or not text.strip():
	        raise ValueError("Please enter text to convert.")

	    # Preprocess audio to create .npz history prompt
	    history_prompt = preprocess_audio_to_npz(reference_audio)
	    try:
	        # Generate speech using the processed history prompt
	        audio_array = generate_audio(text, history_prompt=history_prompt)
	    finally:
	        # The prompt file is single-use; remove it so repeated requests do not
	        # pile up temp files. Failure to delete must not mask the real result
	        # or error, hence the deliberate suppress.
	        with contextlib.suppress(OSError):
	            os.remove(history_prompt)

	    # Reserve a unique name, close the handle, then let scipy write by name.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
	        temp_file_path = temp_file.name
	    write_wav(temp_file_path, SAMPLE_RATE, audio_array)

	    return temp_file_path

	# --- Gradio UI -------------------------------------------------------------
	# Layout: heading and instructions, a row holding the two inputs (voice
	# sample and text), then the trigger button and the audio player for the
	# result.
	with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
	    gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
	    gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")

	    with gr.Row():
	        # The sample is handed to the callback as a path on disk.
	        audio_input = gr.Audio(label="Upload Your Voice Sample (English)", type="filepath", interactive=True)
	        text_input = gr.Textbox(placeholder="e.g., I love chocolate", label="Enter Text to Convert to Speech")

	    generate_btn = gr.Button("Generate Speech")
	    audio_output = gr.Audio(label="Generated Speech", interactive=False)

	    # Clicking the button runs generate_speech(audio_input, text_input) and
	    # shows the returned file in the output player.
	    generate_btn.click(generate_speech, inputs=[audio_input, text_input], outputs=audio_output)

	# Start serving the interface (with a public share link requested).
	app.launch(share=True)