Spaces:

shukdevdatta123
/

VocalForge-AI

Running

App Files Files Community

VocalForge-AI / app.py

shukdevdatta123

Update app.py

f75668a verified about 1 month ago

raw

history blame

3.24 kB

	# !pip install TTS gradio numpy librosa torch soundfile

	from TTS.api import TTS
	import gradio as gr
	import numpy as np
	import librosa
	import torch
	import tempfile
	import os
	import soundfile as sf # Added for better audio handling

	# Pick the compute device: use CUDA when PyTorch reports it, else the CPU.
	device = "cpu"
	if torch.cuda.is_available():
	    device = "cuda"

	# Build the TTS object for the checkpoint named below and move it onto the
	# selected device. This runs at import time, so the model is set up once
	# and shared by every request.
	# NOTE(review): the original author flagged this statement as "the problem"
	# without saying why -- presumably `.to()` depends on the installed TTS
	# release; confirm against the pinned version.
	model_name = "tts_models/multilingual/multi-dataset/your_tts"
	tts = TTS(model_name=model_name).to(device)

	def process_audio(audio_path, max_duration=10):
	    """Read an audio file and keep at most its first ``max_duration`` seconds.

	    The file is loaded through ``librosa.load`` with ``sr=16000`` and
	    ``mono=True``; anything past the duration limit is dropped.

	    Args:
	        audio_path: Location of the audio file, passed to ``librosa.load``.
	        max_duration: Upper bound on the returned clip length, in seconds.

	    Returns:
	        A ``(samples, sample_rate)`` tuple as produced by ``librosa.load``,
	        with ``samples`` truncated when it exceeded the limit.
	    """
	    samples, rate = librosa.load(audio_path, sr=16000, mono=True)

	    # Convert the time limit into a sample count at the loaded rate.
	    limit = max_duration * rate
	    clipped = samples[:int(limit)] if len(samples) > limit else samples
	    return clipped, rate

	def _remove_quietly(path):
	    """Delete ``path``, ignoring the case where it is already gone or locked."""
	    try:
	        os.unlink(path)
	    except OSError:
	        pass


	def generate_speech(audio_file, text):
	    """Synthesize ``text`` in the voice of the speaker heard in ``audio_file``.

	    The reference recording is trimmed by ``process_audio``, written to a
	    temporary WAV file, and handed to ``tts.tts_to_file`` as ``speaker_wav``.

	    Args:
	        audio_file: Path to the reference voice recording.
	        text: The text to speak.

	    Returns:
	        Path to the generated WAV file, or ``None`` when an input is missing
	        or any processing step fails. The returned file is not deleted here;
	        the caller owns it.
	    """
	    # Reject missing inputs up front instead of failing inside the loader or
	    # the model with a less helpful error.
	    if not audio_file or not text or not text.strip():
	        print("Error: a reference audio file and non-empty text are required")
	        return None

	    # Reserve two unique file names. The handles are closed as soon as the
	    # names are captured so other libraries can reopen the paths for writing
	    # (an open NamedTemporaryFile cannot be reopened by name on Windows).
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref_file, \
	            tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
	        ref_path = ref_file.name
	        out_path = out_file.name

	    try:
	        # Process reference audio
	        y, sr = process_audio(audio_file)
	        sf.write(ref_path, y, sr)

	        # Generate speech
	        tts.tts_to_file(
	            text=text,
	            speaker_wav=ref_path,
	            language="en",
	            file_path=out_path,
	        )
	    except Exception as e:
	        # UI boundary: report the failure and signal it with None, but do
	        # not leave the unused output file behind.
	        print(f"Error: {e}")
	        _remove_quietly(out_path)
	        return None
	    finally:
	        # The reference copy is only needed during synthesis; remove it on
	        # success and on failure alike.
	        _remove_quietly(ref_path)

	    return out_path

	# Gradio interface: inputs on one side, the synthesized result on the other.
	with gr.Blocks(title="Voice Clone TTS") as demo:
	    gr.Markdown("""
	# 🎤 Voice Clone Text-to-Speech
	1. Upload a short English voice sample (5-10 seconds)
	2. Enter text you want to speak
	3. Generate audio in your voice!
	""")

	    with gr.Row():
	        # Input side: reference voice, text to speak, and the trigger button.
	        with gr.Column():
	            audio_input = gr.Audio(
	                label="Upload Voice Sample",
	                type="filepath",
	                sources=["upload", "microphone"],
	                interactive=True,
	            )
	            text_input = gr.Textbox(
	                lines=4,
	                label="Text to Speak",
	                placeholder="Enter English text here...",
	            )
	            btn = gr.Button("Generate Speech", variant="primary")

	        # Output side: player for the generated audio.
	        with gr.Column():
	            audio_output = gr.Audio(interactive=False, label="Generated Speech")
	            # Hidden textbox; no handler in this block writes to it.
	            error_output = gr.Textbox(visible=False, label="Processing Info")

	    # Clickable sample rows that fill in both inputs.
	    gr.Examples(
	        fn=generate_speech,
	        inputs=[audio_input, text_input],
	        outputs=audio_output,
	        examples=[
	            ["examples/sample_voice.wav", "Hello! Welcome to the future of voice cloning technology"],
	            ["examples/sample_voice2.wav", "This text is spoken in a completely cloned voice"],
	        ],
	        cache_examples=False,  # caching left off, as in the original setup
	    )

	    # Run synthesis when the button is pressed.
	    btn.click(
	        generate_speech,
	        inputs=[audio_input, text_input],
	        outputs=audio_output,
	    )

	if __name__ == "__main__":
	    # Start the web server only when executed as a script: listen on port
	    # 7860 and request a public share link.
	    demo.launch(share=True, server_port=7860)