# VocalForge-AI / app.py
# shukdevdatta123's picture
# Update app.py
# f75668a verified
# raw
# history blame
# 3.24 kB
# !pip install TTS gradio numpy librosa torch soundfile
from TTS.api import TTS
import gradio as gr
import numpy as np
import librosa
import torch
import tempfile
import os
import soundfile as sf # Added for better audio handling
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the TTS model once at import time and move it to the selected device.
# NOTE(review): the previous comment flagged the `.to(device)` call as
# problematic; presumably it depends on the installed TTS release exposing
# `.to()` on the API object -- confirm against the pinned version.
model_name = "tts_models/multilingual/multi-dataset/your_tts"
tts = TTS(model_name=model_name).to(device)
def process_audio(audio_path, max_duration=10):
    """Load a clip as 16 kHz mono audio, capped at ``max_duration`` seconds.

    Args:
        audio_path: Path of the audio file, handed to ``librosa.load``.
        max_duration: Maximum length to keep, in seconds.

    Returns:
        A ``(samples, sample_rate)`` tuple as returned by ``librosa.load``
        (requested with ``sr=16000``), with ``samples`` truncated to at most
        ``max_duration * sample_rate`` values.
    """
    # Let librosa stop reading after ``max_duration`` seconds so a long
    # upload is not fully decoded and resampled just to be thrown away.
    y, sr = librosa.load(
        audio_path,
        sr=16000,
        mono=True,
        duration=max_duration,
    )

    # Kept as a guard so the returned length never exceeds the cap.
    max_samples = int(max_duration * sr)
    if len(y) > max_samples:
        y = y[:max_samples]
    return y, sr
def generate_speech(audio_file, text):
    """Synthesize ``text`` using ``audio_file`` as the speaker reference.

    The reference clip is preprocessed with ``process_audio``, written to a
    temporary WAV file and passed to ``tts.tts_to_file`` together with the
    text. The synthesized audio is written to a second temporary WAV file.

    Args:
        audio_file: Path of the reference voice sample.
        text: Text to speak.

    Returns:
        Path of the generated WAV file, or ``None`` when an input is missing
        or any step fails (the error is printed).
    """
    # Nothing to synthesize without both a reference clip and some text.
    if not audio_file or not text or not str(text).strip():
        print("Error: a voice sample and non-empty text are required")
        return None

    # Only reserve the two file names here; the handles are closed on exit so
    # the paths can be reopened by soundfile and the TTS engine.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref_file, \
            tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
        ref_path = ref_file.name
        out_path = out_file.name

    try:
        # Preprocessing lives inside the try block so a bad upload is
        # reported the same way as a synthesis failure.
        y, sr = process_audio(audio_file)
        sf.write(ref_path, y, sr)

        tts.tts_to_file(
            text=text,
            speaker_wav=ref_path,
            language="en",
            file_path=out_path,
        )
        return out_path
    except Exception as e:
        print(f"Error: {e}")
        # The output file is useless after a failure; do not leave it behind.
        try:
            os.unlink(out_path)
        except OSError:
            pass
        return None
    finally:
        # The reference copy is never needed after this call, success or not.
        try:
            os.unlink(ref_path)
        except OSError:
            pass
# Gradio interface: inputs in the left column, generated audio in the right.
_example_rows = [
    ["examples/sample_voice.wav", "Hello! Welcome to the future of voice cloning technology"],
    ["examples/sample_voice2.wav", "This text is spoken in a completely cloned voice"],
]

with gr.Blocks(title="Voice Clone TTS") as demo:
    gr.Markdown("""
# 🎀 Voice Clone Text-to-Speech
1. Upload a short English voice sample (5-10 seconds)
2. Enter text you want to speak
3. Generate audio in your voice!
""")

    with gr.Row():
        # Left column: reference clip, text to speak and the trigger button.
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload Voice Sample",
                interactive=True,
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter English text here...",
                lines=4,
            )
            btn = gr.Button("Generate Speech", variant="primary")

        # Right column: the synthesized result plus a hidden info box that
        # is not wired to any event in this block.
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                interactive=False,
            )
            error_output = gr.Textbox(label="Processing Info", visible=False)

    # The examples and the button feed the same two inputs into generate_speech.
    _speech_inputs = [audio_input, text_input]

    # Example inputs; caching stays disabled, as in the original configuration.
    gr.Examples(
        examples=_example_rows,
        inputs=_speech_inputs,
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=False,
    )

    btn.click(
        fn=generate_speech,
        inputs=_speech_inputs,
        outputs=audio_output,
    )
if __name__ == "__main__":
    # Launch options are collected in one place: fixed port, share enabled.
    launch_kwargs = {"server_port": 7860, "share": True}
    demo.launch(**launch_kwargs)