# Spaces: Running
# !pip install TTS gradio numpy librosa torch soundfile | |
from TTS.api import TTS | |
import gradio as gr | |
import numpy as np | |
import librosa | |
import torch | |
import tempfile | |
import os | |
import soundfile as sf # Added for better audio handling | |
# Identifier of the pretrained model handed to the TTS constructor.
model_name = "tts_models/multilingual/multi-dataset/your_tts"

# Run on the GPU when torch reports CUDA support; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the synthesizer once at import time and move it to the chosen device.
# NOTE(review): the original author flagged this call as "the problem" without
# saying why -- presumably `.to()` is unavailable on the installed TTS release
# or the model download fails; confirm before relying on it.
tts = TTS(model_name=model_name).to(device)
def process_audio(audio_path, max_duration=10):
    """Read an audio file as 16 kHz mono and cap its length.

    Args:
        audio_path: Path of the audio file passed to ``librosa.load``.
        max_duration: Upper bound on the returned clip length, in seconds.

    Returns:
        A ``(samples, sample_rate)`` tuple. ``samples`` is cut down to its
        first ``max_duration * sample_rate`` entries when the recording is
        longer than that; otherwise it is returned as loaded.
    """
    samples, sample_rate = librosa.load(audio_path, sr=16000, mono=True)
    limit = max_duration * sample_rate
    if len(samples) > limit:
        # int() because the limit is a float when max_duration is fractional.
        return samples[:int(limit)], sample_rate
    return samples, sample_rate
def _reserve_wav_path():
    """Create an empty ``.wav`` temp file and return its path (caller deletes it)."""
    # delete=False keeps the file on disk after the handle is closed, so the
    # path can be reopened by name by the audio writer / synthesizer.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        return handle.name


def _discard_file(path):
    """Best-effort removal of ``path``; ``None`` and already-missing files are ignored."""
    if path is None:
        return
    try:
        os.unlink(path)
    except OSError:
        pass


def generate_speech(audio_file, text):
    """Synthesize ``text`` in the voice of the speaker heard in ``audio_file``.

    The reference recording is trimmed via ``process_audio``, written to a
    temporary wav file, and passed as ``speaker_wav`` to the module-level
    ``tts`` model with the language fixed to ``"en"``.

    Args:
        audio_file: Path to the reference voice sample.
        text: Text to be spoken.

    Returns:
        Path of the generated wav file, or ``None`` when an input is missing
        or any processing step fails (the error is printed). The caller owns
        the returned file.
    """
    # Reject unusable input before allocating any temp files.
    if not audio_file or not text or not text.strip():
        print("Error: a voice sample and non-empty text are required")
        return None

    ref_path = out_path = None
    try:
        ref_path = _reserve_wav_path()
        out_path = _reserve_wav_path()

        # Process reference audio and store it where the model can read it.
        y, sr = process_audio(audio_file)
        sf.write(ref_path, y, sr)

        # Generate speech
        tts.tts_to_file(
            text=text,
            speaker_wav=ref_path,
            language="en",
            file_path=out_path,
        )
    except Exception as e:
        print(f"Error: {e}")
        # No usable output exists on failure, so do not leave the file behind.
        _discard_file(out_path)
        return None
    finally:
        # The trimmed reference copy is never needed once synthesis is over,
        # whether it succeeded or not.
        _discard_file(ref_path)
    return out_path
# Gradio interface: voice sample and text on the left, generated audio on the
# right, with example rows and the button both routed to generate_speech.
with gr.Blocks(title="Voice Clone TTS") as demo:
    # Markdown text is kept flush-left so the string content does not pick up
    # the surrounding code indentation.
    gr.Markdown("""
# 🎤 Voice Clone Text-to-Speech
1. Upload a short English voice sample (5-10 seconds)
2. Enter text you want to speak
3. Generate audio in your voice!
""")

    with gr.Row():
        with gr.Column():
            # type="filepath" hands generate_speech a path rather than raw samples.
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload Voice Sample",
                interactive=True,
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter English text here...",
                lines=4,
            )
            btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                interactive=False,
            )
            # Hidden status box; nothing in this section writes to it.
            error_output = gr.Textbox(label="Processing Info", visible=False)

    # Example inputs
    gr.Examples(
        examples=[
            ["examples/sample_voice.wav", "Hello! Welcome to the future of voice cloning technology"],
            ["examples/sample_voice2.wav", "This text is spoken in a completely cloned voice"],
        ],
        inputs=[audio_input, text_input],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=False,  # Disabled cache to avoid potential issues
    )

    btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output,
    )
if __name__ == "__main__":
    # Start the web server only when this file is executed as a script:
    # fixed port 7860, with Gradio's `share` option switched on.
    demo.launch(
        share=True,
        server_port=7860,
    )