Spaces:
Running
Running
File size: 3,235 Bytes
f75668a 4760b00 f74edeb 4760b00 3220f5e 4760b00 ba92b2d f75668a f8f4a26 4760b00 f8f4a26 f75668a 4760b00 f75668a 4760b00 ba92b2d 4760b00 f75668a ba92b2d 4760b00 3220f5e 4760b00 3220f5e 4760b00 3220f5e 4760b00 f75668a 4760b00 3220f5e 4760b00 f74edeb 3220f5e 4760b00 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# !pip install TTS gradio numpy librosa torch soundfile
from TTS.api import TTS
import gradio as gr
import numpy as np
import librosa
import torch
import tempfile
import os
import soundfile as sf # Added for better audio handling
# Pick the compute device: use the GPU when torch reports CUDA, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the multilingual YourTTS voice-cloning model once at import time and
# move it onto the selected device.
# NOTE(review): the original author flagged the `.to(device)` call below as
# "the problem"; presumably it depends on the installed TTS release exposing
# `.to()` -- confirm against the pinned TTS version.
model_name = "tts_models/multilingual/multi-dataset/your_tts"
tts = TTS(model_name=model_name)
tts = tts.to(device)
def process_audio(audio_path, max_duration=10):
    """Load an audio file and keep at most ``max_duration`` seconds of it.

    The file is read through ``librosa.load`` with ``sr=16000`` and
    ``mono=True``.

    Args:
        audio_path: Path of the audio file to read.
        max_duration: Maximum length to keep, in seconds.

    Returns:
        A ``(samples, sample_rate)`` tuple; ``samples`` is cut to the first
        ``max_duration * sample_rate`` samples when the clip is longer.
    """
    waveform, sample_rate = librosa.load(audio_path, sr=16000, mono=True)
    sample_cap = max_duration * sample_rate

    # Short clips are returned untouched.
    if len(waveform) <= sample_cap:
        return waveform, sample_rate

    # int() because max_duration may be fractional and slices need integers.
    return waveform[:int(sample_cap)], sample_rate
def _remove_quietly(path):
    """Delete ``path``, ignoring files that are already gone or locked."""
    try:
        os.unlink(path)
    except OSError:
        pass


def generate_speech(audio_file, text):
    """Synthesize ``text`` in the voice of the speaker heard in ``audio_file``.

    The reference recording is trimmed via ``process_audio``, written to a
    temporary WAV file and handed to the module-level ``tts`` model as the
    speaker sample. Temporary files are removed on every exit path except
    the successful output, which must outlive this call so the caller can
    read it.

    Args:
        audio_file: Path of the uploaded or recorded voice sample.
        text: English text to speak.

    Returns:
        Path of the generated WAV file, or ``None`` if synthesis failed.

    Raises:
        gr.Error: If no voice sample or no text was supplied.
    """
    # Fail early with a message the UI can show, before any temp file exists.
    if not audio_file:
        raise gr.Error("Please upload or record a voice sample first.")
    if not text or not text.strip():
        raise gr.Error("Please enter the text you want to speak.")

    # Reserve two temp paths; the handles are closed on leaving the `with`
    # so the files can be reopened by name for writing below.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref_file, \
            tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
        ref_path = ref_file.name
        out_path = out_file.name

    succeeded = False
    try:
        # Process reference audio (soundfile is used for writing the WAV).
        y, sr = process_audio(audio_file)
        sf.write(ref_path, y, sr)

        # Generate speech; a synthesis failure is reported as None, matching
        # the best-effort behavior callers already rely on.
        try:
            tts.tts_to_file(
                text=text,
                speaker_wav=ref_path,
                language="en",
                file_path=out_path,
            )
        except Exception as e:
            print(f"Error: {e}")
            return None

        succeeded = True
        return out_path
    finally:
        # The reference copy is never needed after synthesis; the output
        # file is kept only when it actually holds generated audio.
        _remove_quietly(ref_path)
        if not succeeded:
            _remove_quietly(out_path)
# Gradio interface: two-column layout with inputs on the left and the
# generated audio on the right.
# NOTE(review): the nesting below was reconstructed from a flattened source
# based on component order -- confirm it matches the intended layout.
with gr.Blocks(title="Voice Clone TTS") as demo:
    # Usage instructions shown at the top of the page.
    gr.Markdown(
        "\n"
        "# 🎤 Voice Clone Text-to-Speech\n"
        "1. Upload a short English voice sample (5-10 seconds)\n"
        "2. Enter text you want to speak\n"
        "3. Generate audio in your voice!\n"
    )

    with gr.Row():
        # Left column: reference voice, text to speak, and the trigger button.
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload Voice Sample",
                interactive=True,
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter English text here...",
                lines=4,
            )
            btn = gr.Button("Generate Speech", variant="primary")

        # Right column: playback of the result, plus a hidden info box that
        # no handler in this block writes to.
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                interactive=False,
            )
            error_output = gr.Textbox(label="Processing Info", visible=False)

    # Example inputs; caching is disabled to avoid potential issues.
    gr.Examples(
        examples=[
            ["examples/sample_voice.wav", "Hello! Welcome to the future of voice cloning technology"],
            ["examples/sample_voice2.wav", "This text is spoken in a completely cloned voice"],
        ],
        inputs=[audio_input, text_input],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=False,
    )

    # Clicking the button runs synthesis and routes the file to the player.
    btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output,
    )
if __name__ == "__main__":
    # Serve the app on port 7860 and request a public share link.
    demo.launch(server_port=7860, share=True)