|
|
|
|
|
from TTS.api import TTS |
|
import gradio as gr |
|
import numpy as np |
|
import librosa |
|
import torch |
|
import tempfile |
|
import os |
|
|
|
|
|
# Run inference on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Identifier of the TTS model to load; the model object is created once at
# import time and then moved to the selected device.
model_name = "tts_models/multilingual/multi-dataset/your_tts"
tts = TTS(model_name=model_name).to(device)
|
|
|
def process_audio(audio_path, max_duration=10):
    """Load an audio file as 16 kHz mono and cap its length.

    Args:
        audio_path: Path of the audio file handed to ``librosa.load``.
        max_duration: Maximum length to keep, in seconds.

    Returns:
        A ``(samples, sample_rate)`` tuple; ``samples`` holds at most
        ``max_duration * sample_rate`` samples.
    """
    samples, sample_rate = librosa.load(audio_path, sr=16000, mono=True)
    sample_limit = max_duration * sample_rate

    # Short clips are returned untouched.
    if len(samples) <= sample_limit:
        return samples, sample_rate

    # Keep only the leading part of longer clips.
    return samples[:int(sample_limit)], sample_rate
|
|
|
def _write_wav_pcm16(path, samples, sample_rate):
    """Write mono float samples (expected in [-1, 1]) to ``path`` as 16-bit PCM WAV."""
    # Local import: only this helper needs the stdlib WAV writer.
    import wave

    # Clip before scaling so out-of-range samples saturate instead of wrapping.
    pcm = (np.clip(np.asarray(samples), -1.0, 1.0) * 32767.0).astype("<i2")
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 2 bytes per sample = 16-bit
        wav_file.setframerate(int(sample_rate))
        wav_file.writeframes(pcm.tobytes())


def _remove_file(path):
    """Delete ``path``; a file that is already gone is not an error."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass


def generate_speech(audio_file, text):
    """Synthesize ``text`` in the voice of the speaker heard in ``audio_file``.

    The reference recording is trimmed by ``process_audio``, written to a
    temporary WAV file and passed to the module-level ``tts`` model as
    ``speaker_wav`` with ``language="en"``.

    Args:
        audio_file: Path to the reference voice recording.
        text: Text to be spoken.

    Returns:
        Path of the generated WAV file, or ``None`` when an input is missing
        or any step fails (the error is printed).
    """
    # Nothing to synthesize without both a voice sample and some text.
    if not audio_file or not text or not text.strip():
        print("Error: a voice sample and non-empty text are required")
        return None

    # Reserve two temp paths; the handles are closed on leaving the ``with``
    # so the files can be reopened by name for writing.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref_file, \
            tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
        ref_path = ref_file.name
        out_path = out_file.name

    try:
        y, sr = process_audio(audio_file)
        # ``librosa.output.write_wav`` no longer exists in current librosa
        # releases, so the reference clip is written with the stdlib instead.
        _write_wav_pcm16(ref_path, y, sr)
        tts.tts_to_file(
            text=text,
            speaker_wav=ref_path,
            language="en",
            file_path=out_path,
        )
    except Exception as e:
        print(f"Error: {e}")
        # Do not leave an empty or partial output file behind.
        _remove_file(out_path)
        return None
    finally:
        # The reference clip is only needed while the model runs.
        _remove_file(ref_path)

    return out_path
|
|
|
|
|
# Gradio UI: voice sample and text inputs on the left, generated audio on the right.
with gr.Blocks(title="Voice Clone TTS") as demo:
    # The heading emoji was mis-encoded in the original string; restored here.
    gr.Markdown("""
    # 🎤 Voice Clone Text-to-Speech
    1. Upload a short English voice sample (5-10 seconds)
    2. Enter text you want to speak
    3. Generate audio in your voice!
    """)

    with gr.Row():
        # Input column: reference recording, text, and the trigger button.
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",  # generate_speech receives a file path
                label="Upload Voice Sample",
                interactive=True
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter English text here...",
                lines=4
            )
            btn = gr.Button("Generate Speech", variant="primary")

        # Output column: player for the synthesized audio.
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                interactive=False
            )
            # Hidden textbox; it is not wired to any event in this block.
            error_output = gr.Textbox(label="Processing Info", visible=False)

    # Clickable example rows: (voice sample path, text to speak).
    gr.Examples(
        examples=[
            ["examples/sample_voice.wav", "Hello! Welcome to the future of voice cloning technology"],
            ["examples/sample_voice2.wav", "This text is spoken in a completely cloned voice"]
        ],
        inputs=[audio_input, text_input],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=True
    )

    # Clicking the button runs generate_speech on the two inputs.
    btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output
    )
|
|
|
# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(
        share=True,
        server_port=7860,
    )