File size: 3,235 Bytes
f75668a
4760b00
f74edeb
4760b00
3220f5e
4760b00
 
ba92b2d
 
f75668a
f8f4a26
4760b00
 
f8f4a26
f75668a
4760b00
f75668a
4760b00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba92b2d
4760b00
 
f75668a
ba92b2d
4760b00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3220f5e
4760b00
 
 
 
 
 
 
 
3220f5e
 
4760b00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3220f5e
4760b00
 
 
 
 
 
 
 
 
f75668a
4760b00
3220f5e
4760b00
f74edeb
 
 
 
3220f5e
4760b00
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# !pip install TTS gradio numpy librosa torch soundfile

from TTS.api import TTS
import gradio as gr
import numpy as np
import librosa
import torch
import tempfile
import os
import soundfile as sf  # Added for better audio handling

# Pick the compute device: "cuda" when torch reports an available GPU, else "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the TTS model once at import time and move it to the selected device;
# generate_speech() below reuses this module-level `tts` instance.
model_name = "tts_models/multilingual/multi-dataset/your_tts"
# NOTE(review): an earlier comment marked this line as "the problem" without
# saying why. Presumably `.to(device)` is not supported by every TTS release
# -- confirm against the installed TTS version.
tts = TTS(model_name=model_name).to(device)

def process_audio(audio_path, max_duration=10):
    """Load audio as 16 kHz mono, capped at ``max_duration`` seconds.

    Args:
        audio_path: Path of the audio file to load.
        max_duration: Maximum length of the returned clip, in seconds.

    Returns:
        A tuple ``(y, sr)``: the samples returned by ``librosa.load`` (trimmed
        to at most ``max_duration * sr`` samples) and the sample rate, 16000.
    """
    # Ask librosa to stop reading at max_duration so a long upload is not
    # fully decoded and resampled only to be thrown away afterwards.
    y, sr = librosa.load(audio_path, sr=16000, mono=True, duration=max_duration)

    # Keep the explicit cap so the returned length never exceeds the limit,
    # whatever rounding the loader applied at the clip boundary.
    max_samples = int(max_duration * sr)
    if len(y) > max_samples:
        y = y[:max_samples]
    return y, sr

def _remove_if_exists(path):
    """Delete ``path``, ignoring the error if it is already gone or locked."""
    try:
        os.unlink(path)
    except OSError:
        pass


def generate_speech(audio_file, text):
    """Synthesize ``text`` in the voice of the reference clip ``audio_file``.

    The reference clip is trimmed via ``process_audio`` and written to a
    temporary WAV file, which is passed to the module-level ``tts`` model as
    ``speaker_wav``. The synthesized audio is written to a second temporary
    WAV file.

    Args:
        audio_file: Path to the reference voice sample, or ``None``/empty when
            nothing was provided.
        text: Text to speak.

    Returns:
        Path to the generated WAV file, or ``None`` when the inputs are
        missing or any processing step fails. The caller owns the returned
        file; it is not deleted here.
    """
    # Validate before allocating temp files so bad input leaves nothing behind.
    if not audio_file or not text or not text.strip():
        print("Error: a voice sample and non-empty text are required")
        return None

    # Only the names are needed: both handles are closed on leaving the
    # `with`, and delete=False keeps the files on disk for the steps below.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref_file, \
         tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
        ref_path = ref_file.name
        out_path = out_file.name

    try:
        # Loading and writing the reference clip can fail (unreadable or
        # corrupt upload), so they belong inside the guarded region too.
        y, sr = process_audio(audio_file)
        sf.write(ref_path, y, sr)
        tts.tts_to_file(
            text=text,
            speaker_wav=ref_path,
            language="en",
            file_path=out_path,
        )
    except Exception as e:
        # UI boundary: report the failure and return None instead of raising.
        print(f"Error: {e}")
        # The output file is useless after a failure; do not leak it.
        _remove_if_exists(out_path)
        return None
    finally:
        # The reference copy is never needed once synthesis has finished,
        # whether it succeeded or not.
        _remove_if_exists(ref_path)

    return out_path

# Gradio interface: a two-column layout with the inputs (voice sample, text,
# button) on the left and the generated audio on the right.
with gr.Blocks(title="Voice Clone TTS") as demo:
    # Usage instructions rendered at the top of the page.
    gr.Markdown("""
    # 🎤 Voice Clone Text-to-Speech
    1. Upload a short English voice sample (5-10 seconds)
    2. Enter text you want to speak
    3. Generate audio in your voice!
    """)
    
    with gr.Row():
        with gr.Column():
            # type="filepath" means generate_speech receives a path string,
            # which is what process_audio passes on to librosa.load.
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload Voice Sample",
                interactive=True
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter English text here...",
                lines=4
            )
            btn = gr.Button("Generate Speech", variant="primary")
        
        with gr.Column():
            # Read-only player for the file path returned by generate_speech.
            audio_output = gr.Audio(
                label="Generated Speech",
                interactive=False
            )
            # NOTE(review): this textbox is hidden and is not wired to any
            # event in this file, so errors from generate_speech are only
            # printed to the console -- confirm whether it should be an output.
            error_output = gr.Textbox(label="Processing Info", visible=False)
    
    # Example inputs
    # NOTE(review): the example paths are relative; presumably an `examples/`
    # directory ships next to this script -- verify the files exist.
    gr.Examples(
        examples=[
            ["examples/sample_voice.wav", "Hello! Welcome to the future of voice cloning technology"],
            ["examples/sample_voice2.wav", "This text is spoken in a completely cloned voice"]
        ],
        inputs=[audio_input, text_input],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=False  # Disabled cache to avoid potential issues
    )
    
    # Run synthesis when the button is clicked; the returned path (or None on
    # failure) is sent to the output player.
    btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output
    )

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    # Serves on port 7860. NOTE(review): share=True presumably requests a
    # public Gradio share link -- confirm that exposing the demo is intended.
    demo.launch(server_port=7860, share=True)