# Spaces: shukdevdatta123 / VocalForge-AI
# Status: Running
# File size: 3,235 Bytes

# !pip install TTS gradio numpy librosa torch soundfile

from TTS.api import TTS
import gradio as gr
import numpy as np
import librosa
import torch
import tempfile
import os
import soundfile as sf  # Added for better audio handling

# Pick the compute device: use CUDA when PyTorch reports it, else the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Load the TTS model once at import time and move it to the chosen device.
# NOTE(review): an earlier comment flagged the `.to(device)` call below as
# "the problem" without saying why; presumably it fails on TTS releases whose
# `TTS` class has no `.to()` method -- confirm against the installed version.
model_name = "tts_models/multilingual/multi-dataset/your_tts"
tts = TTS(model_name=model_name).to(device)

def process_audio(audio_path, max_duration=10):
    """Load an audio file and cap its length.

    The file is loaded through ``librosa.load`` with ``sr=16000`` and
    ``mono=True``. If the result holds more than ``max_duration * sr``
    samples, only the leading samples up to that limit are kept.

    Args:
        audio_path: Path of the audio file to read.
        max_duration: Maximum length to keep, in seconds.

    Returns:
        A ``(samples, sample_rate)`` tuple.
    """
    samples, rate = librosa.load(audio_path, mono=True, sr=16000)
    sample_cap = max_duration * rate

    # Only slice when the clip is actually too long.
    if len(samples) > sample_cap:
        return samples[:int(sample_cap)], rate
    return samples, rate

def generate_speech(audio_file, text):
    """Synthesize ``text`` using the voice from a reference recording.

    The reference recording is trimmed via ``process_audio`` and written to a
    temporary WAV file, which is passed to ``tts.tts_to_file`` as
    ``speaker_wav`` (with ``language="en"``). The synthesized audio is written
    to a second temporary WAV file.

    Args:
        audio_file: Path to the reference voice sample. ``None`` or an empty
            string is treated as "no sample provided".
        text: Text to speak. ``None``, empty, or whitespace-only text is
            treated as "nothing to say".

    Returns:
        Path of the generated WAV file, or ``None`` when an input is missing
        or synthesis fails. The caller owns the returned file.

    Raises:
        Exception: Errors raised while loading or writing the reference audio
            propagate to the caller; temporary files are still removed.
    """
    # Bail out before touching the filesystem when there is nothing to do
    # (e.g. the button was clicked before a sample was uploaded).
    if not audio_file or not text or not text.strip():
        print("Error: a reference audio file and non-empty text are required")
        return None

    # Reserve two file names. The handles are closed on leaving the `with`
    # block; delete=False keeps the (empty) files so they can be reopened
    # by path below.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref_file, \
         tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
        ref_path = ref_file.name
        out_path = out_file.name

    keep_output = False
    try:
        # Process reference audio
        y, sr = process_audio(audio_file)
        sf.write(ref_path, y, sr)

        # Generate speech
        try:
            tts.tts_to_file(
                text=text,
                speaker_wav=ref_path,
                language="en",
                file_path=out_path,
            )
        except Exception as e:
            print(f"Error: {e}")
            return None

        keep_output = True
        return out_path
    finally:
        # The reference copy is never needed after this call; the output file
        # is only worth keeping when synthesis succeeded.
        stale_paths = [ref_path] if keep_output else [ref_path, out_path]
        for stale in stale_paths:
            try:
                os.unlink(stale)
            except OSError:
                # Best-effort cleanup: a file that is already gone or cannot
                # be removed must not mask the real result or error.
                pass

# Gradio UI: instructions on top, then a row with an input column (voice
# sample, text, button) and an output column (generated audio), followed by
# clickable examples. Components are created in display order.
with gr.Blocks(title="Voice Clone TTS") as demo:
    gr.Markdown("""
    # 🎤 Voice Clone Text-to-Speech
    1. Upload a short English voice sample (5-10 seconds)
    2. Enter text you want to speak
    3. Generate audio in your voice!
    """)

    with gr.Row():
        # Input column
        with gr.Column():
            audio_input = gr.Audio(
                label="Upload Voice Sample",
                type="filepath",  # hand the callback a path, not raw samples
                sources=["upload", "microphone"],
                interactive=True,
            )
            text_input = gr.Textbox(
                lines=4,
                label="Text to Speak",
                placeholder="Enter English text here...",
            )
            btn = gr.Button("Generate Speech", variant="primary")

        # Output column
        with gr.Column():
            audio_output = gr.Audio(interactive=False, label="Generated Speech")
            # Hidden textbox; nothing in this block writes to it.
            error_output = gr.Textbox(visible=False, label="Processing Info")

    # Example rows: (reference sample path, text to speak).
    example_rows = [
        ["examples/sample_voice.wav", "Hello! Welcome to the future of voice cloning technology"],
        ["examples/sample_voice2.wav", "This text is spoken in a completely cloned voice"],
    ]
    gr.Examples(
        examples=example_rows,
        inputs=[audio_input, text_input],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=False,  # Disabled cache to avoid potential issues
    )

    # Wire the button to the synthesis callback.
    btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output,
    )

if __name__ == "__main__":
    # Start the app on port 7860 with share=True when run as a script.
    launch_options = {"server_port": 7860, "share": True}
    demo.launch(**launch_options)