import gradio as gr
from bark import SAMPLE_RATE, generate_audio, preload_models
from bark.generation import generate_text_semantic, generate_coarse, generate_fine
from scipy.io.wavfile import write as write_wav
import tempfile
import librosa
import numpy as np
import torch

# Save the original torch.load function
original_load = torch.load

# Define a custom load function to bypass weights_only=True issue
def custom_load(*args, **kwargs):
    kwargs['weights_only'] = False
    return original_load(*args, **kwargs)

# Monkey-patch torch.load
torch.load = custom_load

# Preload Bark models
preload_models()

# Restore the original torch.load
torch.load = original_load
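
# Optional tweak (not required by this app): Bark can load smaller, lower-memory
# checkpoints when the environment variable SUNO_USE_SMALL_MODELS is set to "True"
# before bark is imported; see the Bark README for details.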

def preprocess_audio_to_npz(audio_path):
    """
    Preprocess an audio file to create a .npz history prompt for voice cloning.
    
    Parameters:
    audio_path (str): Path to the input audio file.
    
    Returns:
    str: Path to the generated .npz file.
    """
    # Load and resample audio to Bark's SAMPLE_RATE (24 kHz) and convert to float32.
    # In this simplified version the waveform is only loaded and validated; it is
    # not converted into prompt tokens (see the note in the docstring above).
    audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    audio = audio.astype(np.float32)
    
    with torch.device("cpu"):
        # Generate placeholder semantic tokens from dummy text
        # (generate_text_semantic has no max_gen_len argument; its
        # max_gen_duration_s parameter can bound the output if needed)
        dummy_text = "Dummy text for history prompt generation."
        semantic_tokens = generate_text_semantic(
            text=dummy_text,
            temp=0.7,
            silent=True
        )
        
        # Ensure semantic_tokens is a numpy array with correct shape
        semantic_tokens = np.array(semantic_tokens, dtype=np.int64)
        if semantic_tokens.ndim == 0:
            semantic_tokens = semantic_tokens.reshape(-1)
        
        # Derive coarse and fine codebook tokens from the semantic tokens so the
        # prompt has the shapes Bark expects (coarse: 2 codebooks x T, fine: 8
        # codebooks x T); reusing the 1-D semantic tokens for these fields would
        # make generate_audio reject the prompt
        coarse_tokens = generate_coarse(semantic_tokens, temp=0.7, silent=True)
        fine_tokens = generate_fine(coarse_tokens, temp=0.5, silent=True)
        
        # Create history prompt dictionary
        history_prompt = {
            "semantic_prompt": semantic_tokens,
            "coarse_prompt": coarse_tokens,
            "fine_prompt": fine_tokens
        }
        
        # Save to temporary .npz file
        with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
            np.savez(temp_file.name, **history_prompt)
            npz_path = temp_file.name
    
    return npz_path
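
# Note: instead of a custom .npz prompt, Bark also ships built-in speaker presets
# that can be passed directly as the history prompt, e.g.:
#   generate_audio("Hello there.", history_prompt="v2/en_speaker_6")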

def generate_speech(reference_audio, text):
    """
    Generate speech audio mimicking the voice from the reference audio using Bark.
    
    Parameters:
    reference_audio (str): Filepath to the uploaded voice sample.
    text (str): Text to convert to speech.
    
    Returns:
    str: Path to the generated audio file.
    """
    if not reference_audio:
        raise ValueError("Please upload a voice sample.")
    if not text:
        raise ValueError("Please enter text to convert.")
    
    # Preprocess audio to create .npz history prompt
    history_prompt = preprocess_audio_to_npz(reference_audio)
    
    # Generate speech using the processed history prompt
    audio_array = generate_audio(text, history_prompt=history_prompt)
    
    # Save the audio to a temporary file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        write_wav(temp_file.name, SAMPLE_RATE, audio_array)
        temp_file_path = temp_file.name
    
    return temp_file_path
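
# Example of calling the pipeline directly, without the Gradio UI
# (the file path below is hypothetical):
#   cloned_path = generate_speech("my_voice_sample.wav", "I love chocolate")
#   print(f"Generated audio written to {cloned_path}")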

# Build the Gradio interface
with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
    gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
    gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")
    
    with gr.Row():
        audio_input = gr.Audio(
            type="filepath",
            label="Upload Your Voice Sample (English)",
            interactive=True
        )
        text_input = gr.Textbox(
            label="Enter Text to Convert to Speech",
            placeholder="e.g., I love chocolate"
        )
    
    generate_btn = gr.Button("Generate Speech")
    audio_output = gr.Audio(label="Generated Speech", interactive=False)
    
    # Connect the button to the generation function
    generate_btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output
    )

# Launch the application
app.launch()
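# To expose the demo over a temporary public URL, Gradio also supports
# app.launch(share=True).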