import os
import tempfile

import gradio as gr
import numpy
import torch
from bark import SAMPLE_RATE, generate_audio, preload_models
from numpy.core.multiarray import scalar
from scipy.io.wavfile import write as write_wav

# Register the NumPy scalar reconstructor and dtype as safe globals with
# torch's serialization layer; this works around an UnpicklingError.
_numpy_safe_globals = [scalar, numpy.dtype]
torch.serialization.add_safe_globals(_numpy_safe_globals)

# Load the Bark models once, at startup, before the UI is built.
preload_models()

def generate_speech(reference_audio, text):
    """
    Generate speech audio with Bark and save it to a temporary WAV file.

    Parameters:
    reference_audio (str | None): Filepath to the uploaded voice sample. It is
        forwarded unchanged to Bark as ``history_prompt``.
    text (str): Text to convert to speech. Must contain at least one
        non-whitespace character.

    Returns:
    str: Path to the generated audio file. The file is created with
        ``delete=False``, so it outlives this call and is not removed here.

    Raises:
    ValueError: If ``text`` is None, empty, or only whitespace.
    """
    # Fail fast with a clear message instead of invoking the model on nothing.
    if text is None or not text.strip():
        raise ValueError("Please enter some text to convert to speech.")

    # NOTE(review): Bark's documented history_prompt values are built-in
    # speaker preset names or .npz prompt files; a raw audio filepath coming
    # from the upload widget may be rejected -- confirm against the Bark docs.
    audio_array = generate_audio(text, history_prompt=reference_audio)

    # Reserve a unique .wav path, then release our own handle before SciPy
    # opens the same path by name.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_file_path = temp_file.name

    try:
        write_wav(temp_file_path, SAMPLE_RATE, audio_array)
    except Exception:
        # Do not leave an empty or partial file behind when the write fails.
        try:
            os.remove(temp_file_path)
        except OSError:
            pass  # Best-effort cleanup; the original error is what matters.
        raise

    return temp_file_path

# Assemble the web UI: a voice-sample upload next to a text box, a trigger
# button, and a read-only player for the synthesized result.
with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
    gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
    gr.Markdown("Upload a short voice sample in English, then enter text to hear it in your voice!")

    with gr.Row():
        audio_input = gr.Audio(
            label="Upload Your Voice Sample (English)",
            type="filepath",
        )
        text_input = gr.Textbox(
            placeholder="e.g., I love chocolate",
            label="Enter Text to Convert to Speech",
        )

    generate_btn = gr.Button("Generate Speech")
    audio_output = gr.Audio(interactive=False, label="Generated Speech")

    # Wire the button: both inputs are passed to generate_speech and its
    # return value is routed to the output player.
    generate_btn.click(
        generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output,
    )

# Start serving the app.
app.launch()