import tempfile

import gradio as gr
import numpy
import torch
from bark import SAMPLE_RATE, generate_audio, preload_models
from numpy.core.multiarray import scalar
from scipy.io.wavfile import write as write_wav

# Add NumPy scalar and dtype to safe globals to fix UnpicklingError
torch.serialization.add_safe_globals([scalar, numpy.dtype])

# Preload the models at startup
preload_models()


def generate_speech(reference_audio, text):
    """
    Generate speech audio for ``text`` with Bark and save it as a WAV file.

    Parameters:
        reference_audio (str): Filepath to the uploaded voice sample; it is
            forwarded unchanged to Bark as ``history_prompt``.
        text (str): Text to convert to speech.

    Returns:
        str: Path to the generated audio file (a ``.wav`` temporary file that
        is not deleted automatically).

    Raises:
        gr.Error: If ``text`` is missing or contains only whitespace.
    """
    # Reject blank input up front so the user gets a clear message in the UI
    # instead of running the model on an empty prompt.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech.")

    # NOTE(review): Bark's README describes history_prompt as a speaker preset
    # name or an .npz prompt file; confirm that a raw audio filepath from the
    # upload widget is actually accepted here.
    audio_array = generate_audio(text, history_prompt=reference_audio)

    # Reserve a unique path and close the handle *before* writing, so the
    # handle is never leaked if write_wav raises and the file is not held open
    # twice. delete=False keeps the file on disk for Gradio to serve.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_file_path = temp_file.name

    # Save the audio to the temporary file
    write_wav(temp_file_path, SAMPLE_RATE, audio_array)

    return temp_file_path


# Build the Gradio interface
with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
    gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
    gr.Markdown("Upload a short voice sample in English, then enter text to hear it in your voice!")

    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Your Voice Sample (English)")
        text_input = gr.Textbox(label="Enter Text to Convert to Speech", placeholder="e.g., I love chocolate")

    generate_btn = gr.Button("Generate Speech")
    audio_output = gr.Audio(label="Generated Speech", interactive=False)

    # Connect the button to the generation function
    generate_btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output,
    )

# Launch the application
app.launch()