"""Gradio demo that turns text into speech with Suno Bark.

The app accepts a reference voice sample and a text, builds a Bark history
prompt (.npz) and generates speech conditioned on that prompt.

NOTE(review): Bark has no public API that turns raw audio into a history
prompt. See ``preprocess_audio_to_npz`` for what is (and is not) done with the
uploaded sample; the UI copy below still promises "your voice" and should be
revisited together with that function.
"""

import contextlib
import logging
import os
import tempfile

import gradio as gr
import librosa
import numpy as np
import torch
from bark import SAMPLE_RATE, generate_audio, preload_models
from scipy.io.wavfile import write as write_wav

logger = logging.getLogger(__name__)

# Arrays Bark reads from a history prompt; an .npz missing any of them cannot
# be used as ``history_prompt``.
_PROMPT_KEYS = ("semantic_prompt", "coarse_prompt", "fine_prompt")

# Save the original torch.load function
original_load = torch.load


def custom_load(*args, **kwargs):
    """Call the original ``torch.load`` with ``weights_only`` forced to False.

    Works around the ``weights_only=True`` default that rejects Bark's
    checkpoints.

    SECURITY: ``weights_only=False`` unpickles arbitrary objects. This is only
    acceptable because the patch is active solely while Bark loads its own
    checkpoints; do not reuse it for untrusted files.
    """
    kwargs['weights_only'] = False
    return original_load(*args, **kwargs)


# Monkey-patch torch.load only for the duration of the model preload, and
# restore it even if preloading fails so the patch never leaks into other code.
torch.load = custom_load
try:
    # Preload Bark models
    preload_models()
finally:
    # Restore the original torch.load
    torch.load = original_load


def preprocess_audio_to_npz(audio_path):
    """
    Validate a reference audio file and create a Bark history prompt (.npz).

    The reference file is decoded and resampled to Bark's SAMPLE_RATE purely to
    reject unreadable or empty uploads. The prompt itself is produced by
    running Bark once on a fixed dummy sentence with ``output_full=True`` and
    saving the returned token arrays, because ``history_prompt`` only accepts a
    built-in preset name, a ``.npz`` path, or a dict with the ``_PROMPT_KEYS``
    arrays -- not a raw audio path.

    NOTE(review): the saved prompt is therefore NOT derived from the reference
    voice. Real voice cloning needs semantic tokens extracted from the audio
    (e.g. a HuBERT-based quantizer), which Bark does not ship. Replace the
    ``generate_audio`` call below once such an extractor is available.

    Parameters:
        audio_path (str): Path to the input audio file.

    Returns:
        str: Path to the generated .npz file. The caller owns the file and is
            responsible for deleting it.

    Raises:
        ValueError: If the audio file decodes to zero samples.
    """
    # Load and resample audio to Bark's SAMPLE_RATE; used for validation only
    audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    if audio.size == 0:
        raise ValueError("The voice sample contains no audio.")

    logger.warning(
        "Reference audio %s was validated but is not used for conditioning; "
        "the history prompt comes from Bark's own generation.",
        audio_path,
    )

    # Generate a self-consistent set of prompt tokens from a dummy sentence
    dummy_text = "Dummy text for history prompt generation."
    full_generation, _ = generate_audio(dummy_text, output_full=True)
    history_prompt = {key: full_generation[key] for key in _PROMPT_KEYS}

    # Reserve a unique path, close the handle, then write: writing by name
    # while the temp file is still open is not portable across platforms.
    with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
        npz_path = temp_file.name
    np.savez(npz_path, **history_prompt)

    return npz_path


def generate_speech(reference_audio, text):
    """
    Generate speech for ``text`` using a history prompt built for the upload.

    Parameters:
        reference_audio (str): Filepath to the uploaded voice sample.
        text (str): Text to convert to speech.

    Returns:
        str: Path to the generated audio file (.wav). The file is left on disk
            so Gradio can serve it.

    Raises:
        ValueError: If no voice sample was uploaded, the sample is empty, or
            the text is empty or whitespace-only.
    """
    if not reference_audio:
        raise ValueError("Please upload a voice sample.")
    # Whitespace-only text would otherwise fail deep inside Bark
    if not text or not text.strip():
        raise ValueError("Please enter text to convert.")

    # Preprocess audio to create .npz history prompt
    history_prompt = preprocess_audio_to_npz(reference_audio)
    try:
        # Generate speech using the processed history prompt
        audio_array = generate_audio(text, history_prompt=history_prompt)
    finally:
        # The prompt file is single-use; remove it so requests do not pile up
        # temp files. Best-effort: the file may still be held open elsewhere.
        with contextlib.suppress(OSError):
            os.remove(history_prompt)

    # Save the audio to a temporary file (closed before writing by name)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_file_path = temp_file.name
    write_wav(temp_file_path, SAMPLE_RATE, audio_array)

    return temp_file_path


# Build the Gradio interface
with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
    gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
    gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")

    with gr.Row():
        audio_input = gr.Audio(
            type="filepath",
            label="Upload Your Voice Sample (English)",
            interactive=True
        )
        text_input = gr.Textbox(
            label="Enter Text to Convert to Speech",
            placeholder="e.g., I love chocolate"
        )

    generate_btn = gr.Button("Generate Speech")
    audio_output = gr.Audio(label="Generated Speech", interactive=False)

    # Connect the button to the generation function
    generate_btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output
    )

# Launch the application
app.launch()