File size: 4,517 Bytes
4760b00
4999708
523a466
4999708
ba92b2d
e25f277
 
0a51a48
47d7c50
6d7fea6
 
f8f4a26
e25f277
6d7fea6
 
 
 
e25f277
6d7fea6
 
e25f277
4999708
4760b00
e25f277
6d7fea6
 
e25f277
 
 
4ee577e
e25f277
 
4ee577e
e25f277
523a466
e25f277
0a51a48
 
4ee577e
99ef324
0a51a48
4ee577e
523a466
99ef324
523a466
 
 
99ef324
523a466
 
 
4ee577e
99ef324
523a466
99ef324
 
4ee577e
99ef324
 
 
 
 
 
 
4ee577e
523a466
 
 
 
 
 
4ee577e
523a466
 
 
 
4ee577e
523a466
e25f277
4999708
 
e25f277
4ee577e
4999708
e25f277
4999708
4ee577e
4999708
e47cdda
4999708
e25f277
 
 
 
4ee577e
523a466
e25f277
4ee577e
e25f277
e47cdda
4ee577e
e47cdda
 
 
 
4ee577e
4999708
3220f5e
4999708
e25f277
 
 
4ee577e
e25f277
 
 
 
 
 
 
 
 
 
4ee577e
e25f277
4999708
4ee577e
e25f277
4999708
f74edeb
 
 
 
3220f5e
4999708
4ee577e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import gradio as gr
from bark import SAMPLE_RATE, generate_audio, preload_models
from bark.generation import generate_text_semantic
from scipy.io.wavfile import write as write_wav
import tempfile
import librosa
import numpy as np
import torch

# Keep a handle on the real torch.load so it can be put back after preloading.
original_load = torch.load


def custom_load(*args, **kwargs):
    """
    Call the original torch.load with weights_only forced to False.

    Parameters:
    *args, **kwargs: Forwarded unchanged to the original torch.load, except
        that any caller-supplied ``weights_only`` value is overridden.

    Returns:
    Whatever the original torch.load returns for the given arguments.
    """
    # SECURITY: weights_only=False allows arbitrary pickled objects to be
    # executed while loading. Only acceptable here because the patch is active
    # solely while the Bark checkpoints are being preloaded.
    kwargs['weights_only'] = False
    return original_load(*args, **kwargs)


# Temporarily monkey-patch torch.load while the Bark models are preloaded.
torch.load = custom_load
try:
    preload_models()
finally:
    # Always restore the real torch.load, even if preloading fails, so the
    # relaxed loader never leaks into the rest of the process.
    torch.load = original_load

def preprocess_audio_to_npz(audio_path):
    """
    Validate a reference audio file and build a Bark .npz history prompt.

    NOTE: the reference audio is only decoded as a sanity check; its content
    is NOT encoded into the prompt. The prompt tokens come from a Bark
    generation of a fixed placeholder sentence, so the resulting voice is not
    derived from the uploaded sample. Deriving a prompt from real audio would
    need an audio-to-semantic-token encoder, which this file does not import.

    Parameters:
    audio_path (str): Path to the input audio file.

    Returns:
    str: Path to the generated .npz file. The file is created with
        delete=False, so the caller is responsible for removing it.

    Raises:
    ValueError: If the audio file decodes to zero samples.
    """
    # Decode (and resample to Bark's SAMPLE_RATE) up front so an unreadable or
    # empty upload is rejected before the slow model run.
    audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    if audio.size == 0:
        raise ValueError("The uploaded voice sample contains no audio.")

    # Run the full Bark pipeline once and keep its token outputs. With
    # output_full=True, generate_audio returns (full_generation, audio), where
    # full_generation holds the semantic/coarse/fine token arrays in the
    # layout Bark expects from a history prompt.
    dummy_text = "Dummy text for history prompt generation."
    full_generation, _ = generate_audio(
        dummy_text,
        text_temp=0.7,
        silent=True,
        output_full=True,
    )

    # Store exactly the three arrays a Bark history prompt consists of.
    history_prompt = {
        key: np.asarray(full_generation[key], dtype=np.int64)
        for key in ("semantic_prompt", "coarse_prompt", "fine_prompt")
    }

    # Write through the open handle rather than re-opening by name, which
    # fails on platforms that lock open temporary files.
    with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
        np.savez(temp_file, **history_prompt)
        npz_path = temp_file.name

    return npz_path

def generate_speech(reference_audio, text):
    """
    Generate speech audio with Bark, using a history prompt built for the
    uploaded reference audio.

    Parameters:
    reference_audio (str): Filepath to the uploaded voice sample.
    text (str): Text to convert to speech.

    Returns:
    str: Path to the generated audio file (a .wav created with delete=False
        so it outlives this call).

    Raises:
    ValueError: If no voice sample was provided, or if the text is missing
        or contains only whitespace.
    """
    import os  # local import: only needed for prompt-file cleanup below

    if not reference_audio:
        raise ValueError("Please upload a voice sample.")
    # Whitespace-only text would otherwise reach the model as an empty prompt.
    if not text or not text.strip():
        raise ValueError("Please enter text to convert.")

    # Preprocess audio to create .npz history prompt
    history_prompt = preprocess_audio_to_npz(reference_audio)

    try:
        # Generate speech using the processed history prompt
        audio_array = generate_audio(text, history_prompt=history_prompt)
    finally:
        # The .npz is a single-use temporary file; remove it whether or not
        # generation succeeded so repeated requests do not fill the temp dir.
        try:
            os.remove(history_prompt)
        except OSError:
            # Best-effort cleanup: a failed delete must not mask the result
            # (or the original error) of the generation itself.
            pass

    # Save the audio to a temporary file, writing through the open handle
    # rather than re-opening by name (which fails where open files are locked).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        write_wav(temp_file, SAMPLE_RATE, audio_array)
        temp_file_path = temp_file.name

    return temp_file_path

# Gradio UI: one row of inputs (voice sample + text), a trigger button, and
# an audio player for the result.
with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
    # Page heading and usage hint.
    gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
    gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")

    # Inputs sit side by side; the audio widget hands the handler a file path.
    with gr.Row():
        audio_input = gr.Audio(label="Upload Your Voice Sample (English)", type="filepath", interactive=True)
        text_input = gr.Textbox(placeholder="e.g., I love chocolate", label="Enter Text to Convert to Speech")

    generate_btn = gr.Button("Generate Speech")
    # Read-only player that receives the file path returned by generate_speech.
    audio_output = gr.Audio(interactive=False, label="Generated Speech")

    # Clicking the button runs generate_speech(audio_input, text_input) and
    # routes its return value to the output player.
    generate_btn.click(fn=generate_speech, inputs=[audio_input, text_input], outputs=audio_output)

# Start the server; share=True additionally requests a public share link.
app.launch(share=True)