import gradio as gr
from bark import SAMPLE_RATE, generate_audio, preload_models
from bark.generation import generate_text_semantic, generate_coarse, generate_fine
from scipy.io.wavfile import write as write_wav
import tempfile
import librosa
import numpy as np
import torch

# Save the original torch.load function
original_load = torch.load

# Define a custom load function to bypass weights_only=True issue
def custom_load(*args, **kwargs):
    kwargs['weights_only'] = False
    return original_load(*args, **kwargs)

# Monkey-patch torch.load
torch.load = custom_load

# Preload Bark models
preload_models()

# Restore the original torch.load
torch.load = original_load
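
# Optional tweak (not required by this app): Bark can load smaller, lower-memory
# checkpoints when the environment variable SUNO_USE_SMALL_MODELS is set to "True"
# before bark is imported; see the Bark README for details.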

def preprocess_audio_to_npz(audio_path):
    """
    Preprocess an audio file to create a .npz history prompt for voice cloning.
    
    Parameters:
    audio_path (str): Path to the input audio file.
    
    Returns:
    str: Path to the generated .npz file.
    """
    # Load and resample audio to Bark's SAMPLE_RATE (24 kHz) and convert to float32.
    # In this simplified version the waveform is only loaded and validated; it is
    # not converted into prompt tokens (see the note in the docstring above).
    audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    audio = audio.astype(np.float32)
    
    with torch.device("cpu"):
        # Generate placeholder semantic tokens from dummy text
        # (generate_text_semantic has no max_gen_len argument; its
        # max_gen_duration_s parameter can bound the output if needed)
        dummy_text = "Dummy text for history prompt generation."
        semantic_tokens = generate_text_semantic(
            text=dummy_text,
            temp=0.7,
            silent=True
        )
        
        # Ensure semantic_tokens is a numpy array with correct shape
        semantic_tokens = np.array(semantic_tokens, dtype=np.int64)
        if semantic_tokens.ndim == 0:
            semantic_tokens = semantic_tokens.reshape(-1)
        
        # Derive coarse and fine codebook tokens from the semantic tokens so the
        # prompt has the shapes Bark expects (coarse: 2 codebooks x T, fine: 8
        # codebooks x T); reusing the 1-D semantic tokens for these fields would
        # make generate_audio reject the prompt
        coarse_tokens = generate_coarse(semantic_tokens, temp=0.7, silent=True)
        fine_tokens = generate_fine(coarse_tokens, temp=0.5, silent=True)
        
        # Create history prompt dictionary
        history_prompt = {
            "semantic_prompt": semantic_tokens,
            "coarse_prompt": coarse_tokens,
            "fine_prompt": fine_tokens
        }
        
        # Save to temporary .npz file
        with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
            np.savez(temp_file.name, **history_prompt)
            npz_path = temp_file.name
    
    return npz_path
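
# Note: instead of a custom .npz prompt, Bark also ships built-in speaker presets
# that can be passed directly as the history prompt, e.g.:
#   generate_audio("Hello there.", history_prompt="v2/en_speaker_6")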

def generate_speech(reference_audio, text):
    """
    Generate speech audio mimicking the voice from the reference audio using Bark.
    
    Parameters:
    reference_audio (str): Filepath to the uploaded voice sample.
    text (str): Text to convert to speech.
    
    Returns:
    str: Path to the generated audio file.
    """
    if not reference_audio:
        raise ValueError("Please upload a voice sample.")
    if not text:
        raise ValueError("Please enter text to convert.")
    
    # Preprocess audio to create .npz history prompt
    history_prompt = preprocess_audio_to_npz(reference_audio)
    
    # Generate speech using the processed history prompt
    audio_array = generate_audio(text, history_prompt=history_prompt)
    
    # Save the audio to a temporary file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        write_wav(temp_file.name, SAMPLE_RATE, audio_array)
        temp_file_path = temp_file.name
    
    return temp_file_path
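
# Example of calling the pipeline directly, without the Gradio UI
# (the file path below is hypothetical):
#   cloned_path = generate_speech("my_voice_sample.wav", "I love chocolate")
#   print(f"Generated audio written to {cloned_path}")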

# Build the Gradio interface
with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
    gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
    gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")
    
    with gr.Row():
        audio_input = gr.Audio(
            type="filepath",
            label="Upload Your Voice Sample (English)",
            interactive=True
        )
        text_input = gr.Textbox(
            label="Enter Text to Convert to Speech",
            placeholder="e.g., I love chocolate"
        )
    
    generate_btn = gr.Button("Generate Speech")
    audio_output = gr.Audio(label="Generated Speech", interactive=False)
    
    # Connect the button to the generation function
    generate_btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output
    )

# Launch the application
app.launch()
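# To expose the demo over a temporary public URL, Gradio also supports
# app.launch(share=True).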