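"""
Gradio demo: voice-cloning text-to-speech with Suno's Bark.

The user uploads a short English reference clip and some text; the app builds
a Bark "history prompt" (voice preset) from the clip and then synthesises the
text in a similar voice.
"""
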
import gradio as gr
from bark import SAMPLE_RATE, generate_audio, preload_models
from bark.generation import load_codec_model
from scipy.io.wavfile import write as write_wav
import tempfile
import torch
import librosa
import numpy as np

# Audio-to-semantic-token models from the community
# bark-voice-cloning-HuBERT-quantizer project (not part of Bark itself).
# The module/class names and checkpoint paths used below follow that repo's
# layout and are assumptions; adjust them to your installation.
from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer

# Save the original torch.load function
original_load = torch.load

# Newer PyTorch releases default torch.load to weights_only=True, which rejects
# Bark's pickled checkpoints; temporarily force the old behaviour while loading.
def custom_load(*args, **kwargs):
    kwargs['weights_only'] = False
    return original_load(*args, **kwargs)

# Monkey-patch torch.load
torch.load = custom_load

# Preload Bark's text, coarse and fine models plus the EnCodec codec
# (downloaded and cached on first run)
preload_models()

# Restore the original torch.load
torch.load = original_load
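
# Bark ships only fixed voice presets; to clone an arbitrary voice we build a
# custom history prompt from the uploaded clip using the common community
# recipe (HuBERT + quantizer for semantic tokens, EnCodec for coarse/fine
# tokens).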

def preprocess_audio_to_npz(audio_path):
    """
    Build a Bark history prompt (.npz voice preset) from a reference recording.

    The prompt bundles three arrays: "semantic_prompt" (semantic tokens),
    "coarse_prompt" (the first two EnCodec codebooks) and "fine_prompt"
    (all eight EnCodec codebooks).

    Parameters:
    audio_path (str): Path to the input audio file.

    Returns:
    str: Path to the generated .npz file.
    """
    # Load and resample the reference clip to Bark's 24 kHz mono format.
    audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    wav = torch.from_numpy(audio.astype(np.float32)).unsqueeze(0)  # shape [1, T]

    # Semantic tokens via the third-party HuBERT feature extractor and
    # quantizer. The checkpoint paths are assumptions; point them at wherever
    # the hubert.pt / tokenizer.pth files were downloaded.
    hubert_model = CustomHubert(checkpoint_path="data/models/hubert/hubert.pt")
    hubert_tokenizer = CustomTokenizer.load_from_checkpoint("data/models/hubert/tokenizer.pth")
    semantic_vectors = hubert_model.forward(wav, input_sample_hz=SAMPLE_RATE)
    semantic_tokens = hubert_tokenizer.get_token(semantic_vectors)

    # Coarse/fine tokens via Bark's EnCodec codec model, on the same device
    # preload_models() used (GPU when available, else CPU).
    codec_device = "cuda" if torch.cuda.is_available() else "cpu"
    codec_model = load_codec_model(use_gpu=(codec_device == "cuda"))
    with torch.no_grad():
        encoded_frames = codec_model.encode(wav.unsqueeze(0).to(codec_device))  # input [1, 1, T]
    codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1).squeeze()  # [8, N]

    # Assemble the history prompt expected by generate_audio.
    history_prompt = {
        "semantic_prompt": semantic_tokens.cpu().numpy(),
        "coarse_prompt": codes[:2, :].cpu().numpy(),
        "fine_prompt": codes.cpu().numpy(),
    }

    # Save to a temporary .npz file that generate_audio can load by path.
    with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
        np.savez(temp_file.name, **history_prompt)
        npz_path = temp_file.name

    return npz_path
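
# The .npz produced above is an ordinary Bark voice preset, so it can also be
# reused outside this app, e.g.:
#   generate_audio("Hello there!", history_prompt="/path/to/prompt.npz")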

def generate_speech(reference_audio, text):
    """
    Generate speech audio mimicking the voice from the reference audio using Bark.
    
    Parameters:
    reference_audio (str): Filepath to the uploaded voice sample.
    text (str): Text to convert to speech.
    
    Returns:
    str: Path to the generated audio file.
    """
    # gr.Error surfaces the message in the UI instead of a server-side traceback.
    if not reference_audio:
        raise gr.Error("Please upload a voice sample.")
    if not text:
        raise gr.Error("Please enter text to convert.")
    
    # Preprocess audio to create .npz history prompt
    history_prompt = preprocess_audio_to_npz(reference_audio)
    
    # Generate speech using the processed history prompt
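    # generate_audio also accepts text_temp and waveform_temp (both 0.7 by
    # default) to trade stability against variety; they are left at the
    # defaults here.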
    audio_array = generate_audio(text, history_prompt=history_prompt)
    
    # Save the audio to a temporary file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        write_wav(temp_file.name, SAMPLE_RATE, audio_array)
        temp_file_path = temp_file.name
    
    return temp_file_path

# Build the Gradio interface
with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
    gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
    gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")
    
    with gr.Row():
        audio_input = gr.Audio(
            type="filepath",
            label="Upload Your Voice Sample (English)",
            interactive=True
        )
        text_input = gr.Textbox(
            label="Enter Text to Convert to Speech",
            placeholder="e.g., I love chocolate"
        )
    
    generate_btn = gr.Button("Generate Speech")
    audio_output = gr.Audio(label="Generated Speech", interactive=False)
    
    # Connect the button to the generation function
    generate_btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output
    )

# Launch the application
app.launch()
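
# By default Gradio serves the app locally at http://127.0.0.1:7860; pass
# share=True to app.launch() to get a temporary public link (useful when
# running on a remote notebook).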