shukdevdatta123 committed on
Commit
e25f277
·
verified ·
1 Parent(s): e47cdda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -27
app.py CHANGED
@@ -1,39 +1,96 @@
1
  import gradio as gr
2
  from bark import SAMPLE_RATE, generate_audio, preload_models
 
3
  from scipy.io.wavfile import write as write_wav
4
  import tempfile
5
  import torch
 
 
6
 
7
  # Save the original torch.load function
8
  original_load = torch.load
9
 
10
# Define a custom load function that forces weights_only=False.
# NOTE(review): weights_only=False lets torch.load run the full pickle
# machinery, which can execute arbitrary code from the checkpoint file.
# Only acceptable while the checkpoints being loaded are trusted.
def custom_load(*args, **kwargs):
    """
    Call the saved original torch.load with weights_only forced to False.

    Parameters:
        *args: Positional arguments forwarded unchanged to the original torch.load.
        **kwargs: Keyword arguments forwarded to the original torch.load;
            any caller-supplied 'weights_only' value is overwritten with False.

    Returns:
        Whatever the original torch.load returns.
    """
    kwargs['weights_only'] = False
    return original_load(*args, **kwargs)


# Monkey-patch torch.load with the custom function, but only while the
# models are being preloaded.
torch.load = custom_load
try:
    # Preload the models with the patched torch.load
    preload_models()
finally:
    # Restore the original torch.load function even if preloading fails, so
    # the relaxed loader never stays installed for the rest of the process.
    torch.load = original_load
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def generate_speech(reference_audio, text):
25
  """
26
- Generate speech audio using a pre-defined speaker.
27
 
28
  Parameters:
29
- reference_audio (str): Path to uploaded audio (ignored in this version).
30
  text (str): Text to convert to speech.
31
 
32
  Returns:
33
  str: Path to the generated audio file.
34
  """
35
- # Use a pre-defined speaker since custom voice cloning isn't supported
36
- history_prompt = "v2/en_speaker_6" # Pre-defined speaker ID
 
 
 
 
 
 
 
37
  audio_array = generate_audio(text, history_prompt=history_prompt)
38
 
39
  # Save the audio to a temporary file
@@ -44,29 +101,25 @@ def generate_speech(reference_audio, text):
44
  return temp_file_path
45
 
46
  # Build the Gradio interface
47
- with gr.Blocks(title="Text-to-Speech with Bark") as app:
48
- gr.Markdown("## Text-to-Speech with Bark")
49
- gr.Markdown(
50
- "Enter text to hear it in a pre-defined voice. "
51
- "Custom voice cloning from uploaded audio is not supported in this version."
52
- )
53
 
54
- # Input components
55
- audio_input = gr.Audio(
56
- type="filepath",
57
- label="Upload Your Voice Sample (English, Ignored)",
58
- visible=True # Kept for future functionality, but ignored
59
- )
60
- text_input = gr.Textbox(
61
- label="Enter Text to Convert to Speech",
62
- placeholder="e.g., I love chocolate"
63
- )
64
 
65
- # Output component
66
  audio_output = gr.Audio(label="Generated Speech", interactive=False)
67
 
68
- # Button to trigger generation
69
- generate_btn = gr.Button("Generate Speech")
70
  generate_btn.click(
71
  fn=generate_speech,
72
  inputs=[audio_input, text_input],
 
1
  import gradio as gr
2
  from bark import SAMPLE_RATE, generate_audio, preload_models
3
+ from bark.generation import load_model, generate_text_semantic, _tokenize
4
  from scipy.io.wavfile import write as write_wav
5
  import tempfile
6
  import torch
7
+ import librosa
8
+ import numpy as np
9
 
10
  # Save the original torch.load function
11
  original_load = torch.load
12
 
13
# Define a custom load function to bypass the weights_only=True issue.
# NOTE(review): forcing weights_only=False re-enables full unpickling in
# torch.load, which can execute arbitrary code embedded in a checkpoint.
# Keep this limited to checkpoints from a trusted source.
def custom_load(*args, **kwargs):
    """
    Forward to the saved original torch.load with weights_only set to False.

    Parameters:
        *args: Positional arguments passed through to the original torch.load.
        **kwargs: Keyword arguments passed through to the original torch.load;
            'weights_only' is always overwritten with False.

    Returns:
        The object returned by the original torch.load.
    """
    kwargs['weights_only'] = False
    return original_load(*args, **kwargs)


# Monkey-patch torch.load only for the duration of model preloading.
torch.load = custom_load
try:
    # Preload Bark models
    preload_models()
finally:
    # Restore the original torch.load on both the success and failure paths;
    # without this, an exception in preload_models() would leave the patched
    # loader installed globally.
    torch.load = original_load
26
 
27
def preprocess_audio_to_npz(audio_path):
    """
    Preprocess an audio file to create a .npz history prompt for voice cloning.

    The audio is loaded as mono at SAMPLE_RATE, run through the tokenizer and
    coarse-model calls below, and the two resulting arrays are saved to a
    temporary .npz archive under the keys "semantic_prompt" and
    "coarse_prompt". The temporary file is created with delete=False, so the
    caller owns it and is responsible for removing it.

    NOTE(review): the load_model / _tokenize / generate_text_semantic calls
    below do not appear to match the signatures in upstream Bark's
    bark/generation.py (which seems to offer only "text", "coarse" and "fine"
    model types, no `device` argument, a `_tokenize(tokenizer, text)` helper
    and a text-first `generate_text_semantic`). Confirm against the installed
    bark version before relying on this function.
    NOTE(review): Bark's bundled speaker prompts appear to also carry a
    "fine_prompt" array, which is not produced here -- TODO confirm whether
    generate_audio requires it.

    Parameters:
        audio_path (str): Path to the input audio file.

    Returns:
        str: Path to the generated .npz file.

    Raises:
        ValueError: If the audio file decodes to zero samples.
    """
    # Load and resample audio to Bark's SAMPLE_RATE, downmixed to mono.
    # The returned sample rate is not needed because it was requested above.
    audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)

    # Fail early with a clear message instead of passing an empty array on.
    if audio.size == 0:
        raise ValueError("Reference audio contains no samples.")

    # Ensure audio is a float32 array
    audio = audio.astype(np.float32)

    # Tokenize and process through HuBERT for semantic tokens
    # NOTE(review): see the docstring -- these model types/arguments need
    # verifying against the installed bark package.
    hubert_manager = load_model(model_type="hubert", device="cpu")
    hubert_tokenizer = load_model(model_type="hubert_tokenizer", device="cpu")

    # Generate semantic tokens
    tokens = _tokenize(audio, hubert_manager, hubert_tokenizer)
    semantic_tokens = tokens[0]  # Extract semantic tokens

    # Load coarse model for coarse tokens
    coarse_model = load_model(model_type="coarse", device="cpu")

    # Generate coarse tokens
    coarse_tokens = generate_text_semantic(
        semantic_tokens=semantic_tokens,
        model=coarse_model,
        max_gen_len=512,
    )

    # Create history prompt dictionary
    history_prompt = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse_tokens,
    }

    # Save to a temporary .npz file. Write through the already-open handle
    # rather than re-opening the file by name: a second open of a
    # NamedTemporaryFile that is still open fails on Windows.
    with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
        np.savez(temp_file, **history_prompt)
        npz_path = temp_file.name

    return npz_path
73
+
74
  def generate_speech(reference_audio, text):
75
  """
76
+ Generate speech audio mimicking the voice from the reference audio using Bark.
77
 
78
  Parameters:
79
+ reference_audio (str): Filepath to the uploaded voice sample.
80
  text (str): Text to convert to speech.
81
 
82
  Returns:
83
  str: Path to the generated audio file.
84
  """
85
+ if not reference_audio:
86
+ raise ValueError("Please upload a voice sample.")
87
+ if not text:
88
+ raise ValueError("Please enter text to convert.")
89
+
90
+ # Preprocess audio to create .npz history prompt
91
+ history_prompt = preprocess_audio_to_npz(reference_audio)
92
+
93
+ # Generate speech using the processed history prompt
94
  audio_array = generate_audio(text, history_prompt=history_prompt)
95
 
96
  # Save the audio to a temporary file
 
101
  return temp_file_path
102
 
103
  # Build the Gradio interface
104
+ with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
105
+ gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
106
+ gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")
 
 
 
107
 
108
+ with gr.Row():
109
+ audio_input = gr.Audio(
110
+ type="filepath",
111
+ label="Upload Your Voice Sample (English)",
112
+ interactive=True
113
+ )
114
+ text_input = gr.Textbox(
115
+ label="Enter Text to Convert to Speech",
116
+ placeholder="e.g., I love chocolate"
117
+ )
118
 
119
+ generate_btn = gr.Button("Generate Speech")
120
  audio_output = gr.Audio(label="Generated Speech", interactive=False)
121
 
122
+ # Connect the button to the generation function
 
123
  generate_btn.click(
124
  fn=generate_speech,
125
  inputs=[audio_input, text_input],