Spaces:
Runtime error
Runtime error
Commit
·
5104e18
1
Parent(s):
8a1732e
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from cProfile import label
|
2 |
+
from distutils.command.check import check
|
3 |
+
from doctest import Example
|
4 |
+
import gradio as gr
|
5 |
+
import os
|
6 |
+
import sys
|
7 |
+
import numpy as np
|
8 |
+
import logging
|
9 |
+
import torch
|
10 |
+
from xml.sax import saxutils
|
11 |
+
#import nltk
|
12 |
+
|
13 |
+
from bark import SAMPLE_RATE, generate_audio
|
14 |
+
from bark.clonevoice import clone_voice
|
15 |
+
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic
|
16 |
+
from scipy.io.wavfile import write as write_wav
|
17 |
+
from parseinput import split_and_recombine_text, build_ssml, is_ssml, create_clips_from_ssml
|
18 |
+
from datetime import datetime
|
19 |
+
from tqdm.auto import tqdm
|
20 |
+
|
21 |
+
OUTPUTFOLDER = "Outputs"
|
22 |
+
|
23 |
+
|
24 |
+
def generate_with_settings(text_prompt, semantic_temp=0.7, semantic_top_k=50, semantic_top_p=0.95, coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95, fine_temp=0.5, voice_name=None, use_semantic_history_prompt=True, use_coarse_history_prompt=True, use_fine_history_prompt=True, output_full=False):
    """Run the three-stage Bark pipeline (semantic -> coarse -> fine) with
    per-stage sampling controls and decode the result to audio.

    Each stage can independently use `voice_name` as its history prompt.
    Returns the decoded waveform, or ``(full_generation, waveform)`` when
    `output_full` is True, where ``full_generation`` bundles the raw prompts
    of all three stages so they can be saved and reused as a voice.
    """
    # Stage 1: text -> semantic tokens.
    semantic_tokens = generate_text_semantic(
        text_prompt,
        history_prompt=voice_name if use_semantic_history_prompt else None,
        temp=semantic_temp,
        top_k=semantic_top_k,
        top_p=semantic_top_p,
    )

    # Stage 2: semantic tokens -> coarse codebook tokens.
    coarse_tokens = generate_coarse(
        semantic_tokens,
        history_prompt=voice_name if use_coarse_history_prompt else None,
        temp=coarse_temp,
        top_k=coarse_top_k,
        top_p=coarse_top_p,
    )

    # Stage 3: coarse tokens -> fine codebook tokens.
    fine_tokens = generate_fine(
        coarse_tokens,
        history_prompt=voice_name if use_fine_history_prompt else None,
        temp=fine_temp,
    )

    if output_full:
        # Bundle every intermediate so the caller can persist it as a voice.
        full_generation = {
            'semantic_prompt': semantic_tokens,
            'coarse_prompt': coarse_tokens,
            'fine_prompt': fine_tokens,
        }
        return full_generation, codec_decode(fine_tokens)
    return codec_decode(fine_tokens)
|
56 |
+
|
57 |
+
def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, quick_generation, complete_settings, progress=gr.Progress(track_tqdm=True)):
    """Generate speech for `text` and return the path of the final .wav file.

    SSML input is split into one clip per ``<voice>`` element (each with its
    own speaker); plain text is chunked into sentence-sized pieces.  All
    generated clips are joined with a quarter second of silence in between.
    When quick generation is disabled, the detailed Bark pipeline is used and
    the result can optionally be stored as a reusable voice history.

    Raises:
        gr.Error: if no text was entered.
    """
    if text is None or len(text) < 1:
        raise gr.Error('No text entered!')

    # generation settings
    if selected_speaker == 'None':
        selected_speaker = None

    voice_name = selected_speaker

    semantic_temp = text_temp
    semantic_top_k = 50
    semantic_top_p = 0.95

    coarse_temp = waveform_temp
    coarse_top_k = 50
    coarse_top_p = 0.95

    fine_temp = 0.5

    use_semantic_history_prompt = "Use semantic history" in complete_settings
    use_coarse_history_prompt = "Use coarse history" in complete_settings
    use_fine_history_prompt = "Use fine history" in complete_settings
    use_last_generation_as_history = "Use last generation as history" in complete_settings
    progress(0, desc="Generating")

    silence = np.zeros(int(0.25 * SAMPLE_RATE), dtype=np.float32)  # quarter second of silence

    all_parts = []
    text = text.lstrip()
    if is_ssml(text):
        # SSML: one clip per <voice> element, each possibly with its own speaker.
        list_speak = create_clips_from_ssml(text)
        for i, clip in tqdm(enumerate(list_speak), total=len(list_speak)):
            selected_speaker = clip[0]
            text = saxutils.unescape(clip[1])
            if selected_speaker == "None":
                selected_speaker = None

            print(f"\nGenerating Text ({i+1}/{len(list_speak)}) -> {selected_speaker}:`{text}`")
            audio_array = generate_audio(text, selected_speaker, text_temp, waveform_temp)
            if len(list_speak) > 1:
                # Keep the individual clips as well when there is more than one.
                save_wav(audio_array, create_filename(OUTPUTFOLDER, "audioclip", ".wav"))
            all_parts += [audio_array, silence.copy()]
    else:
        # Plain text: chunk it into pieces small enough for Bark to handle.
        texts = split_and_recombine_text(text)
        for i, text in tqdm(enumerate(texts), total=len(texts)):
            print(f"\nGenerating Text ({i+1}/{len(texts)}) -> {selected_speaker}:`{text}`")
            if quick_generation:
                audio_array = generate_audio(text, selected_speaker, text_temp, waveform_temp)
            else:
                full_generation, audio_array = generate_with_settings(
                    text,
                    semantic_temp=semantic_temp,
                    semantic_top_k=semantic_top_k,
                    semantic_top_p=semantic_top_p,
                    coarse_temp=coarse_temp,
                    coarse_top_k=coarse_top_k,
                    coarse_top_p=coarse_top_p,
                    fine_temp=fine_temp,
                    voice_name=voice_name,
                    use_semantic_history_prompt=use_semantic_history_prompt,
                    use_coarse_history_prompt=use_coarse_history_prompt,
                    use_fine_history_prompt=use_fine_history_prompt,
                    output_full=True,
                )

            # Noticed this in the HF Demo - convert to 16bit int -32767/32767 - most used audio format
            # audio_array = (audio_array * 32767).astype(np.int16)

            if len(texts) > 1:
                save_wav(audio_array, create_filename(OUTPUTFOLDER, "audioclip", ".wav"))

            # BUGFIX: the original condition was
            # `quick_generation == False & use_last_generation_as_history`;
            # `&` binds tighter than `==`, so it reduced to
            # `quick_generation == False` and the history checkbox was ignored.
            if not quick_generation and use_last_generation_as_history:
                # save to npz so the next chunk continues with this voice
                voice_name = create_filename(OUTPUTFOLDER, "audioclip", "")
                save_voice(voice_name,
                           full_generation['semantic_prompt'],
                           full_generation['coarse_prompt'],
                           full_generation['fine_prompt'])
                # loading voice from custom folder needs to have extension
                voice_name = voice_name + ".npz"
            all_parts += [audio_array, silence.copy()]

    # save & play audio
    result = create_filename(OUTPUTFOLDER, "final", ".wav")
    save_wav(np.concatenate(all_parts), result)
    return result
|
148 |
+
|
149 |
+
def create_filename(path, name, extension):
    """Return a unique, timestamped file path inside `path`.

    Layout: ``<cwd>/<path>/<MM-DD-YYYY>/<name>_<HH-MM-SS><extension>``.
    The output folder and its per-day sub-folder are created on demand.
    """
    base_folder = os.path.join(os.getcwd(), path)
    day_folder = os.path.join(base_folder, datetime.now().strftime("%m-%d-%Y"))
    # makedirs creates the base folder and the day folder in one call.
    os.makedirs(day_folder, exist_ok=True)

    stamp = datetime.now().strftime("%H-%M-%S")
    return os.path.join(day_folder, f"{name}_{stamp}{extension}")
|
163 |
+
|
164 |
+
|
165 |
+
def save_wav(audio_array, filename):
    """Write `audio_array` to `filename` as a wav file at Bark's SAMPLE_RATE."""
    write_wav(filename, SAMPLE_RATE, audio_array)
|
167 |
+
|
168 |
+
def save_voice(filename, semantic_prompt, coarse_prompt, fine_prompt):
    """Persist the three Bark stage prompts as a compressed .npz voice file.

    numpy appends the ``.npz`` suffix to `filename` if it is missing.
    """
    prompts = {
        'semantic_prompt': semantic_prompt,
        'coarse_prompt': coarse_prompt,
        'fine_prompt': fine_prompt,
    }
    np.savez_compressed(filename, **prompts)
|
175 |
+
|
176 |
+
|
177 |
+
def on_quick_gen_changed(checkbox):
    """Show the detailed generation settings exactly when quick generation is off."""
    # The detailed settings group is only relevant when quick generation is disabled.
    show_details = checkbox == False
    return gr.CheckboxGroup.update(visible=show_details)
|
181 |
+
|
182 |
+
def delete_output_files(checkbox_state):
    """Wipe every file in the output folder when the hidden checkbox confirmed it.

    Always returns False so the hidden confirmation checkbox is reset.
    """
    if not checkbox_state:
        return False
    target = os.path.join(os.getcwd(), OUTPUTFOLDER)
    if os.path.exists(target):
        purgedir(target)
    return False
|
188 |
+
|
189 |
+
|
190 |
+
# https://stackoverflow.com/a/54494779
|
191 |
+
def purgedir(parent):
    """Recursively delete every file below `parent`, keeping the directory tree.

    Based on https://stackoverflow.com/a/54494779
    """
    for dirpath, subdirs, filenames in os.walk(parent):
        # Delete the files that live directly in this directory.
        for entry in filenames:
            os.unlink(os.path.join(dirpath, entry))
        # Descend into each sub-directory and repeat.
        for entry in subdirs:
            purgedir(os.path.join(dirpath, entry))
|
200 |
+
|
201 |
+
def convert_text_to_ssml(text, selected_speaker):
    """Wrap plain input text in SSML markup for the currently selected speaker."""
    ssml = build_ssml(text, selected_speaker)
    return ssml
|
203 |
+
|
204 |
+
|
205 |
+
|
206 |
+
# --- Start-up configuration --------------------------------------------------
logger = logging.getLogger(__name__)

# Passing "-autolaunch" on the command line opens the browser automatically.
autolaunch = len(sys.argv) > 1 and "-autolaunch" in sys.argv

# Without CUDA, force Bark onto the CPU.
if not torch.cuda.is_available():
    os.environ['BARK_FORCE_CPU'] = 'True'
    logger.warning("No CUDA detected, fallback to CPU!")

# Echo the effective environment switches at start-up.
for flag_label, env_key in (
    ("smallmodels", "SUNO_USE_SMALL_MODELS"),
    ("enablemps", "SUNO_ENABLE_MPS"),
    ("offloadcpu", "SUNO_OFFLOAD_CPU"),
    ("forcecpu", "BARK_FORCE_CPU"),
):
    print(f'{flag_label}={os.environ.get(env_key, False)}')
print(f'autolaunch={autolaunch}\n\n')
|
223 |
+
|
224 |
+
#print("Updating nltk\n")
#nltk.download('punkt')

# Load (and download if necessary) all Bark model weights up front so the
# first generation request does not stall on model loading.
print("Preloading Models\n")
preload_models()
|
229 |
+
|
230 |
+
# Collect all existing speakers/voices in dir
|
231 |
+
# Collect every saved voice prompt (*.npz) under the Bark prompt assets folder.
speakers_list = []

for prompt_root, _subdirs, prompt_files in os.walk("./bark/assets/prompts"):
    for prompt_file in prompt_files:
        if not prompt_file.endswith(".npz"):
            continue
        # Keep the sub-folder as a prefix so voices stay grouped by folder.
        pathpart = prompt_root.replace("./bark/assets/prompts", "") or "/"
        speakers_list.append(os.path.join(pathpart, prompt_file[:-4]))

# Case-insensitive alphabetical order, with the pseudo-voice 'None' first.
speakers_list.sort(key=str.lower)
speakers_list.insert(0, 'None')
|
243 |
+
|
244 |
+
# Create Gradio Blocks
|
245 |
+
|
246 |
+
# Build the Gradio UI: a TTS tab and a voice-cloning tab.
# NOTE(review): the original indentation was lost in transit; the nesting of
# the layout context managers below is reconstructed and should be confirmed
# against the upstream repository. Any consistent nesting preserves behavior,
# since Row/Column managers only affect layout.
with gr.Blocks(title="Bark Enhanced Gradio GUI", mode="Bark Enhanced") as barkgui:
    gr.Markdown("### [Bark Enhanced](https://github.com/C0untFloyd/bark-gui)")
    with gr.Tab("TTS"):
        with gr.Row():
            with gr.Column():
                placeholder = "Enter text here."
                input_text = gr.Textbox(label="Input Text", lines=4, placeholder=placeholder)
            with gr.Column():
                convert_to_ssml_button = gr.Button("Convert Text to SSML")
        with gr.Row():
            with gr.Column():
                # Example prompts: Bark's special tokens, music notation, and SSML.
                examples = [
                    "Special meanings: [laughter] [laughs] [sighs] [music] [gasps] [clears throat] MAN: WOMAN:",
                    "♪ Never gonna make you cry, never gonna say goodbye, never gonna tell a lie and hurt you ♪",
                    "And now — a picture of a larch [laughter]",
                    """
WOMAN: I would like an oatmilk latte please.
MAN: Wow, that's expensive!
""",
                    """<?xml version="1.0"?>
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.w3.org/2001/10/synthesis
http://www.w3.org/TR/speech-synthesis/synthesis.xsd"
xml:lang="en-US">
<voice name="/v2/en_speaker_9">Look at that drunk guy!</voice>
<voice name="/v2/en_speaker_3">Who is he?</voice>
<voice name="/v2/en_speaker_9">WOMAN: [clears throat] 10 years ago, he proposed me and I rejected him.</voice>
<voice name="/v2/en_speaker_3">Oh my God [laughs] he is still celebrating</voice>
</speak>"""
                ]
                examples = gr.Examples(examples=examples, inputs=input_text)

        with gr.Row():
            with gr.Column():
                gr.Markdown("[Voice Prompt Library](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c)")
                speaker = gr.Dropdown(speakers_list, value=speakers_list[0], label="Voice")
            with gr.Column():
                text_temp = gr.Slider(
                    0.1,
                    1.0,
                    value=0.7,
                    label="Generation Temperature",
                    info="1.0 more diverse, 0.1 more conservative"
                )
                waveform_temp = gr.Slider(0.1, 1.0, value=0.7, label="Waveform temperature", info="1.0 more diverse, 0.1 more conservative")

        with gr.Row():
            with gr.Column():
                quick_gen_checkbox = gr.Checkbox(label="Quick Generation", value=True)
            with gr.Column():
                settings_checkboxes = ["Use semantic history", "Use coarse history", "Use fine history", "Use last generation as history"]
                complete_settings = gr.CheckboxGroup(choices=settings_checkboxes, value=settings_checkboxes, label="Detailed Generation Settings", type="value", interactive=True, visible=False)
        # Reveal the detailed settings only while quick generation is off.
        quick_gen_checkbox.change(fn=on_quick_gen_changed, inputs=quick_gen_checkbox, outputs=complete_settings)

        with gr.Row():
            with gr.Column():
                tts_create_button = gr.Button("Create")
            with gr.Column():
                hidden_checkbox = gr.Checkbox(visible=False)
                button_delete_files = gr.Button("Clear output folder")
        with gr.Row():
            output_audio = gr.Audio(label="Generated Audio", type="filepath")

    with gr.Tab("Clone Voice"):
        input_audio_filename = gr.Audio(label="Input audio.wav", source="upload", type="filepath")
        transcription_text = gr.Textbox(label="Transcription Text", lines=1, placeholder="Enter Text of your Audio Sample here...")
        initialname = "./bark/assets/prompts/custom/MeMyselfAndI"
        #inputAudioFilename = gr.Textbox(label="Filename of Input Audio", lines=1, placeholder="audio.wav")
        output_voice = gr.Textbox(label="Filename of trained Voice", lines=1, placeholder=initialname, value=initialname)
        clone_voice_button = gr.Button("Create Voice")
        dummy = gr.Text(label="Progress")

    # Wire up the event handlers.
    convert_to_ssml_button.click(convert_text_to_ssml, inputs=[input_text, speaker],outputs=input_text)
    tts_create_button.click(generate_text_to_speech, inputs=[input_text, speaker, text_temp, waveform_temp, quick_gen_checkbox, complete_settings],outputs=output_audio)
    # Javascript hack to display modal confirmation dialog
    js = "(x) => confirm('Are you sure? This will remove all files from output folder')"
    button_delete_files.click(None, None, hidden_checkbox, _js=js)
    # The hidden checkbox flips to True only after the JS confirm dialog;
    # delete_output_files resets it to False after clearing the folder.
    hidden_checkbox.change(delete_output_files, [hidden_checkbox], [hidden_checkbox])
    clone_voice_button.click(clone_voice, inputs=[input_audio_filename, transcription_text, output_voice], outputs=dummy)

barkgui.queue().launch(show_error=True)
|