kevinwang676 committed
Commit 5203f30 · 1 Parent(s): 4430785

Update app.py

Files changed (1)
  1. app.py +405 -0
app.py CHANGED
@@ -161,6 +161,411 @@ def voice_conversion(ta, ra, da):
 
     return (ap.sample_rate, ref_wav_voc)
 
+
+ import os
+ import sys
+
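+ # Clone the bark-gui repo at startup and put it on sys.path so the bark,
+ # parseinput, settings and id3tagging imports below resolve inside this Space.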
+ os.system("git clone https://github.com/C0untFloyd/bark-gui.git")
+ sys.path.append("./bark-gui/")
+
+ import gradio as gr
+ import numpy as np
+ import logging
+ import torch
+ import pytorch_seed
+ import time
+
+ from xml.sax import saxutils
+ from bark.api import generate_with_settings
+ from bark.api import save_as_prompt
+ from settings import Settings
+ #import nltk
+
+ from bark import SAMPLE_RATE
+ from bark.clonevoice import clone_voice
+ from bark.generation import preload_models
+ from scipy.io.wavfile import write as write_wav
+ from parseinput import split_and_recombine_text, build_ssml, is_ssml, create_clips_from_ssml
+ from datetime import datetime
+ from tqdm.auto import tqdm
+ from id3tagging import add_id3_tag
+
+ OUTPUTFOLDER = "Outputs"
+
+
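+ # Main TTS entry point: splits plain text into sentence-sized chunks (or SSML into
+ # per-voice clips), runs Bark on each chunk, inserts the configured pauses, and
+ # concatenates everything into one wav. Example call (illustrative values only):
+ #   generate_text_to_speech("Hello world", "en_speaker_0", 0.6, 0.7, 0.05, True, [], -1)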
+ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, eos_prob, quick_generation, complete_settings, seed, progress=gr.Progress(track_tqdm=True)):
+     if text is None or len(text) < 1:
+         raise gr.Error('No text entered!')
+
+     # Chunk the text into smaller pieces then combine the generated audio
+
+     # generation settings
+     if selected_speaker == 'None':
+         selected_speaker = None
+     if seed is not None and seed > 2**32 - 1:
+         logger.warning(f"Seed {seed} > 2**32 - 1 (max), setting to random")
+         seed = None
+     if seed is None or seed <= 0:
+         seed = np.random.default_rng().integers(1, 2**32 - 1)
+     assert 0 < seed < 2**32
+
+     voice_name = selected_speaker
+     use_last_generation_as_history = "Use last generation as history" in complete_settings
+     save_last_generation = "Save generation as Voice" in complete_settings
+     progress(0, desc="Generating")
+
+     silenceshort = np.zeros(int((float(settings.silence_sentence) / 1000.0) * SAMPLE_RATE), dtype=np.float32)  # silence between sentences, length from settings (ms)
+     silencelong = np.zeros(int((float(settings.silence_speakers) / 1000.0) * SAMPLE_RATE), dtype=np.float32)  # silence between speakers, length from settings (ms)
+     full_generation = None
+
+     all_parts = []
+     complete_text = ""
+     text = text.lstrip()
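+     # SSML input is split into per-voice clips; plain text is split into
+     # sentence-sized chunks using the length limits from the settings file.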
+     if is_ssml(text):
+         list_speak = create_clips_from_ssml(text)
+         prev_speaker = None
+         for i, clip in tqdm(enumerate(list_speak), total=len(list_speak)):
+             selected_speaker = clip[0]
+             # Add pause break between speakers
+             if i > 0 and selected_speaker != prev_speaker:
+                 all_parts += [silencelong.copy()]
+             prev_speaker = selected_speaker
+             text = clip[1]
+             text = saxutils.unescape(text)
+             if selected_speaker == "None":
+                 selected_speaker = None
+
+             print(f"\nGenerating Text ({i+1}/{len(list_speak)}) -> {selected_speaker} (Seed {seed}):`{text}`")
+             complete_text += text
+             with pytorch_seed.SavedRNG(seed):
+                 audio_array = generate_with_settings(text_prompt=text, voice_name=selected_speaker, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
+                 seed = torch.random.initial_seed()
+             if len(list_speak) > 1:
+                 filename = create_filename(OUTPUTFOLDER, seed, "audioclip", ".wav")
+                 save_wav(audio_array, filename)
+                 add_id3_tag(filename, text, selected_speaker, seed)
+
+             all_parts += [audio_array]
+     else:
+         texts = split_and_recombine_text(text, settings.input_text_desired_length, settings.input_text_max_length)
+         for i, text in tqdm(enumerate(texts), total=len(texts)):
+             print(f"\nGenerating Text ({i+1}/{len(texts)}) -> {selected_speaker} (Seed {seed}):`{text}`")
+             complete_text += text
+             if quick_generation:
+                 with pytorch_seed.SavedRNG(seed):
+                     audio_array = generate_with_settings(text_prompt=text, voice_name=selected_speaker, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
+                     seed = torch.random.initial_seed()
+             else:
+                 full_output = use_last_generation_as_history or save_last_generation
+                 if full_output:
+                     full_generation, audio_array = generate_with_settings(text_prompt=text, voice_name=voice_name, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob, output_full=True)
+                 else:
+                     audio_array = generate_with_settings(text_prompt=text, voice_name=voice_name, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
+
+             # Noticed this in the HF Demo - convert to 16bit int -32767/32767 - most used audio format
+             # audio_array = (audio_array * 32767).astype(np.int16)
+
+             if len(texts) > 1:
+                 filename = create_filename(OUTPUTFOLDER, seed, "audioclip", ".wav")
+                 save_wav(audio_array, filename)
+                 add_id3_tag(filename, text, selected_speaker, seed)
+
+             if not quick_generation and (save_last_generation or use_last_generation_as_history):
+                 # save to npz
+                 voice_name = create_filename(OUTPUTFOLDER, seed, "audioclip", ".npz")
+                 save_as_prompt(voice_name, full_generation)
+                 if use_last_generation_as_history:
+                     selected_speaker = voice_name
+
+             all_parts += [audio_array]
+             # Add short pause between sentences
+             if text[-1] in "!?.\n" and i > 1:
+                 all_parts += [silenceshort.copy()]
+
+     # save & play audio
+     result = create_filename(OUTPUTFOLDER, seed, "final", ".wav")
+     save_wav(np.concatenate(all_parts), result)
+     # write id3 tag with text truncated to 60 chars, as a precaution...
+     add_id3_tag(result, complete_text, selected_speaker, seed)
+     return result
+
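+ # Filenames look like Outputs/<MM-DD-YYYY>/<name>_<HH-MM-SS>_s<seed><extension>;
+ # the date-stamped subfolder is created on demand.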
+ def create_filename(path, seed, name, extension):
+     now = datetime.now()
+     date_str = now.strftime("%m-%d-%Y")
+     outputs_folder = os.path.join(os.getcwd(), path)
+     if not os.path.exists(outputs_folder):
+         os.makedirs(outputs_folder)
+
+     sub_folder = os.path.join(outputs_folder, date_str)
+     if not os.path.exists(sub_folder):
+         os.makedirs(sub_folder)
+
+     time_str = now.strftime("%H-%M-%S")
+     file_name = f"{name}_{time_str}_s{seed}{extension}"
+     return os.path.join(sub_folder, file_name)
+
+
+ def save_wav(audio_array, filename):
+     write_wav(filename, SAMPLE_RATE, audio_array)
+
+ def save_voice(filename, semantic_prompt, coarse_prompt, fine_prompt):
+     np.savez_compressed(
+         filename,
+         semantic_prompt=semantic_prompt,
+         coarse_prompt=coarse_prompt,
+         fine_prompt=fine_prompt
+     )
+
+
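+ # The detailed generation settings are only shown while Quick Generation is unchecked.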
+ def on_quick_gen_changed(checkbox):
+     if not checkbox:
+         return gr.CheckboxGroup.update(visible=True)
+     return gr.CheckboxGroup.update(visible=False)
+
+ def delete_output_files(checkbox_state):
+     if checkbox_state:
+         outputs_folder = os.path.join(os.getcwd(), OUTPUTFOLDER)
+         if os.path.exists(outputs_folder):
+             purgedir(outputs_folder)
+     return False
+
+
+ # https://stackoverflow.com/a/54494779
+ def purgedir(parent):
+     for root, dirs, files in os.walk(parent):
+         for item in files:
+             # Delete subordinate files
+             filespec = os.path.join(root, item)
+             os.unlink(filespec)
+         for item in dirs:
+             # Recursively perform this operation for subordinate directories
+             purgedir(os.path.join(root, item))
+
+ def convert_text_to_ssml(text, selected_speaker):
+     return build_ssml(text, selected_speaker)
+
+
+ def apply_settings(themes, input_server_name, input_server_port, input_server_public, input_desired_len, input_max_len, input_silence_break, input_silence_speaker):
+     settings.selected_theme = themes
+     settings.server_name = input_server_name
+     settings.server_port = input_server_port
+     settings.server_share = input_server_public
+     settings.input_text_desired_length = input_desired_len
+     settings.input_text_max_length = input_max_len
+     settings.silence_sentence = input_silence_break
+     settings.silence_speakers = input_silence_speaker
+     settings.save()
+
+ def restart():
+     global restart_server
+     restart_server = True
+
+
+ def create_version_html():
+     python_version = ".".join([str(x) for x in sys.version_info[0:3]])
+     versions_html = f"""
+ python: <span title="{sys.version}">{python_version}</span>
+  • 
+ torch: {getattr(torch, '__long_version__', torch.__version__)}
+  • 
+ gradio: {gr.__version__}
+ """
+     return versions_html
+
+
+
+ logger = logging.getLogger(__name__)
+ APPTITLE = "Bark UI Enhanced v0.4.6"
+
+
+ autolaunch = False
+
+ if len(sys.argv) > 1:
+     autolaunch = "-autolaunch" in sys.argv
+
+
+ if not torch.cuda.is_available():
+     os.environ['BARK_FORCE_CPU'] = 'True'
+     logger.warning("No CUDA detected, falling back to CPU!")
+
+ print(f'smallmodels={os.environ.get("SUNO_USE_SMALL_MODELS", False)}')
+ print(f'enablemps={os.environ.get("SUNO_ENABLE_MPS", False)}')
+ print(f'offloadcpu={os.environ.get("SUNO_OFFLOAD_CPU", False)}')
+ print(f'forcecpu={os.environ.get("BARK_FORCE_CPU", False)}')
+ print(f'autolaunch={autolaunch}\n\n')
+
+ #print("Updating nltk\n")
+ #nltk.download('punkt')
+
+ print("Preloading Models\n")
+ preload_models()
+
+ settings = Settings('config.yaml')
+
+ # Collect all existing speakers/voices in dir
+ speakers_list = []
+
+ for root, dirs, files in os.walk("./bark/assets/prompts"):
+     for file in files:
+         if file.endswith(".npz"):
+             pathpart = root.replace("./bark/assets/prompts", "")
+             name = os.path.join(pathpart, file[:-4])
+             if name.startswith("/") or name.startswith("\\"):
+                 name = name[1:]
+             speakers_list.append(name)
+
+ speakers_list = sorted(speakers_list, key=lambda x: x.lower())
+ speakers_list.insert(0, 'None')
+
+ available_themes = ["Default", "gradio/glass", "gradio/monochrome", "gradio/seafoam", "gradio/soft", "gstaff/xkcd", "freddyaboulton/dracula_revamped", "ysharma/steampunk"]
+
+ seed = -1
+ server_name = settings.server_name
+ if len(server_name) < 1:
+     server_name = None
+ server_port = settings.server_port
+ if server_port <= 0:
+     server_port = None
+ global run_server
+ global restart_server
+
+ run_server = True
+
+
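+ # Outer server loop: the Blocks UI is rebuilt on every pass, so a restart
+ # (the Restart Server button sets restart_server) picks up changed settings
+ # such as the theme.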
+ while run_server:
+     print(f'Launching {APPTITLE} Server')
+
+     # Create Gradio Blocks
+
+     with gr.Blocks(title=f"{APPTITLE}", mode=f"{APPTITLE}", theme=settings.selected_theme) as barkgui:
+         with gr.Row():
+             with gr.Column():
+                 gr.Markdown(f"### [{APPTITLE}](https://github.com/C0untFloyd/bark-gui)")
+             with gr.Column():
+                 gr.HTML(create_version_html(), elem_id="versions")
+
+         with gr.Tab("TTS"):
+             with gr.Row():
+                 with gr.Column():
+                     placeholder = "Enter text here."
+                     input_text = gr.Textbox(label="Input Text", lines=4, placeholder=placeholder)
+                 with gr.Column():
+                     seedcomponent = gr.Number(label="Seed (default -1 = Random)", precision=0, value=-1)
+                     convert_to_ssml_button = gr.Button("Convert Text to SSML")
+             with gr.Row():
+                 with gr.Column():
+                     examples = [
+                         "Special meanings: [laughter] [laughs] [sighs] [music] [gasps] [clears throat] MAN: WOMAN:",
+                         "♪ Never gonna make you cry, never gonna say goodbye, never gonna tell a lie and hurt you ♪",
+                         "And now — a picture of a larch [laughter]",
+                         """
+                         WOMAN: I would like an oatmilk latte please.
+                         MAN: Wow, that's expensive!
+                         """,
+                         """<?xml version="1.0"?>
+                         <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
+                         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                         xsi:schemaLocation="http://www.w3.org/2001/10/synthesis
+                         http://www.w3.org/TR/speech-synthesis/synthesis.xsd"
+                         xml:lang="en-US">
+                         <voice name="en_speaker_9">Look at that drunk guy!</voice>
+                         <voice name="en_speaker_3">Who is he?</voice>
+                         <voice name="en_speaker_9">WOMAN: [clears throat] 10 years ago, he proposed to me and I rejected him.</voice>
+                         <voice name="en_speaker_3">Oh my God [laughs] he is still celebrating</voice>
+                         </speak>"""
+                     ]
+                     examples = gr.Examples(examples=examples, inputs=input_text)
+
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown("[Voice Prompt Library](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c)")
+                     speaker = gr.Dropdown(speakers_list, value=speakers_list[0], label="Voice")
+                 with gr.Column():
+                     text_temp = gr.Slider(0.1, 1.0, value=0.6, label="Generation Temperature", info="1.0 more diverse, 0.1 more conservative")
+                     waveform_temp = gr.Slider(0.1, 1.0, value=0.7, label="Waveform temperature", info="1.0 more diverse, 0.1 more conservative")
+
+             with gr.Row():
+                 with gr.Column():
+                     quick_gen_checkbox = gr.Checkbox(label="Quick Generation", value=True)
+                     settings_checkboxes = ["Use last generation as history", "Save generation as Voice"]
+                     complete_settings = gr.CheckboxGroup(choices=settings_checkboxes, value=settings_checkboxes, label="Detailed Generation Settings", type="value", interactive=True, visible=False)
+                 with gr.Column():
+                     eos_prob = gr.Slider(0.0, 0.5, value=0.05, label="End of sentence probability")
+
+             with gr.Row():
+                 with gr.Column():
+                     tts_create_button = gr.Button("Generate")
+                 with gr.Column():
+                     hidden_checkbox = gr.Checkbox(visible=False)
+                     button_stop_generation = gr.Button("Stop generation")
+             with gr.Row():
+                 output_audio = gr.Audio(label="Generated Audio", type="filepath")
+
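+             # Voice conversion row, reusing the voice_conversion() helper from the
+             # unchanged part of app.py above this hunk.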
+             with gr.Row():
+                 inp1 = gr.Audio(label='Target Speaker - Reference Clip')
+                 inp2 = gr.Audio(label='Input Speaker - Reference Clip')
+                 inp3 = gr.Audio(label='Input Speaker - Clip To Convert')
+                 btn = gr.Button("Generate")
+                 out1 = gr.Audio(label='Target Speaker - Converted Clip')
+                 btn.click(voice_conversion, [inp1, inp2, inp3], [out1])
+
+
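+         # Clone Voice tab: an audio sample plus its transcription is passed to
+         # bark.clonevoice.clone_voice to create a custom .npz speaker prompt.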
+         with gr.Tab("Clone Voice"):
+             input_audio_filename = gr.Audio(label="Input audio.wav", source="upload", type="filepath")
+             transcription_text = gr.Textbox(label="Transcription Text", lines=1, placeholder="Enter Text of your Audio Sample here...")
+             initialname = "./bark/assets/prompts/custom/MeMyselfAndI"
+             output_voice = gr.Textbox(label="Filename of trained Voice", lines=1, placeholder=initialname, value=initialname)
+             clone_voice_button = gr.Button("Create Voice")
+             dummy = gr.Text(label="Progress")
+
+         with gr.Tab("Settings"):
+             with gr.Row():
+                 themes = gr.Dropdown(available_themes, label="Theme", info="Change needs complete restart", value=settings.selected_theme)
+             with gr.Row():
+                 input_server_name = gr.Textbox(label="Server Name", lines=1, info="Leave blank to run locally", value=settings.server_name)
+                 input_server_port = gr.Number(label="Server Port", precision=0, info="Leave at 0 to use default", value=settings.server_port)
+                 share_checkbox = gr.Checkbox(label="Public Server", value=settings.server_share)
+             with gr.Row():
+                 input_desired_len = gr.Slider(100, 150, value=settings.input_text_desired_length, label="Desired Input Text Length", info="Ideal length to split input sentences")
+                 input_max_len = gr.Slider(150, 256, value=settings.input_text_max_length, label="Max Input Text Length", info="Maximum Input Text Length")
+             with gr.Row():
+                 input_silence_break = gr.Slider(1, 1000, value=settings.silence_sentence, label="Sentence Pause Time (ms)", info="Silence between sentences in milliseconds")
+                 input_silence_speakers = gr.Slider(1, 5000, value=settings.silence_speakers, label="Speaker Pause Time (ms)", info="Silence between different speakers in milliseconds")
+
+             with gr.Row():
+                 button_apply_settings = gr.Button("Apply Settings")
+                 button_apply_restart = gr.Button("Restart Server")
+                 button_delete_files = gr.Button("Clear output folder")
+
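+         # Event wiring; gen_click is kept so "Stop generation" can cancel a running job.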
+         quick_gen_checkbox.change(fn=on_quick_gen_changed, inputs=quick_gen_checkbox, outputs=complete_settings)
+         convert_to_ssml_button.click(convert_text_to_ssml, inputs=[input_text, speaker], outputs=input_text)
+         gen_click = tts_create_button.click(generate_text_to_speech, inputs=[input_text, speaker, text_temp, waveform_temp, eos_prob, quick_gen_checkbox, complete_settings, seedcomponent], outputs=output_audio)
+         button_stop_generation.click(fn=None, inputs=None, outputs=None, cancels=[gen_click])
+         # Javascript hack to display modal confirmation dialog
+         js = "(x) => confirm('Are you sure? This will remove all files from output folder')"
+         button_delete_files.click(None, None, hidden_checkbox, _js=js)
+         hidden_checkbox.change(delete_output_files, [hidden_checkbox], [hidden_checkbox])
+         clone_voice_button.click(clone_voice, inputs=[input_audio_filename, transcription_text, output_voice], outputs=dummy)
+         button_apply_settings.click(apply_settings, inputs=[themes, input_server_name, input_server_port, share_checkbox, input_desired_len, input_max_len, input_silence_break, input_silence_speakers])
+         button_apply_restart.click(restart)
+     restart_server = False
+     try:
+         barkgui.queue().launch(show_error=True)
+     except:
+         restart_server = True
+         run_server = False
+     try:
+         while not restart_server:
+             time.sleep(1.0)
+     except (KeyboardInterrupt, OSError):
+         print("Keyboard interruption in main thread... closing server.")
+         run_server = False
+     barkgui.close()
+
+
 c3 = gr.Interface(
     fn=voice_conversion,
     inputs=[gr.Audio(label='Target Speaker - Reference Clip'), gr.Audio(label='Input Speaker - Reference Clip'), gr.Audio(label='Input Speaker - Clip To Convert')],