kevinwang676 commited on
Commit
5104e18
·
1 Parent(s): 8a1732e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +327 -0
app.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from cProfile import label
2
+ from distutils.command.check import check
3
+ from doctest import Example
4
+ import gradio as gr
5
+ import os
6
+ import sys
7
+ import numpy as np
8
+ import logging
9
+ import torch
10
+ from xml.sax import saxutils
11
+ #import nltk
12
+
13
+ from bark import SAMPLE_RATE, generate_audio
14
+ from bark.clonevoice import clone_voice
15
+ from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic
16
+ from scipy.io.wavfile import write as write_wav
17
+ from parseinput import split_and_recombine_text, build_ssml, is_ssml, create_clips_from_ssml
18
+ from datetime import datetime
19
+ from tqdm.auto import tqdm
20
+
21
# Root folder (relative to the current working directory) where generated
# wav clips and cloned-voice .npz files are written.
OUTPUTFOLDER = "Outputs"
22
+
23
+
24
def generate_with_settings(text_prompt, semantic_temp=0.7, semantic_top_k=50, semantic_top_p=0.95, coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95, fine_temp=0.5, voice_name=None, use_semantic_history_prompt=True, use_coarse_history_prompt=True, use_fine_history_prompt=True, output_full=False):
    """Run the staged Bark pipeline (semantic -> coarse -> fine) with explicit
    sampling settings and decode the final tokens to an audio array.

    Each stage can independently be conditioned on `voice_name` via its
    use_*_history_prompt flag.  When `output_full` is True, a dict with the
    three intermediate prompts is returned alongside the audio so the result
    can be reused as a voice history for later generations.
    """
    semantic_history = voice_name if use_semantic_history_prompt else None
    semantic_tokens = generate_text_semantic(
        text_prompt,
        history_prompt=semantic_history,
        temp=semantic_temp,
        top_k=semantic_top_k,
        top_p=semantic_top_p
    )

    coarse_history = voice_name if use_coarse_history_prompt else None
    coarse_tokens = generate_coarse(
        semantic_tokens,
        history_prompt=coarse_history,
        temp=coarse_temp,
        top_k=coarse_top_k,
        top_p=coarse_top_p
    )

    fine_history = voice_name if use_fine_history_prompt else None
    fine_tokens = generate_fine(
        coarse_tokens,
        history_prompt=fine_history,
        temp=fine_temp
    )

    audio = codec_decode(fine_tokens)
    if not output_full:
        return audio
    prompts = {
        'semantic_prompt': semantic_tokens,
        'coarse_prompt': coarse_tokens,
        'fine_prompt': fine_tokens
    }
    return prompts, audio
56
+
57
def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, quick_generation, complete_settings, progress=gr.Progress(track_tqdm=True)):
    """Synthesize `text` to speech and return the path of the final wav file.

    Input is either SSML (each <voice> clip is generated with its own
    speaker) or plain text (split into sentence-sized chunks first).  Clips
    are separated by a quarter second of silence and concatenated into one
    output file.

    Parameters:
        text: plain text or SSML markup to speak.
        selected_speaker: voice prompt name; the literal string 'None'
            disables voice conditioning.
        text_temp: sampling temperature for the semantic stage.
        waveform_temp: sampling temperature for the coarse stage.
        quick_generation: when True use bark's one-shot generate_audio();
            when False run the staged pipeline honoring complete_settings.
        complete_settings: list of checked "Detailed Generation Settings".
        progress: gradio progress tracker (tracks tqdm bars automatically).

    Returns:
        Path of the concatenated output wav file.

    Raises:
        gr.Error: if no text was entered.
    """
    if text is None or len(text) < 1:
        raise gr.Error('No text entered!')

    # The dropdown's 'None' entry means "no voice prompt"
    if selected_speaker == 'None':
        selected_speaker = None

    voice_name = selected_speaker

    # Sampling settings for the staged (non-quick) pipeline
    semantic_temp = text_temp
    semantic_top_k = 50
    semantic_top_p = 0.95

    coarse_temp = waveform_temp
    coarse_top_k = 50
    coarse_top_p = 0.95

    fine_temp = 0.5

    use_semantic_history_prompt = "Use semantic history" in complete_settings
    use_coarse_history_prompt = "Use coarse history" in complete_settings
    use_fine_history_prompt = "Use fine history" in complete_settings
    use_last_generation_as_history = "Use last generation as history" in complete_settings
    progress(0, desc="Generating")

    silence = np.zeros(int(0.25 * SAMPLE_RATE), dtype=np.float32)  # quarter second of silence

    all_parts = []
    text = text.lstrip()
    if is_ssml(text):
        list_speak = create_clips_from_ssml(text)
        for i, clip in tqdm(enumerate(list_speak), total=len(list_speak)):
            selected_speaker = clip[0]
            text = clip[1]
            # SSML content is XML-escaped; restore the literal characters
            text = saxutils.unescape(text)
            if selected_speaker == "None":
                selected_speaker = None

            print(f"\nGenerating Text ({i+1}/{len(list_speak)}) -> {selected_speaker}:`{text}`")
            audio_array = generate_audio(text, selected_speaker, text_temp, waveform_temp)
            # Only keep individual clips on disk when there is more than one
            if len(list_speak) > 1:
                save_wav(audio_array, create_filename(OUTPUTFOLDER, "audioclip", ".wav"))
            all_parts += [audio_array, silence.copy()]
    else:
        texts = split_and_recombine_text(text)
        for i, text in tqdm(enumerate(texts), total=len(texts)):
            print(f"\nGenerating Text ({i+1}/{len(texts)}) -> {selected_speaker}:`{text}`")
            if quick_generation:
                audio_array = generate_audio(text, selected_speaker, text_temp, waveform_temp)
            else:
                full_generation, audio_array = generate_with_settings(
                    text,
                    semantic_temp=semantic_temp,
                    semantic_top_k=semantic_top_k,
                    semantic_top_p=semantic_top_p,
                    coarse_temp=coarse_temp,
                    coarse_top_k=coarse_top_k,
                    coarse_top_p=coarse_top_p,
                    fine_temp=fine_temp,
                    voice_name=voice_name,
                    use_semantic_history_prompt=use_semantic_history_prompt,
                    use_coarse_history_prompt=use_coarse_history_prompt,
                    use_fine_history_prompt=use_fine_history_prompt,
                    output_full=True,
                )

            # Noticed this in the HF Demo - convert to 16bit int -32767/32767 - most used audio format
            # audio_array = (audio_array * 32767).astype(np.int16)

            if len(texts) > 1:
                save_wav(audio_array, create_filename(OUTPUTFOLDER, "audioclip", ".wav"))

            # BUGFIX: the original condition was
            #   quick_generation == False & use_last_generation_as_history
            # `&` binds tighter than `==`, so it reduced to `quick_generation == False`
            # and the voice history was saved/used even with the checkbox unchecked.
            if not quick_generation and use_last_generation_as_history:
                # Persist the intermediate prompts as the voice for the next chunk
                voice_name = create_filename(OUTPUTFOLDER, "audioclip", "")
                save_voice(voice_name,
                           full_generation['semantic_prompt'],
                           full_generation['coarse_prompt'],
                           full_generation['fine_prompt'])
                # loading voice from custom folder needs to have extension
                voice_name = voice_name + ".npz"
            all_parts += [audio_array, silence.copy()]

    # save & play audio
    result = create_filename(OUTPUTFOLDER, "final", ".wav")
    save_wav(np.concatenate(all_parts), result)
    return result
148
+
149
def create_filename(path, name, extension):
    """Build a timestamped output path and create its directory if needed.

    Layout: <cwd>/<path>/<MM-DD-YYYY>/<name>_<HH-MM-SS><extension>

    Parameters:
        path: output root folder, relative to the current working directory.
        name: file stem; a time-of-day suffix is appended.
        extension: file extension including the dot (may be "" for voices).

    Returns:
        The absolute path of the (not yet created) output file.
    """
    # One timestamp for both the date folder and the time suffix keeps them
    # consistent; the original called datetime.now() twice.
    now = datetime.now()
    date_str = now.strftime("%m-%d-%Y")
    time_str = now.strftime("%H-%M-%S")

    sub_folder = os.path.join(os.getcwd(), path, date_str)
    # exist_ok avoids the check-then-create race of the original
    os.makedirs(sub_folder, exist_ok=True)

    file_name = f"{name}_{time_str}{extension}"
    return os.path.join(sub_folder, file_name)
163
+
164
+
165
def save_wav(audio_array, filename):
    """Write `audio_array` to `filename` as a wav at Bark's sample rate."""
    write_wav(filename, SAMPLE_RATE, audio_array)
167
+
168
def save_voice(filename, semantic_prompt, coarse_prompt, fine_prompt):
    """Persist a generated voice history as a compressed .npz archive.

    numpy appends the ".npz" extension to `filename` when it is missing.
    """
    prompts = {
        "semantic_prompt": semantic_prompt,
        "coarse_prompt": coarse_prompt,
        "fine_prompt": fine_prompt,
    }
    np.savez_compressed(filename, **prompts)
175
+
176
+
177
def on_quick_gen_changed(checkbox):
    """Show the detailed generation settings exactly when Quick Generation
    is unchecked (and hide them when it is checked)."""
    show_details = checkbox == False
    return gr.CheckboxGroup.update(visible=show_details)
181
+
182
def delete_output_files(checkbox_state):
    """If the hidden confirmation checkbox is ticked, delete every file in
    the output folder.  Always returns False so the checkbox is reset."""
    if checkbox_state:
        output_dir = os.path.join(os.getcwd(), OUTPUTFOLDER)
        if os.path.exists(output_dir):
            purgedir(output_dir)
    return False
188
+
189
+
190
# https://stackoverflow.com/a/54494779
def purgedir(parent):
    """Delete every file underneath `parent`, recursively, while keeping the
    directory structure itself intact.

    `os.walk` already descends into subdirectories, so one walk suffices;
    the original additionally recursed into each subdirectory, re-walking
    trees whose files had just been removed (redundant, though harmless).
    """
    for root, dirs, files in os.walk(parent):
        for item in files:
            os.unlink(os.path.join(root, item))
200
+
201
def convert_text_to_ssml(text, selected_speaker):
    """Return `text` wrapped in SSML markup for `selected_speaker`."""
    ssml_markup = build_ssml(text, selected_speaker)
    return ssml_markup
203
+
204
+
205
+
206
logger = logging.getLogger(__name__)

# Passing -autolaunch on the command line opens the browser automatically
autolaunch = False

if len(sys.argv) > 1:
    autolaunch = "-autolaunch" in sys.argv


# Fall back to CPU inference when no CUDA device is present
# (idiom fix: was `torch.cuda.is_available() == False`)
if not torch.cuda.is_available():
    os.environ['BARK_FORCE_CPU'] = 'True'
    logger.warning("No CUDA detected, fallback to CPU!")

# Echo the effective bark environment configuration at startup
print(f'smallmodels={os.environ.get("SUNO_USE_SMALL_MODELS", False)}')
print(f'enablemps={os.environ.get("SUNO_ENABLE_MPS", False)}')
print(f'offloadcpu={os.environ.get("SUNO_OFFLOAD_CPU", False)}')
print(f'forcecpu={os.environ.get("BARK_FORCE_CPU", False)}')
print(f'autolaunch={autolaunch}\n\n')

#print("Updating nltk\n")
#nltk.download('punkt')

print("Preloading Models\n")
preload_models()

# Collect all existing speakers/voices (.npz prompt files) in dir;
# entries are "<subdir>/<stem>" relative to the prompts folder.
speakers_list = []

for root, dirs, files in os.walk("./bark/assets/prompts"):
    for file in files:
        if file.endswith(".npz"):
            pathpart = root.replace("./bark/assets/prompts", "")
            if len(pathpart) < 1:
                pathpart = "/"
            speakers_list.append(os.path.join(pathpart, file[:-4]))

speakers_list = sorted(speakers_list, key=lambda x: x.lower())
speakers_list.insert(0, 'None')
243
+
244
# Create Gradio Blocks
# NOTE(review): this uses the pre-4.x gradio API (CheckboxGroup.update,
# Audio(source=...), click(_js=...)) — confirm the pinned gradio version.

with gr.Blocks(title="Bark Enhanced Gradio GUI", mode="Bark Enhanced") as barkgui:
    gr.Markdown("### [Bark Enhanced](https://github.com/C0untFloyd/bark-gui)")
    with gr.Tab("TTS"):
        # Row 1: text input plus text -> SSML conversion helper
        with gr.Row():
            with gr.Column():
                placeholder = "Enter text here."
                input_text = gr.Textbox(label="Input Text", lines=4, placeholder=placeholder)
            with gr.Column():
                convert_to_ssml_button = gr.Button("Convert Text to SSML")
        # Row 2: clickable example prompts (plain text, special tokens, SSML)
        with gr.Row():
            with gr.Column():
                examples = [
                    "Special meanings: [laughter] [laughs] [sighs] [music] [gasps] [clears throat] MAN: WOMAN:",
                    "♪ Never gonna make you cry, never gonna say goodbye, never gonna tell a lie and hurt you ♪",
                    "And now — a picture of a larch [laughter]",
                    """
WOMAN: I would like an oatmilk latte please.
MAN: Wow, that's expensive!
""",
                    """<?xml version="1.0"?>
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.w3.org/2001/10/synthesis
http://www.w3.org/TR/speech-synthesis/synthesis.xsd"
xml:lang="en-US">
<voice name="/v2/en_speaker_9">Look at that drunk guy!</voice>
<voice name="/v2/en_speaker_3">Who is he?</voice>
<voice name="/v2/en_speaker_9">WOMAN: [clears throat] 10 years ago, he proposed me and I rejected him.</voice>
<voice name="/v2/en_speaker_3">Oh my God [laughs] he is still celebrating</voice>
</speak>"""
                ]
                examples = gr.Examples(examples=examples, inputs=input_text)

        # Row 3: voice selection and sampling temperatures
        with gr.Row():
            with gr.Column():
                gr.Markdown("[Voice Prompt Library](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c)")
                speaker = gr.Dropdown(speakers_list, value=speakers_list[0], label="Voice")
            with gr.Column():
                text_temp = gr.Slider(
                    0.1,
                    1.0,
                    value=0.7,
                    label="Generation Temperature",
                    info="1.0 more diverse, 0.1 more conservative"
                )
                waveform_temp = gr.Slider(0.1, 1.0, value=0.7, label="Waveform temperature", info="1.0 more diverse, 0.1 more conservative")

        # Row 4: quick-generation toggle; the detailed settings group is
        # hidden while quick generation is enabled (see on_quick_gen_changed)
        with gr.Row():
            with gr.Column():
                quick_gen_checkbox = gr.Checkbox(label="Quick Generation", value=True)
            with gr.Column():
                settings_checkboxes = ["Use semantic history", "Use coarse history", "Use fine history", "Use last generation as history"]
                complete_settings = gr.CheckboxGroup(choices=settings_checkboxes, value=settings_checkboxes, label="Detailed Generation Settings", type="value", interactive=True, visible=False)
                quick_gen_checkbox.change(fn=on_quick_gen_changed, inputs=quick_gen_checkbox, outputs=complete_settings)

        # Row 5: create button plus output-folder cleanup.  hidden_checkbox
        # carries the result of the JS confirm() dialog below.
        with gr.Row():
            with gr.Column():
                tts_create_button = gr.Button("Create")
            with gr.Column():
                hidden_checkbox = gr.Checkbox(visible=False)
                button_delete_files = gr.Button("Clear output folder")
        with gr.Row():
            output_audio = gr.Audio(label="Generated Audio", type="filepath")

    # Voice-cloning tab: audio sample + transcription -> .npz voice prompt
    with gr.Tab("Clone Voice"):
        input_audio_filename = gr.Audio(label="Input audio.wav", source="upload", type="filepath")
        transcription_text = gr.Textbox(label="Transcription Text", lines=1, placeholder="Enter Text of your Audio Sample here...")
        initialname = "./bark/assets/prompts/custom/MeMyselfAndI"
        #inputAudioFilename = gr.Textbox(label="Filename of Input Audio", lines=1, placeholder="audio.wav")
        output_voice = gr.Textbox(label="Filename of trained Voice", lines=1, placeholder=initialname, value=initialname)
        clone_voice_button = gr.Button("Create Voice")
        dummy = gr.Text(label="Progress")

    # Event wiring
    convert_to_ssml_button.click(convert_text_to_ssml, inputs=[input_text, speaker],outputs=input_text)
    tts_create_button.click(generate_text_to_speech, inputs=[input_text, speaker, text_temp, waveform_temp, quick_gen_checkbox, complete_settings],outputs=output_audio)
    # Javascript hack to display modal confirmation dialog
    js = "(x) => confirm('Are you sure? This will remove all files from output folder')"
    button_delete_files.click(None, None, hidden_checkbox, _js=js)
    hidden_checkbox.change(delete_output_files, [hidden_checkbox], [hidden_checkbox])
    clone_voice_button.click(clone_voice, inputs=[input_audio_filename, transcription_text, output_voice], outputs=dummy)

# queue() is required so gr.Progress / tqdm tracking works
barkgui.queue().launch(show_error=True)