Commit f81d4f2 · Parent(s): d8cc0b4
Use inference via python directly
app.py CHANGED
@@ -1,10 +1,12 @@
 import sys
-import os,stat
+import io, os, stat
 import subprocess
 import random
 from zipfile import ZipFile
 import uuid
-
+import time
+import torch
+import torchaudio
 # By using XTTS you agree to CPML license https://coqui.ai/cpml
 os.environ["COQUI_TOS_AGREED"] = "1"
 
@@ -13,9 +15,18 @@ os.environ["COQUI_TOS_AGREED"] = "1"
 import langid
 
 import gradio as gr
+from scipy.io.wavfile import write
+from pydub import AudioSegment
+
 from TTS.api import TTS
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+from TTS.utils.generic_utils import get_user_data_dir
+
 HF_TOKEN = os.environ.get("HF_TOKEN")
+
 from huggingface_hub import HfApi
+
 # will use api to restart space on a unrecoverable error
 api = HfApi(token=HF_TOKEN)
 repo_id = "coqui/xtts"
@@ -29,8 +40,19 @@ os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
 
 # Load TTS
 tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
-tts.to("cuda")
 
+model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1")
+config = XttsConfig()
+config.load_json(os.path.join(model_path, "config.json"))
+model = Xtts.init_from_config(config)
+model.load_checkpoint(
+    config,
+    checkpoint_path=os.path.join(model_path, "model.pth"),
+    vocab_path=os.path.join(model_path, "vocab.json"),
+    eval=True,
+    use_deepspeed=True
+)
+model.cuda()
 
 # This is for debugging purposes only
 DEVICE_ASSERT_DETECTED=0
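The hunk above swaps the high-level TTS wrapper for direct model construction. As a standalone illustration, a minimal sketch of the same loading pattern; it assumes the xtts_v1 checkpoint is already in the local TTS cache (e.g. downloaded by the TTS(...) call kept above), and the CPU fallback plus use_deepspeed=False are our substitutions, since the Space itself requires CUDA and DeepSpeed:

import os
import torch
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir

# Checkpoint directory created by the TTS downloader, e.g.
# ~/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v1
model_path = os.path.join(get_user_data_dir("tts"),
                          "tts_models--multilingual--multi-dataset--xtts_v1")

config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))  # model hyperparameters

model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,            # put submodules in inference mode
    use_deepspeed=False,  # the Space passes True; that requires deepspeed installed
)
if torch.cuda.is_available():  # the Space calls model.cuda() unconditionally
    model.cuda()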
@@ -40,14 +62,15 @@ DEVICE_ASSERT_LANG=None
 def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
     if agree == True:
         supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
-
+
         if language not in supported_languages:
-            gr.Warning("Language you put in is not in is not in our Supported Languages, please choose from dropdown")
+            gr.Warning(f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown")
 
             return (
                 None,
                 None,
                 None,
+                None,
             )
 
         language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
@@ -72,6 +95,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
                 None,
                 None,
                 None,
+                None,
             )
 
 
@@ -84,6 +108,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
                 None,
                 None,
                 None,
+                None,
             )
 
         else:
@@ -129,6 +154,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
                 None,
                 None,
                 None,
+                None,
             )
         if len(prompt)>200:
             gr.Warning("Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage")
@@ -136,6 +162,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
                 None,
                 None,
                 None,
+                None,
             )
         global DEVICE_ASSERT_DETECTED
         if DEVICE_ASSERT_DETECTED:
@@ -145,12 +172,33 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
             print(f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}")
 
         try:
-            tts.tts_to_file(
-                text=prompt,
-                file_path="output.wav",
-                speaker_wav=speaker_wav,
-                language=language,
+            metrics_text=""
+            t_latent=time.time()
+
+            # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
+            gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+            latent_calculation_time = time.time() - t_latent
+            #metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
+
+            wav_chunks = []
+
+            print("I: Generating new audio...")
+            t0 = time.time()
+            out = model.inference(
+                prompt,
+                language,
+                gpt_cond_latent,
+                speaker_embedding,
+                diffusion_conditioning
             )
+            inference_time = time.time() - t0
+            print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
+            metrics_text+=f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
+            real_time_factor= (time.time() - t0) / out['wav'].shape[-1] * 24000
+            print(f"Real-time factor (RTF): {real_time_factor}")
+            metrics_text+=f"Real-time factor (RTF): {real_time_factor:.2f}\n"
+            torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
+
         except RuntimeError as e :
             if "device-side assert" in str(e):
                 # cannot do anything on cuda device side error, need tor estart
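The new try block is the heart of the commit: conditioning latents, direct inference, and timing metrics. A minimal sketch of the same flow, assuming model was loaded as above and reference.wav is a hypothetical short speaker clip; XTTS v1 outputs 24 kHz audio, so samples / 24000 is the output duration in seconds and the real-time factor is wall-clock time divided by that duration (below 1 means faster than real time):

import time
import torch
import torchaudio

speaker_wav = "reference.wav"  # hypothetical reference clip

# Latents are derived once per reference voice; with the default HiFi-GAN
# decoder the diffusion conditioning is unused but must still be passed.
gpt_cond_latent, diffusion_conditioning, speaker_embedding = (
    model.get_conditioning_latents(audio_path=speaker_wav)
)

t0 = time.time()
out = model.inference(
    "Hello, world.",  # text prompt
    "en",             # language code
    gpt_cond_latent,
    speaker_embedding,
    diffusion_conditioning,
)
elapsed = time.time() - t0

# e.g. 1.2 s spent producing 72000 samples (3 s of audio) gives RTF 0.4
rtf = elapsed / (out["wav"].shape[-1] / 24000)
print(f"RTF: {rtf:.2f}")

# out["wav"] is a raw sample array; wrap it as a (1, n_samples) tensor to save
torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)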
@@ -173,6 +221,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
                     audio="output.wav",
                 ),
                 "output.wav",
+                metrics_text,
                 speaker_wav,
             )
         else:
@@ -181,6 +230,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
                 None,
                 None,
                 None,
+                None,
             )
 
 
@@ -205,7 +255,7 @@ Arabic: ar, Brazilian Portuguese: pt , Chinese: zh-cn, Czech: cs,<br/>
 Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
 Russian: ru, Spanish: es, Turkish: tr <br/>
 </p>
-<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=
+<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=8946ef36-c454-4a8e-a9c9-8a8dd735fabd" />
 """
 
 article = """
@@ -234,7 +284,6 @@ examples = [
         False,
         False,
         True,
-        False,
     ],
     [
         "Als ich sechs war, sah ich einmal ein wunderbares Bild",
@@ -399,7 +448,8 @@ gr.Interface(
     ],
     outputs=[
         gr.Video(label="Waveform Visual"),
-        gr.Audio(label="Synthesised Audio"),
+        gr.Audio(label="Synthesised Audio",autoplay=True),
+        gr.Text(label="Metrics"),
         gr.Audio(label="Reference Audio Used"),
     ],
     title=title,
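Note on the repeated "+ None," additions throughout the predict function: gr.Interface requires the return tuple to match the outputs list one-to-one, so adding the gr.Text(label="Metrics") output here is why every early-return path gains a fourth None and the success path returns metrics_text in the corresponding position.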