Spaces: Runtime error
Commit 2b2b539 · Parent(s): ca0feab
Update app.py

app.py CHANGED
@@ -13,6 +13,7 @@ import torch
 import nltk  # we'll use this to split into sentences
 nltk.download("punkt")

+import subprocess
 import langid
 import uuid

@@ -114,8 +115,8 @@ import numpy as np
 from gradio_client import Client
 from huggingface_hub import InferenceClient

-WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT",
-whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
+WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 45))
+whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/",timeout=WHISPER_TIMEOUT)
 text_client = InferenceClient(
     "mistralai/Mistral-7B-Instruct-v0.1",
     timeout=WHISPER_TIMEOUT,
@@ -133,8 +134,25 @@ def get_latents(speaker_wav):
     ) = model.get_conditioning_latents(audio_path=speaker_wav)
     return gpt_cond_latent, diffusion_conditioning, speaker_embedding

-def get_latents(speaker_wav):
-
+def get_latents(speaker_wav,voice_cleanup=False):
+    if (voice_cleanup):
+        try:
+            cleanup_filter="lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
+            resample_filter="-ac 1 -ar 22050"
+            out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  #ffmpeg to know output format
+            #we will use newer ffmpeg as that has afftn denoise filter
+            shell_command = f"ffmpeg -y -i {speaker_wav} -af {cleanup_filter} {resample_filter} {out_filename}".split(" ")
+
+            command_result = subprocess.run([item for item in shell_command], capture_output=False, text=True, check=True)
+            speaker_wav=out_filename
+            print("Filtered microphone input")
+        except subprocess.CalledProcessError:
+            # There was an error - command exited with non-zero code
+            print("Error: failed filtering, use original microphone input")
+    else:
+        speaker_wav=speaker_wav
+
+    # create as function as we can populate here with voice cleanup/filtering
     (
         gpt_cond_latent,
         diffusion_conditioning,
@@ -161,11 +179,9 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
     return wav_buf.read()

 xtts_supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
-def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
-    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
-
+def detect_language(prompt):
     # Fast language autodetection
-    if len(prompt)>15
+    if len(prompt)>15:
         language_predicted=langid.classify(prompt)[0].strip()  # strip need as there is space at end!
         if language_predicted == "zh":
             #we use zh-cn on xtts
@@ -181,7 +197,12 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
         # Hard to detect language fast in short sentence, use english default
         language = "en"
         print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
+
+    return language

+def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
+    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+
     try:
         t0 = time.time()
         chunks = model.inference_stream(
@@ -197,7 +218,7 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
                 first_chunk_time = time.time() - t0
                 metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
                 first_chunk = False
-            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+            #print(f"Received chunk {i} of audio length {chunk.shape[-1]}")

             # In case output is required to be multiple voice files
             # out_file = f'{char}_{i}.wav'
@@ -368,22 +389,48 @@ def get_sentence(history, system_prompt=""):
     sentence_hash_list = []

     text_to_generate = ""
+    stored_sentence = None
+    stored_sentence_hash = None
     for character in generate(history[-1][0], history[:-1]):
         history[-1][1] = character
         # It is coming word by word

         text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())
-
         if len(text_to_generate) > 1:
             dif = len(text_to_generate) - len(sentence_list)

             if dif == 1 and len(sentence_list) != 0:
                 continue

-
-
-            sentence_hash = hash(sentence)
+            if dif == 2 and len(sentence_list) != 0 and stored_sentence is not None:
+                continue

+            # All this complexity due to trying append first short sentence to next one for proper language auto-detect
+            if stored_sentence is not None and stored_sentence_hash is None and dif>1:
+                #means we consumed stored sentence and should look at next sentence to generate
+                sentence = text_to_generate[len(sentence_list)+1]
+            elif stored_sentence is not None and len(text_to_generate)>2 and stored_sentence_hash is not None:
+                print("Appending stored")
+                sentence = stored_sentence + text_to_generate[len(sentence_list)+1]
+                stored_sentence_hash = None
+            else:
+                sentence = text_to_generate[len(sentence_list)]
+
+            # too short sentence just append to next one if there is any
+            # this is for proper language detection
+            if len(sentence)<=15 and stored_sentence_hash is None and stored_sentence is None:
+                if sentence[-1] in [".","!","?"]:
+                    if stored_sentence_hash != hash(sentence):
+                        stored_sentence = sentence
+                        stored_sentence_hash = hash(sentence)
+                        print("Storing:",stored_sentence)
+                        continue
+
+
+            sentence_hash = hash(sentence)
+            if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
+                continue
+
             if sentence_hash not in sentence_hash_list:
                 sentence_hash_list.append(sentence_hash)
                 sentence_list.append(sentence)
@@ -394,9 +441,14 @@ def get_sentence(history, system_prompt=""):
     last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
     sentence_hash = hash(last_sentence)
     if sentence_hash not in sentence_hash_list:
+        if stored_sentence is not None and stored_sentence_hash is not None:
+            last_sentence = stored_sentence + last_sentence
+            stored_sentence = stored_sentence_hash = None
+            print("Last Sentence with stored:",last_sentence)
+
         sentence_hash_list.append(sentence_hash)
         sentence_list.append(last_sentence)
-        print("
+        print("Last Sentence: ", last_sentence)

     yield (last_sentence, history)

@@ -408,6 +460,7 @@ def generate_speech(history):
     wav_bytestream = b""
     for sentence, history in get_sentence(history):
         print(sentence)
+
         # Sometimes prompt </s> coming on output remove it
         # Some post process for speech only
         sentence = sentence.replace("</s>", "")
@@ -417,9 +470,9 @@ def generate_speech(history):
         sentence = sentence.replace("```", "")
         sentence = sentence.replace("(", " ")
         sentence = sentence.replace(")", " ")
-
+
         # A fast fix for last chacter, may produce weird sounds if it is with text
-        if sentence[-1] in ["!", "?", ".", ","]:
+        if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
             # just add a space
             sentence = sentence[:-1] + " " + sentence[-1]
         print("Sentence for speech:", sentence)
@@ -436,7 +489,12 @@ def generate_speech(history):
             print("SPLITTED LONG SENTENCE:",sentence_list)

             for sentence in sentence_list:
+
                 if any(c.isalnum() for c in sentence):
+                    if language=="autodetect":
+                        #on first call autodetect, nexts sentence calls will use same language
+                        language = detect_language(sentence)
+
                     #exists at least 1 alphanumeric (utf-8)
                     audio_stream = get_voice_streaming(
                         sentence, language, latent_map["Female_Voice"]
@@ -511,7 +569,7 @@ def generate_speech(history):
             print("RuntimeError: non device-side assert error:", str(e))
             raise e

-    time.sleep(1
+    time.sleep(1)
     wav_bytestream = wave_header_chunk() + wav_bytestream
     outfile = "combined.wav"
     with open(outfile, "wb") as f:
@@ -587,4 +645,4 @@ Note:
 - iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
 )
 demo.queue()
-demo.launch(debug=True)
+demo.launch(debug=True)
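For quick manual verification of the voice-cleanup path this commit adds to get_latents, the same ffmpeg filter chain can be exercised on its own. The sketch below is illustrative only: it assumes ffmpeg is available on PATH, and the helper name cleanup_reference_wav and the input file sample.wav are hypothetical, not part of the Space.

import subprocess
import uuid

def cleanup_reference_wav(speaker_wav):
    # Same filter chain as the commit: band-limit the signal, then trim
    # leading and trailing silence (areverse + silenceremove, applied twice).
    cleanup_filter = (
        "lowpass=8000,highpass=75,"
        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
    )
    out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  # extension tells ffmpeg the output format
    command = [
        "ffmpeg", "-y", "-i", speaker_wav,
        "-af", cleanup_filter,
        "-ac", "1", "-ar", "22050",  # mono, 22.05 kHz, matching the commit's resample_filter
        out_filename,
    ]
    try:
        subprocess.run(command, check=True)
        return out_filename
    except subprocess.CalledProcessError:
        # Mirror the commit's fallback: keep the unfiltered recording on failure
        print("Error: failed filtering, use original microphone input")
        return speaker_wav

if __name__ == "__main__":
    print(cleanup_reference_wav("sample.wav"))  # "sample.wav" is a hypothetical input file

Passing the command as an argument list (rather than splitting an f-string on spaces, as the diff does) keeps the invocation intact when file paths contain spaces.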