Testing first API version
- app.py +226 -56
- requirements.txt +2 -1
app.py
CHANGED
@@ -1,24 +1,51 @@
 from __future__ import annotations
+from IPython.display import clear_output
+from IPython import get_ipython
 import os
+#os.system('pip install -r requirements.txt')
+#os.system('pip install gradio==3.48.0')
+#os.system('pip install python-dotenv')
+# In[1]:
 #Use GPU
-
+gpu_info = get_ipython().getoutput('nvidia-smi')
+gpu_info = '\n'.join(gpu_info)
+if gpu_info.find('failed') >= 0:
+    print('Not connected to a GPU')
+    is_gpu=False
+else:
+    print(gpu_info)
+    is_gpu=True
 
+#is_gpu=False
+# In[2]:
+# In[3]:
+import os
+import dotenv
+# Load the environment variables from the .env file
+dotenv.load_dotenv()
+# Access the value of the SECRET_TOKEN variable
+secret_token = os.getenv("SECRET_TOKEN")
+# In[7]:
+import os
 #download for mecab
-os.system(
+os.system("python -m unidic download")
+# In[5]:
 # By using XTTS you agree to CPML license https://coqui.ai/cpml
 os.environ["COQUI_TOS_AGREED"] = "1"
 # NOTE: for streaming will require gradio audio streaming fix
 # pip install --upgrade -y gradio==0.50.2 git+https://github.com/gorkemgoknar/gradio.git@patch-1
 #Now you’re ready to install 🤗 Transformers with the following command:
+#For CPU-support only, Transformers and PyTorch with:
+os.system('pip install transformers[torch]')
 if not is_gpu:
-    #For CPU-support only, Transformers and PyTorch with:
-    os.system('pip install transformers[torch]')
     #pip install 'transformers[tf-cpu]' #Transformers and TensorFlow 2.0:
-    os.system('pip install llama-cpp-python==0.2.11')
+    os.system('pip install llama-cpp-python==0.2.11')
 else:
     # we need to compile a CUBLAS version
     # Or get it from https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
-    os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
+    os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
+
+# In[8]:
 import textwrap
 from scipy.io.wavfile import write
 from pydub import AudioSegment
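The new startup block above probes for a GPU by shelling out to nvidia-smi through `get_ipython()`, which only works when app.py runs under an IPython kernel (hence the new `ipython` requirement below). As a rough sketch of the same check for a plain `python app.py` process, the standard library could be used instead; `detect_gpu` here is a hypothetical helper, not part of this commit:

```python
import shutil
import subprocess

def detect_gpu() -> bool:
    """Illustrative: report whether nvidia-smi exists and runs successfully."""
    if shutil.which("nvidia-smi") is None:
        return False
    try:
        result = subprocess.run(["nvidia-smi"], capture_output=True, text=True, timeout=10)
        return result.returncode == 0 and "failed" not in result.stdout.lower()
    except (subprocess.SubprocessError, OSError):
        return False

is_gpu = detect_gpu()
print("GPU detected" if is_gpu else "Not connected to a GPU")
```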
@@ -61,22 +88,24 @@ import numpy as np
 from gradio_client import Client
 from huggingface_hub import InferenceClient
 
+
+
+# In[9]:
+
+
 # This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V2")
-
 from TTS.utils.manage import ModelManager
 model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
 ModelManager().download_model(model_name)
 model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
 print("XTTS downloaded")
 
-
 if is_gpu:
     use_deepspeed=True
 else:
     use_deepspeed=False
 
-
 print("Loading XTTS")
 config = XttsConfig()
 config.load_json(os.path.join(model_path, "config.json"))
@@ -89,10 +118,16 @@ model.load_checkpoint(
     eval=True,
     use_deepspeed=use_deepspeed,
 )
-
-
+
+#if is_gpu:
+#    model.cuda()
+
 print("Done loading TTS")
 
+
+# In[60]:
+
+
 #####llm_model = os.environ.get("LLM_MODEL", "mistral") # or "zephyr"
 
 title = "Voice chat with Zephyr/Mistral and Coqui XTTS"
@@ -141,19 +176,40 @@ pirate_system_message = f"You as {character_name}. {character_scenario} Print ou
 ROLE_PROMPTS["Pirate"]= pirate_system_message
 ##"You are an AI assistant with Zephyr model by Mistral and Hugging Face and speech from Coqui XTTS . User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps, your answers should be clear and short sentences"
 
+# In[49]:
 
-
-### WILL USE LOCAL MISTRAL OR ZEPHYR
 
+
+
+
+# In[15]:
+
+
+### WILL USE LOCAL MISTRAL OR ZEPHYR
+import os
 from huggingface_hub import hf_hub_download
+
 print("Downloading LLM")
 
+# Get the current directory
+current_dir = os.getcwd()
+# Append the current directory to the zephyr_model_path
+zephyr_model_path = os.path.join(current_dir, "zephyr-7b-beta.Q5_K_M.gguf")
+if not os.path.isfile(zephyr_model_path):
+    print("Downloading Zephyr")
+    hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=current_dir, filename="zephyr-7b-beta.Q5_K_M.gguf")
+else:
+    print("Zephyr it is already downloaded")
+
+
+# In[ ]:
+
+
+
+
+
+# In[16]:
 
-print("Downloading Zephyr")
-#Zephyr
-hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=".", filename="zephyr-7b-beta.Q5_K_M.gguf")
-# use new gguf format
-zephyr_model_path="./zephyr-7b-beta.Q5_K_M.gguf"
 
 from llama_cpp import Llama
 # set GPU_LAYERS to 15 if you have a 8GB GPU so both models can fit in
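The Zephyr download is now guarded by an `os.path.isfile` check so that Space restarts skip re-fetching the multi-gigabyte GGUF file. A minimal sketch of the same pattern as a reusable helper; `ensure_gguf` is hypothetical and only assumes `huggingface_hub` is installed:

```python
import os
from huggingface_hub import hf_hub_download

def ensure_gguf(repo_id: str, filename: str, target_dir: str = ".") -> str:
    """Illustrative: download a GGUF file only when it is not already on disk."""
    path = os.path.join(target_dir, filename)
    if not os.path.isfile(path):
        print(f"Downloading {filename} from {repo_id}")
        hf_hub_download(repo_id=repo_id, filename=filename, local_dir=target_dir)
    else:
        print(f"{filename} already present, skipping download")
    return path

# Usage mirroring the committed code:
# zephyr_model_path = ensure_gguf("TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_M.gguf", os.getcwd())
```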
@@ -166,7 +222,7 @@ else:
 LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
 LLAMA_VERBOSE=False
 
-
+
 llm_zephyr = Llama(model_path=zephyr_model_path,
                    n_gpu_layers=GPU_LAYERS,
                    max_new_tokens=512,
@@ -175,6 +231,12 @@ llm_zephyr = Llama(model_path=zephyr_model_path,
                    n_batch=128,
                    verbose=LLAMA_VERBOSE)
 
+print("Running LLM Zephyr")
+
+
+# In[17]:
+
+
 def split_sentences(text, max_len):
     # Apply custom rules to enforce sentence breaks with double punctuation
     text = re.sub(r"(\s*\.{2})\s*", r".\1 ", text) # for '..'
@@ -241,7 +303,11 @@ def pcm_to_wav(pcm_data, sample_rate=24000, channels=1, bit_depth=16):
 
     return wav_header + fmt_subchunk + data_subchunk + pcm_data
 
-def generate_local(
+
+# In[23]:
+
+
+def generate_local_llm(
     prompt,
     history,
     system_message=None,
@@ -302,6 +368,82 @@ def generate_local(
 
     return output
 
+
+# In[28]:
+
+
+get_ipython().system('pip install OpenAI')
+
+
+# In[103]:
+
+
+def generate_stream(prompt, model="mixtral-8x7b"):
+    base_url = "https://ruslanmv-hf-llm-api.hf.space"
+    api_key = "sk-xxxxx"
+    client = OpenAI(base_url=base_url, api_key=api_key)
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": "{}".format(prompt),
+            }
+        ],
+        stream=True,
+    )
+    return response
+def generate_local(
+    prompt,
+    history,
+    system_message=None,
+    temperature=0.8,
+    max_tokens=256,
+    top_p=0.95,
+    stop=None,
+):
+
+    formatted_prompt = format_prompt_zephyr(prompt, history, system_message=system_message)
+    try:
+        print("LLM Input:", formatted_prompt)
+        output = ""
+        stream=generate_stream(formatted_prompt)
+        for response in stream:
+            character=response.choices[0].delta.content
+            if "<|user|>" in character:
+                # end of context
+                return
+            if emoji.is_emoji(character):
+                # Bad emoji not a meaning messes chat from next lines
+                return
+            if character is not None:
+                print(character, end="", flush=True)
+                output += character
+            elif response.choices[0].finish_reason == "stop":
+                print()
+            else:
+                pass
+            yield output
+
+    except Exception as e:
+        if "Too Many Requests" in str(e):
+            print("ERROR: Too many requests on mistral client")
+            #gr.Warning("Unfortunately Mistral is unable to process")
+            output = "Unfortunately I am not able to process your request now !"
+        else:
+            print("Unhandled Exception: ", str(e))
+            #gr.Warning("Unfortunately Mistral is unable to process")
+            output = "I do not know what happened but I could not understand you ."
+
+    return output
+
+
+# In[ ]:
+
+
+# In[17]:
+
+
 def get_latents(speaker_wav,voice_cleanup=False):
     if (voice_cleanup):
         try:
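The added `generate_stream` targets an OpenAI-compatible endpoint and `generate_local` consumes it as a generator, yielding partial output per streamed delta. A hedged usage sketch, assuming the `pip install OpenAI` step has run, `from openai import OpenAI` (1.x client) is available, and the endpoint above accepts any API key; the prompt text is illustrative:

```python
from openai import OpenAI  # assumes the 1.x OpenAI Python client is installed

client = OpenAI(base_url="https://ruslanmv-hf-llm-api.hf.space", api_key="sk-xxxxx")

# Stream a chat completion and print tokens as they arrive.
stream = client.chat.completions.create(
    model="mixtral-8x7b",
    messages=[{"role": "user", "content": "Tell me a one-sentence pirate greeting."}],
    stream=True,
)
for event in stream:
    token = event.choices[0].delta.content
    if token:  # delta.content can be None on the final streamed chunk
        print(token, end="", flush=True)
print()
```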
@@ -367,7 +509,11 @@ def detect_language(prompt):
     print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
 
     return language
-
+
+
+# In[18]:
+
+
 def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
     gpt_cond_latent, speaker_embedding = latent_tuple
 
@@ -376,9 +522,9 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
         chunks = model.inference_stream(
             prompt,
             language,
-            gpt_cond_latent,
-            speaker_embedding,
-            #repetition_penalty=5.0,
+            gpt_cond_latent.to(device),   # Ensure gpt_cond_latent is on the same device
+            speaker_embedding.to(device), # Ensure speaker_embedding is on the same device
+            # repetition_penalty=5.0,
             temperature=0.85,
         )
 
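The streaming call now moves `gpt_cond_latent` and `speaker_embedding` onto `device`, a name this diff never defines; presumably it is set earlier in app.py. A minimal sketch of the assumed definition using the usual PyTorch idiom (the `model.to(device)` line is illustrative only; the commit itself merely hints at it with the commented-out `model.cuda()`):

```python
import torch

# Assumed to be defined before get_voice_streaming() is first called;
# the diff uses `device` without showing where it comes from.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Illustrative: the XTTS model and the conditioning latents must live on
# the same device (`model` is the Xtts instance loaded earlier in app.py).
model.to(device)
```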
@@ -388,9 +534,10 @@
                 first_chunk_time = time.time() - t0
                 metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
                 first_chunk = False
-            #print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
 
-            #
+            # print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
+            # Ensure chunk is on the same device and convert to numpy array
             chunk = chunk.detach().cpu().numpy().squeeze()
             chunk = (chunk * 32767).astype(np.int16)
 
@@ -398,26 +545,32 @@
 
     except RuntimeError as e:
         if "device-side assert" in str(e):
-            # cannot do anything on cuda device side error, need
-            print(
-                f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
-                flush=True,
-            )
+            # cannot do anything on cuda device side error, need to restart
+            print(f"Exit due to: Unrecoverable exception caused by prompt: {prompt}", flush=True)
             gr.Warning("Unhandled Exception encounter, please retry in a minute")
             print("Cuda device-assert Runtime encountered need restart")
 
-            # HF Space specific.. This error is unrecoverable need to restart space
+            # HF Space specific.. This error is unrecoverable; need to restart space
            api.restart_space(repo_id=repo_id)
         else:
             print("RuntimeError: non device-side assert error:", str(e))
-            # Does not require warning happens on empty chunk and at end
+            # Does not require warning; happens on empty chunk and at the end
             ###gr.Warning("Unhandled Exception encounter, please retry in a minute")
             return None
         return None
     except:
         return None
 
+
+# In[ ]:
+
+
+
+
+
+# In[19]:
+
+
 # Will be triggered on text submit (will send to generate_speech)
 def add_text(history, text):
     history = [] if history is None else history
@@ -522,31 +675,33 @@ def get_sentence(history, chatbot_role):
     print("ERROR on last sentence history is :", history)
 
 
+# In[19]:
+
+
 from scipy.io.wavfile import write
 from pydub import AudioSegment
 
 second_of_silence = AudioSegment.silent() # use default
 second_of_silence.export("sil.wav", format='wav')
 
-
+
+
+# In[20]:
+
+
 def generate_speech_from_history(history, chatbot_role, sentence):
     language = "autodetect"
-
     # total_wav_bytestream = b""
-
     if len(sentence)==0:
         print("EMPTY SENTENCE")
         return
-
     # Sometimes prompt </s> coming on output remove it
     # Some post process for speech only
     sentence = sentence.replace("</s>", "")
     # remove code from speech
     sentence = re.sub("```.*```", "", sentence, flags=re.DOTALL)
     sentence = re.sub("`.*`", "", sentence, flags=re.DOTALL)
-
     sentence = re.sub("\(.*\)", "", sentence, flags=re.DOTALL)
-
     sentence = sentence.replace("```", "")
     sentence = sentence.replace("...", " ")
     sentence = sentence.replace("(", " ")
@@ -555,8 +710,8 @@ def generate_speech_from_history(history, chatbot_role, sentence):
 
     if len(sentence)==0:
         print("EMPTY SENTENCE after processing")
-        return
-
+        return
+
     # A fast fix for last character, may produce weird sounds if it is with text
     #if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
     #    # just add a space
@@ -579,19 +734,16 @@ def generate_speech_from_history(history, chatbot_role, sentence):
     # Do whatever necessary, first break at hypens then spaces and then even split very long words
     # sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
     sentence_list = split_sentences(sentence, SENTENCE_SPLIT_LENGTH)
-
     print("detected sentences:", sentence_list)
-
     for sentence in sentence_list:
-
         print("- sentence = ", sentence)
-
         if any(c.isalnum() for c in sentence):
             if language=="autodetect":
                 #on first call autodetect, nexts sentence calls will use same language
                 language = detect_language(sentence)
-
-
+            #exists at least 1 alphanumeric (utf-8)
+
+            #print("Inserting data to get_voice_streaming:")
             audio_stream = get_voice_streaming(
                 sentence, language, latent_map[chatbot_role]
             )
@@ -604,7 +756,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
         # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
         if audio_stream is not None:
             sentence_wav_bytestream = b""
-
+
             # frame_length = 0
             for chunk in audio_stream:
                 try:
@@ -626,7 +778,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
                 sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
             except:
                 print("failed to remove noise")
-
+
             # Directly encode the WAV bytestream to base64
             base64_audio = base64.b64encode(pcm_to_wav(sentence_wav_bytestream)).decode('utf8')
 
@@ -653,18 +805,35 @@ def generate_speech_from_history(history, chatbot_role, sentence):
 
     return results
 
+
+# In[21]:
+
+
 latent_map = {}
-
-
-
-
+try:
+    # get the current working directory
+    path= os.getcwd()
+    name1="voices/cloee-1.wav"
+    name2="voices/julian-bedtime-style-1.wav"
+    name3="voices/pirate_by_coqui.wav"
+    name4="voices/thera-1.wav"
+    latent_map["Cloée"] = get_latents(os.path.join(path, name1))
+    latent_map["Julian"] = get_latents(os.path.join(path, name2))
+    latent_map["Pirate"] = get_latents(os.path.join(path, name3))
+    latent_map["Thera"] = get_latents(os.path.join(path, name4))
+
+except Exception as e:
+    print("Error:", str(e))
+
+
+# In[ ]:
+
 
 # Define the main function for the API endpoint that takes the input text and chatbot role
 def generate_story_and_speech(secret_token, input_text, chatbot_role):
     if secret_token != SECRET_TOKEN:
         raise gr.Error(
             f'Invalid secret token. Please fork the original space if you want to use it for yourself.')
-
     # Initialize a list of lists for history with the user input as the first entry
     history = [[input_text, None]]
     story_sentences = get_sentence(history, chatbot_role) # get_sentence function generates text
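`latent_map` now maps each role name to the `(gpt_cond_latent, speaker_embedding)` tuple returned by `get_latents`, and the whole block is wrapped in try/except so a missing voice WAV only logs an error instead of crashing startup. A hedged sketch of how one entry feeds the streaming path, mirroring the call in `generate_speech_from_history`; the sentence and the "en" language code are illustrative:

```python
# Illustrative only: synthesize one sentence with the "Pirate" voice.
latents = latent_map["Pirate"]  # (gpt_cond_latent, speaker_embedding)
audio_stream = get_voice_streaming("Ahoy there, matey!", "en", latents)
if audio_stream is not None:
    pcm = b""
    for chunk in audio_stream:
        pcm += np.asarray(chunk, dtype=np.int16).tobytes()
    wav_bytes = pcm_to_wav(pcm)  # pcm_to_wav() is defined earlier in app.py
```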
@@ -695,4 +864,5 @@ demo = gr.Interface(
 )
 
 demo.queue()
-demo.launch(debug=True)
+demo.launch(debug=True)
+
requirements.txt
CHANGED
@@ -19,4 +19,5 @@ asyncio
 noisereduce==3.0.0
 #deepspeed
 #deepspeed==0.12.6
-deepspeed==0.10.0
+deepspeed==0.10.0
+ipython