ruslanmv committed
Commit 80df7a3 · 1 Parent(s): 10e2779

Testing First Api version

Files changed (2)
  1. app.py +226 -56
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,24 +1,51 @@
  from __future__ import annotations
  import os
  #Use GPU
- is_gpu=False
  #download for mecab
- os.system('python -m unidic download')
  # By using XTTS you agree to CPML license https://coqui.ai/cpml
  os.environ["COQUI_TOS_AGREED"] = "1"
  # NOTE: for streaming will require gradio audio streaming fix
  # pip install --upgrade -y gradio==0.50.2 git+https://github.com/gorkemgoknar/gradio.git@patch-1
  #Now you’re ready to install 🤗 Transformers with the following command:
  if not is_gpu:
- #For CPU-support only, Transformers and PyTorch with:
- os.system('pip install transformers[torch]')
  #pip install 'transformers[tf-cpu]' #Transformers and TensorFlow 2.0:
- os.system('pip install llama-cpp-python==0.2.11')
  else:
  # we need to compile a CUBLAS version
  # Or get it from https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
- os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
  import textwrap
  from scipy.io.wavfile import write
  from pydub import AudioSegment
@@ -61,22 +88,24 @@ import numpy as np
  from gradio_client import Client
  from huggingface_hub import InferenceClient
  # This will trigger downloading model
  print("Downloading if not downloaded Coqui XTTS V2")
-
  from TTS.utils.manage import ModelManager
  model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
  ModelManager().download_model(model_name)
  model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
  print("XTTS downloaded")
-
  if is_gpu:
  use_deepspeed=True
  else:
  use_deepspeed=False
-
  print("Loading XTTS")
  config = XttsConfig()
  config.load_json(os.path.join(model_path, "config.json"))
@@ -89,10 +118,16 @@ model.load_checkpoint(
  eval=True,
  use_deepspeed=use_deepspeed,
  )
- if is_gpu:
- model.cuda()
  print("Done loading TTS")
  #####llm_model = os.environ.get("LLM_MODEL", "mistral") # or "zephyr"
  title = "Voice chat with Zephyr/Mistral and Coqui XTTS"
@@ -141,19 +176,40 @@ pirate_system_message = f"You as {character_name}. {character_scenario} Print ou
  ROLE_PROMPTS["Pirate"]= pirate_system_message
  ##"You are an AI assistant with Zephyr model by Mistral and Hugging Face and speech from Coqui XTTS . User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps, your answers should be clear and short sentences"
-
- ### WILL USE LOCAL MISTRAL OR ZEPHYR
  from huggingface_hub import hf_hub_download
  print("Downloading LLM")
- print("Downloading Zephyr")
- #Zephyr
- hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=".", filename="zephyr-7b-beta.Q5_K_M.gguf")
- # use new gguf format
- zephyr_model_path="./zephyr-7b-beta.Q5_K_M.gguf"
  from llama_cpp import Llama
  # set GPU_LAYERS to 15 if you have a 8GB GPU so both models can fit in
@@ -166,7 +222,7 @@ else:
  LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
  LLAMA_VERBOSE=False
- print("Running LLM Zephyr")
  llm_zephyr = Llama(model_path=zephyr_model_path,
  n_gpu_layers=GPU_LAYERS,
  max_new_tokens=512,
@@ -175,6 +231,12 @@ llm_zephyr = Llama(model_path=zephyr_model_path,
  n_batch=128,
  verbose=LLAMA_VERBOSE)
  def split_sentences(text, max_len):
  # Apply custom rules to enforce sentence breaks with double punctuation
  text = re.sub(r"(\s*\.{2})\s*", r".\1 ", text) # for '..'
@@ -241,7 +303,11 @@ def pcm_to_wav(pcm_data, sample_rate=24000, channels=1, bit_depth=16):
  return wav_header + fmt_subchunk + data_subchunk + pcm_data
- def generate_local(
  prompt,
  history,
  system_message=None,
@@ -302,6 +368,82 @@ def generate_local(
  return output
  def get_latents(speaker_wav,voice_cleanup=False):
  if (voice_cleanup):
  try:
@@ -367,7 +509,11 @@ def detect_language(prompt):
  print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
  return language
-
  def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
  gpt_cond_latent, speaker_embedding = latent_tuple
@@ -376,9 +522,9 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
  chunks = model.inference_stream(
  prompt,
  language,
- gpt_cond_latent,
- speaker_embedding,
- #repetition_penalty=5.0,
  temperature=0.85,
  )
@@ -388,9 +534,10 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
  first_chunk_time = time.time() - t0
  metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
  first_chunk = False
- #print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
- # directly return chunk as bytes for streaming
  chunk = chunk.detach().cpu().numpy().squeeze()
  chunk = (chunk * 32767).astype(np.int16)
@@ -398,26 +545,32 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
  except RuntimeError as e:
  if "device-side assert" in str(e):
- # cannot do anything on cuda device side error, need tor estart
- print(
- f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
- flush=True,
- )
  gr.Warning("Unhandled Exception encounter, please retry in a minute")
  print("Cuda device-assert Runtime encountered need restart")
- # HF Space specific.. This error is unrecoverable need to restart space
  api.restart_space(repo_id=repo_id)
  else:
  print("RuntimeError: non device-side assert error:", str(e))
- # Does not require warning happens on empty chunk and at end
  ###gr.Warning("Unhandled Exception encounter, please retry in a minute")
  return None
  return None
  except:
  return None
-
  # Will be triggered on text submit (will send to generate_speech)
  def add_text(history, text):
  history = [] if history is None else history
@@ -522,31 +675,33 @@ def get_sentence(history, chatbot_role):
  print("ERROR on last sentence history is :", history)
  from scipy.io.wavfile import write
  from pydub import AudioSegment
  second_of_silence = AudioSegment.silent() # use default
  second_of_silence.export("sil.wav", format='wav')
-
  def generate_speech_from_history(history, chatbot_role, sentence):
  language = "autodetect"
-
  # total_wav_bytestream = b""
-
  if len(sentence)==0:
  print("EMPTY SENTENCE")
  return
-
  # Sometimes prompt </s> coming on output remove it
  # Some post process for speech only
  sentence = sentence.replace("</s>", "")
  # remove code from speech
  sentence = re.sub("```.*```", "", sentence, flags=re.DOTALL)
  sentence = re.sub("`.*`", "", sentence, flags=re.DOTALL)
-
  sentence = re.sub("\(.*\)", "", sentence, flags=re.DOTALL)
-
  sentence = sentence.replace("```", "")
  sentence = sentence.replace("...", " ")
  sentence = sentence.replace("(", " ")
@@ -555,8 +710,8 @@ def generate_speech_from_history(history, chatbot_role, sentence):
  if len(sentence)==0:
  print("EMPTY SENTENCE after processing")
- return
-
  # A fast fix for last character, may produce weird sounds if it is with text
  #if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
  # # just add a space
@@ -579,19 +734,16 @@ def generate_speech_from_history(history, chatbot_role, sentence):
  # Do whatever necessary, first break at hypens then spaces and then even split very long words
  # sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
  sentence_list = split_sentences(sentence, SENTENCE_SPLIT_LENGTH)
-
  print("detected sentences:", sentence_list)
-
  for sentence in sentence_list:
-
  print("- sentence = ", sentence)
-
  if any(c.isalnum() for c in sentence):
  if language=="autodetect":
  #on first call autodetect, nexts sentence calls will use same language
  language = detect_language(sentence)
-
- #exists at least 1 alphanumeric (utf-8)
  audio_stream = get_voice_streaming(
  sentence, language, latent_map[chatbot_role]
  )
@@ -604,7 +756,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
  # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
  if audio_stream is not None:
  sentence_wav_bytestream = b""
-
  # frame_length = 0
  for chunk in audio_stream:
  try:
@@ -626,7 +778,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
  sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
  except:
  print("failed to remove noise")
-
  # Directly encode the WAV bytestream to base64
  base64_audio = base64.b64encode(pcm_to_wav(sentence_wav_bytestream)).decode('utf8')
@@ -653,18 +805,35 @@ def generate_speech_from_history(history, chatbot_role, sentence):
  return results
  latent_map = {}
- latent_map["Cloée"] = get_latents("voices/cloee-1.wav")
- latent_map["Julian"] = get_latents("voices/julian-bedtime-style-1.wav")
- latent_map["Pirate"] = get_latents("voices/pirate_by_coqui.wav")
- latent_map["Thera"] = get_latents("voices/thera-1.wav")
  # Define the main function for the API endpoint that takes the input text and chatbot role
  def generate_story_and_speech(secret_token, input_text, chatbot_role):
  if secret_token != SECRET_TOKEN:
  raise gr.Error(
  f'Invalid secret token. Please fork the original space if you want to use it for yourself.')
-
  # Initialize a list of lists for history with the user input as the first entry
  history = [[input_text, None]]
  story_sentences = get_sentence(history, chatbot_role) # get_sentence function generates text
@@ -695,4 +864,5 @@ demo = gr.Interface(
  )
  demo.queue()
- demo.launch(debug=True)

  from __future__ import annotations
+ from IPython.display import clear_output
+ from IPython import get_ipython
  import os
+ #os.system('pip install -r requirements.txt')
+ #os.system('pip install gradio==3.48.0')
+ #os.system('pip install python-dotenv')
+ # In[1]:
  #Use GPU
+ gpu_info = get_ipython().getoutput('nvidia-smi')
+ gpu_info = '\n'.join(gpu_info)
+ if gpu_info.find('failed') >= 0:
+ print('Not connected to a GPU')
+ is_gpu=False
+ else:
+ print(gpu_info)
+ is_gpu=True
+ #is_gpu=False
+ # In[2]:
+ # In[3]:
+ import os
+ import dotenv
+ # Load the environment variables from the .env file
+ dotenv.load_dotenv()
+ # Access the value of the SECRET_TOKEN variable
+ secret_token = os.getenv("SECRET_TOKEN")
+ # In[7]:
+ import os
  #download for mecab
+ os.system("python -m unidic download")
+ # In[5]:
  # By using XTTS you agree to CPML license https://coqui.ai/cpml
  os.environ["COQUI_TOS_AGREED"] = "1"
  # NOTE: for streaming will require gradio audio streaming fix
  # pip install --upgrade -y gradio==0.50.2 git+https://github.com/gorkemgoknar/gradio.git@patch-1
  #Now you’re ready to install 🤗 Transformers with the following command:
+ #For CPU-support only, Transformers and PyTorch with:
+ os.system('pip install transformers[torch]')
  if not is_gpu:
  #pip install 'transformers[tf-cpu]' #Transformers and TensorFlow 2.0:
+ os.system('pip install llama-cpp-python==0.2.11')
  else:
  # we need to compile a CUBLAS version
  # Or get it from https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
+ os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
+
+ # In[8]:
  import textwrap
  from scipy.io.wavfile import write
  from pydub import AudioSegment
  from gradio_client import Client
  from huggingface_hub import InferenceClient
+
+
+ # In[9]:
+
+
  # This will trigger downloading model
  print("Downloading if not downloaded Coqui XTTS V2")
  from TTS.utils.manage import ModelManager
  model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
  ModelManager().download_model(model_name)
  model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
  print("XTTS downloaded")
  if is_gpu:
  use_deepspeed=True
  else:
  use_deepspeed=False
  print("Loading XTTS")
  config = XttsConfig()
  config.load_json(os.path.join(model_path, "config.json"))
  eval=True,
  use_deepspeed=use_deepspeed,
  )
+
+ #if is_gpu:
+ # model.cuda()
+
  print("Done loading TTS")
+
+ # In[60]:
+
+
  #####llm_model = os.environ.get("LLM_MODEL", "mistral") # or "zephyr"
  title = "Voice chat with Zephyr/Mistral and Coqui XTTS"
  ROLE_PROMPTS["Pirate"]= pirate_system_message
  ##"You are an AI assistant with Zephyr model by Mistral and Hugging Face and speech from Coqui XTTS . User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps, your answers should be clear and short sentences"
+ # In[49]:
+
+
+
+ # In[15]:
+
+
+ ### WILL USE LOCAL MISTRAL OR ZEPHYR
+ import os
  from huggingface_hub import hf_hub_download
+
  print("Downloading LLM")
+ # Get the current directory
+ current_dir = os.getcwd()
+ # Append the current directory to the zephyr_model_path
+ zephyr_model_path = os.path.join(current_dir, "zephyr-7b-beta.Q5_K_M.gguf")
+ if not os.path.isfile(zephyr_model_path):
+ print("Downloading Zephyr")
+ hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=current_dir, filename="zephyr-7b-beta.Q5_K_M.gguf")
+ else:
+ print("Zephyr it is already downloaded")
+
+
+ # In[ ]:
+
+
+ # In[16]:
  from llama_cpp import Llama
  # set GPU_LAYERS to 15 if you have a 8GB GPU so both models can fit in
  LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
  LLAMA_VERBOSE=False
+
  llm_zephyr = Llama(model_path=zephyr_model_path,
  n_gpu_layers=GPU_LAYERS,
  max_new_tokens=512,
  n_batch=128,
  verbose=LLAMA_VERBOSE)
+ print("Running LLM Zephyr")
+
+
+ # In[17]:
+
+
  def split_sentences(text, max_len):
  # Apply custom rules to enforce sentence breaks with double punctuation
  text = re.sub(r"(\s*\.{2})\s*", r".\1 ", text) # for '..'
  return wav_header + fmt_subchunk + data_subchunk + pcm_data
+
+ # In[23]:
+
+
+ def generate_local_llm(
  prompt,
  history,
  system_message=None,
  return output
+
+ # In[28]:
+
+
+ get_ipython().system('pip install OpenAI')
+
+
+ # In[103]:
+
+
+ def generate_stream(prompt, model="mixtral-8x7b"):
+ base_url = "https://ruslanmv-hf-llm-api.hf.space"
+ api_key = "sk-xxxxx"
+ client = OpenAI(base_url=base_url, api_key=api_key)
+ response = client.chat.completions.create(
+ model=model,
+ messages=[
+ {
+ "role": "user",
+ "content": "{}".format(prompt),
+ }
+ ],
+ stream=True,
+ )
+ return response
+ def generate_local(
+ prompt,
+ history,
+ system_message=None,
+ temperature=0.8,
+ max_tokens=256,
+ top_p=0.95,
+ stop=None,
+ ):
+
+ formatted_prompt = format_prompt_zephyr(prompt, history, system_message=system_message)
+ try:
+ print("LLM Input:", formatted_prompt)
+ output = ""
+ stream=generate_stream(formatted_prompt)
+ for response in stream:
+ character=response.choices[0].delta.content
+ if "<|user|>" in character:
+ # end of context
+ return
+ if emoji.is_emoji(character):
+ # Bad emoji not a meaning messes chat from next lines
+ return
+ if character is not None:
+ print(character, end="", flush=True)
+ output += character
+ elif response.choices[0].finish_reason == "stop":
+ print()
+ else:
+ pass
+ yield output
+
+ except Exception as e:
+ if "Too Many Requests" in str(e):
+ print("ERROR: Too many requests on mistral client")
+ #gr.Warning("Unfortunately Mistral is unable to process")
+ output = "Unfortunately I am not able to process your request now !"
+ else:
+ print("Unhandled Exception: ", str(e))
+ #gr.Warning("Unfortunately Mistral is unable to process")
+ output = "I do not know what happened but I could not understand you ."
+
+ return output
+
+
+ # In[ ]:
+
+
+ # In[17]:
+
+
  def get_latents(speaker_wav,voice_cleanup=False):
  if (voice_cleanup):
  try:
  print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
  return language
+
+
+ # In[18]:
+
+
  def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
  gpt_cond_latent, speaker_embedding = latent_tuple
  chunks = model.inference_stream(
  prompt,
  language,
+ gpt_cond_latent.to(device), # Ensure gpt_cond_latent is on the same device
+ speaker_embedding.to(device), # Ensure speaker_embedding is on the same device
+ # repetition_penalty=5.0,
  temperature=0.85,
  )
  first_chunk_time = time.time() - t0
  metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
  first_chunk = False
+ # print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
+ # Ensure chunk is on the same device and convert to numpy array
  chunk = chunk.detach().cpu().numpy().squeeze()
  chunk = (chunk * 32767).astype(np.int16)
  except RuntimeError as e:
  if "device-side assert" in str(e):
+ # cannot do anything on cuda device side error, need to restart
+ print(f"Exit due to: Unrecoverable exception caused by prompt: {prompt}", flush=True)
  gr.Warning("Unhandled Exception encounter, please retry in a minute")
  print("Cuda device-assert Runtime encountered need restart")
+ # HF Space specific.. This error is unrecoverable; need to restart space
  api.restart_space(repo_id=repo_id)
  else:
  print("RuntimeError: non device-side assert error:", str(e))
+ # Does not require warning; happens on empty chunk and at the end
  ###gr.Warning("Unhandled Exception encounter, please retry in a minute")
  return None
  return None
  except:
  return None
+
+ # In[ ]:
+
+
+
+
+ # In[19]:
+
+
  # Will be triggered on text submit (will send to generate_speech)
  def add_text(history, text):
  history = [] if history is None else history
  print("ERROR on last sentence history is :", history)
+ # In[19]:
+
+
  from scipy.io.wavfile import write
  from pydub import AudioSegment
  second_of_silence = AudioSegment.silent() # use default
  second_of_silence.export("sil.wav", format='wav')
+
+
+ # In[20]:
+
+
  def generate_speech_from_history(history, chatbot_role, sentence):
  language = "autodetect"
  # total_wav_bytestream = b""
  if len(sentence)==0:
  print("EMPTY SENTENCE")
  return
  # Sometimes prompt </s> coming on output remove it
  # Some post process for speech only
  sentence = sentence.replace("</s>", "")
  # remove code from speech
  sentence = re.sub("```.*```", "", sentence, flags=re.DOTALL)
  sentence = re.sub("`.*`", "", sentence, flags=re.DOTALL)
  sentence = re.sub("\(.*\)", "", sentence, flags=re.DOTALL)
  sentence = sentence.replace("```", "")
  sentence = sentence.replace("...", " ")
  sentence = sentence.replace("(", " ")
  if len(sentence)==0:
  print("EMPTY SENTENCE after processing")
+ return
+
  # A fast fix for last character, may produce weird sounds if it is with text
  #if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
  # # just add a space
  # Do whatever necessary, first break at hypens then spaces and then even split very long words
  # sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
  sentence_list = split_sentences(sentence, SENTENCE_SPLIT_LENGTH)
  print("detected sentences:", sentence_list)
 
738
  for sentence in sentence_list:
 
739
  print("- sentence = ", sentence)
 
740
  if any(c.isalnum() for c in sentence):
741
  if language=="autodetect":
742
  #on first call autodetect, nexts sentence calls will use same language
743
  language = detect_language(sentence)
744
+ #exists at least 1 alphanumeric (utf-8)
745
+
746
+ #print("Inserting data to get_voice_streaming:")
747
  audio_stream = get_voice_streaming(
748
  sentence, language, latent_map[chatbot_role]
749
  )
 
756
  # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
757
  if audio_stream is not None:
758
  sentence_wav_bytestream = b""
759
+
760
  # frame_length = 0
761
  for chunk in audio_stream:
762
  try:
 
778
  sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
779
  except:
780
  print("failed to remove noise")
781
+
782
  # Directly encode the WAV bytestream to base64
783
  base64_audio = base64.b64encode(pcm_to_wav(sentence_wav_bytestream)).decode('utf8')
784
 
 
805
 
806
  return results
807
 
808
+
809
+ # In[21]:
810
+
811
+
812
  latent_map = {}
+ try:
+ # get the current working directory
+ path= os.getcwd()
+ name1="voices/cloee-1.wav"
+ name2="voices/julian-bedtime-style-1.wav"
+ name3="voices/pirate_by_coqui.wav"
+ name4="voices/thera-1.wav"
+ latent_map["Cloée"] = get_latents(os.path.join(path, name1))
+ latent_map["Julian"] = get_latents(os.path.join(path, name2))
+ latent_map["Pirate"] = get_latents(os.path.join(path, name3))
+ latent_map["Thera"] = get_latents(os.path.join(path, name4))
+
+ except Exception as e:
+ print("Error:", str(e))
+
+
+ # In[ ]:
+
  # Define the main function for the API endpoint that takes the input text and chatbot role
  def generate_story_and_speech(secret_token, input_text, chatbot_role):
  if secret_token != SECRET_TOKEN:
  raise gr.Error(
  f'Invalid secret token. Please fork the original space if you want to use it for yourself.')
  # Initialize a list of lists for history with the user input as the first entry
  history = [[input_text, None]]
  story_sentences = get_sentence(history, chatbot_role) # get_sentence function generates text
  )
  demo.queue()
+ demo.launch(debug=True)
+
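The headline change of this commit is exposing the pipeline as an API. A minimal client-side sketch of how the new generate_story_and_speech endpoint could be smoke-tested with gradio_client; the Space id and the "/predict" route are assumptions for illustration and are not shown in the diff:

# Hypothetical smoke test for the new endpoint (not part of the commit).
import os
from gradio_client import Client

client = Client("ruslanmv/voice-chat-space")  # assumed Space id; replace with the real one
result = client.predict(
    os.environ["SECRET_TOKEN"],        # secret_token checked by generate_story_and_speech
    "Tell me a short bedtime story.",  # input_text
    "Julian",                          # chatbot_role, one of the latent_map keys
    api_name="/predict",               # assumed default route of the single gr.Interface
)
print(result)  # result shape mirrors the Interface outputs, which this hunk does not show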
requirements.txt CHANGED
@@ -19,4 +19,5 @@ asyncio
  noisereduce==3.0.0
  #deepspeed
  #deepspeed==0.12.6
- deepspeed==0.10.0
+ deepspeed==0.10.0
+ ipython
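The other half of what this commit tests is the remote, OpenAI-compatible completion stream that the new generate_stream() wraps. A standalone sketch of that call pattern, using the base_url and the placeholder key exactly as they appear in the diff:

# Minimal sketch of the streaming call used by generate_stream() (values taken from the diff).
from openai import OpenAI

client = OpenAI(base_url="https://ruslanmv-hf-llm-api.hf.space", api_key="sk-xxxxx")
stream = client.chat.completions.create(
    model="mixtral-8x7b",
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
    stream=True,
)
text = ""
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:  # the final chunk carries no content delta
        text += delta
        print(delta, end="", flush=True)
print()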