Testing first API version
- app.py +226 -56
- requirements.txt +2 -1
app.py
CHANGED
@@ -1,24 +1,51 @@
 from __future__ import annotations
+from IPython.display import clear_output
+from IPython import get_ipython
 import os
+#os.system('pip install -r requirements.txt')
+#os.system('pip install gradio==3.48.0')
+#os.system('pip install python-dotenv')
+# In[1]:
 #Use GPU
-
+gpu_info = get_ipython().getoutput('nvidia-smi')
+gpu_info = '\n'.join(gpu_info)
+if gpu_info.find('failed') >= 0:
+    print('Not connected to a GPU')
+    is_gpu=False
+else:
+    print(gpu_info)
+    is_gpu=True
 
+#is_gpu=False
+# In[2]:
+# In[3]:
+import os
+import dotenv
+# Load the environment variables from the .env file
+dotenv.load_dotenv()
+# Access the value of the SECRET_TOKEN variable
+secret_token = os.getenv("SECRET_TOKEN")
+# In[7]:
+import os
 #download for mecab
-os.system(
+os.system("python -m unidic download")
+# In[5]:
 # By using XTTS you agree to CPML license https://coqui.ai/cpml
 os.environ["COQUI_TOS_AGREED"] = "1"
 # NOTE: for streaming will require gradio audio streaming fix
 # pip install --upgrade -y gradio==0.50.2 git+https://github.com/gorkemgoknar/gradio.git@patch-1
 #Now you’re ready to install 🤗 Transformers with the following command:
+#For CPU-support only, Transformers and PyTorch with:
+os.system('pip install transformers[torch]')
 if not is_gpu:
-    #For CPU-support only, Transformers and PyTorch with:
-    os.system('pip install transformers[torch]')
     #pip install 'transformers[tf-cpu]' #Transformers and TensorFlow 2.0:
-    os.system('pip install llama-cpp-python==0.2.11')
+    os.system('pip install llama-cpp-python==0.2.11')
 else:
     # we need to compile a CUBLAS version
     # Or get it from https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
-    os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
+    os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
+
+# In[8]:
 import textwrap
 from scipy.io.wavfile import write
 from pydub import AudioSegment
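The new startup block above probes for a GPU by shelling out to nvidia-smi through `get_ipython()`, which only works when app.py runs under an IPython kernel (hence the new `ipython` requirement below). As a rough sketch of the same check for a plain `python app.py` process, the standard library could be used instead; `detect_gpu` here is a hypothetical helper, not part of this commit:

```python
import shutil
import subprocess

def detect_gpu() -> bool:
    """Illustrative: report whether nvidia-smi exists and runs successfully."""
    if shutil.which("nvidia-smi") is None:
        return False
    try:
        result = subprocess.run(["nvidia-smi"], capture_output=True, text=True, timeout=10)
        return result.returncode == 0 and "failed" not in result.stdout.lower()
    except (subprocess.SubprocessError, OSError):
        return False

is_gpu = detect_gpu()
print("GPU detected" if is_gpu else "Not connected to a GPU")
```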
@@ -61,22 +88,24 @@ import numpy as np
 from gradio_client import Client
 from huggingface_hub import InferenceClient
 
+
+
+# In[9]:
+
+
 # This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V2")
-
 from TTS.utils.manage import ModelManager
 model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
 ModelManager().download_model(model_name)
 model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
 print("XTTS downloaded")
 
-
 if is_gpu:
     use_deepspeed=True
 else:
     use_deepspeed=False
 
-
 print("Loading XTTS")
 config = XttsConfig()
 config.load_json(os.path.join(model_path, "config.json"))
@@ -89,10 +118,16 @@ model.load_checkpoint(
     eval=True,
     use_deepspeed=use_deepspeed,
 )
-
-
+
+#if is_gpu:
+#    model.cuda()
+
 print("Done loading TTS")
 
+
+# In[60]:
+
+
 #####llm_model = os.environ.get("LLM_MODEL", "mistral") # or "zephyr"
 
 title = "Voice chat with Zephyr/Mistral and Coqui XTTS"
@@ -141,19 +176,40 @@ pirate_system_message = f"You as {character_name}. {character_scenario} Print ou
 ROLE_PROMPTS["Pirate"]= pirate_system_message
 ##"You are an AI assistant with Zephyr model by Mistral and Hugging Face and speech from Coqui XTTS . User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps, your answers should be clear and short sentences"
 
+# In[49]:
 
-
-### WILL USE LOCAL MISTRAL OR ZEPHYR
 
+
+
+
+# In[15]:
+
+
+### WILL USE LOCAL MISTRAL OR ZEPHYR
+import os
 from huggingface_hub import hf_hub_download
+
 print("Downloading LLM")
 
+# Get the current directory
+current_dir = os.getcwd()
+# Append the current directory to the zephyr_model_path
+zephyr_model_path = os.path.join(current_dir, "zephyr-7b-beta.Q5_K_M.gguf")
+if not os.path.isfile(zephyr_model_path):
+    print("Downloading Zephyr")
+    hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=current_dir, filename="zephyr-7b-beta.Q5_K_M.gguf")
+else:
+    print("Zephyr it is already downloaded")
+
+
+# In[ ]:
+
+
+
+
+
+# In[16]:
 
-print("Downloading Zephyr")
-#Zephyr
-hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=".", filename="zephyr-7b-beta.Q5_K_M.gguf")
-# use new gguf format
-zephyr_model_path="./zephyr-7b-beta.Q5_K_M.gguf"
 
 from llama_cpp import Llama
 # set GPU_LAYERS to 15 if you have a 8GB GPU so both models can fit in
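The Zephyr download is now guarded by an `os.path.isfile` check so that Space restarts skip re-fetching the multi-gigabyte GGUF file. A minimal sketch of the same pattern as a reusable helper; `ensure_gguf` is hypothetical and only assumes `huggingface_hub` is installed:

```python
import os
from huggingface_hub import hf_hub_download

def ensure_gguf(repo_id: str, filename: str, target_dir: str = ".") -> str:
    """Illustrative: download a GGUF file only when it is not already on disk."""
    path = os.path.join(target_dir, filename)
    if not os.path.isfile(path):
        print(f"Downloading {filename} from {repo_id}")
        hf_hub_download(repo_id=repo_id, filename=filename, local_dir=target_dir)
    else:
        print(f"{filename} already present, skipping download")
    return path

# Usage mirroring the committed code:
# zephyr_model_path = ensure_gguf("TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_M.gguf", os.getcwd())
```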
@@ -166,7 +222,7 @@ else:
 LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
 LLAMA_VERBOSE=False
 
-
+
 llm_zephyr = Llama(model_path=zephyr_model_path,
                    n_gpu_layers=GPU_LAYERS,
                    max_new_tokens=512,
@@ -175,6 +231,12 @@ llm_zephyr = Llama(model_path=zephyr_model_path,
                    n_batch=128,
                    verbose=LLAMA_VERBOSE)
 
+print("Running LLM Zephyr")
+
+
+# In[17]:
+
+
 def split_sentences(text, max_len):
     # Apply custom rules to enforce sentence breaks with double punctuation
     text = re.sub(r"(\s*\.{2})\s*", r".\1 ", text) # for '..'
@@ -241,7 +303,11 @@ def pcm_to_wav(pcm_data, sample_rate=24000, channels=1, bit_depth=16):
 
     return wav_header + fmt_subchunk + data_subchunk + pcm_data
 
-def generate_local(
+
+# In[23]:
+
+
+def generate_local_llm(
     prompt,
     history,
     system_message=None,
@@ -302,6 +368,82 @@ def generate_local(
 
     return output
 
+
+# In[28]:
+
+
+get_ipython().system('pip install OpenAI')
+
+
+# In[103]:
+
+
+def generate_stream(prompt, model="mixtral-8x7b"):
+    base_url = "https://ruslanmv-hf-llm-api.hf.space"
+    api_key = "sk-xxxxx"
+    client = OpenAI(base_url=base_url, api_key=api_key)
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": "{}".format(prompt),
+            }
+        ],
+        stream=True,
+    )
+    return response
+def generate_local(
+    prompt,
+    history,
+    system_message=None,
+    temperature=0.8,
+    max_tokens=256,
+    top_p=0.95,
+    stop=None,
+):
+
+    formatted_prompt = format_prompt_zephyr(prompt, history, system_message=system_message)
+    try:
+        print("LLM Input:", formatted_prompt)
+        output = ""
+        stream=generate_stream(formatted_prompt)
+        for response in stream:
+            character=response.choices[0].delta.content
+            if "<|user|>" in character:
+                # end of context
+                return
+            if emoji.is_emoji(character):
+                # Bad emoji not a meaning messes chat from next lines
+                return
+            if character is not None:
+                print(character, end="", flush=True)
+                output += character
+            elif response.choices[0].finish_reason == "stop":
+                print()
+            else:
+                pass
+            yield output
+
+    except Exception as e:
+        if "Too Many Requests" in str(e):
+            print("ERROR: Too many requests on mistral client")
+            #gr.Warning("Unfortunately Mistral is unable to process")
+            output = "Unfortunately I am not able to process your request now !"
+        else:
+            print("Unhandled Exception: ", str(e))
+            #gr.Warning("Unfortunately Mistral is unable to process")
+            output = "I do not know what happened but I could not understand you ."
+
+    return output
+
+
+# In[ ]:
+
+
+# In[17]:
+
+
 def get_latents(speaker_wav,voice_cleanup=False):
     if (voice_cleanup):
         try:
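The added `generate_stream` targets an OpenAI-compatible endpoint and `generate_local` consumes it as a generator, yielding partial output per streamed delta. A hedged usage sketch, assuming the `pip install OpenAI` step has run, `from openai import OpenAI` (1.x client) is available, and the endpoint above accepts any API key; the prompt text is illustrative:

```python
from openai import OpenAI  # assumes the 1.x OpenAI Python client is installed

client = OpenAI(base_url="https://ruslanmv-hf-llm-api.hf.space", api_key="sk-xxxxx")

# Stream a chat completion and print tokens as they arrive.
stream = client.chat.completions.create(
    model="mixtral-8x7b",
    messages=[{"role": "user", "content": "Tell me a one-sentence pirate greeting."}],
    stream=True,
)
for event in stream:
    token = event.choices[0].delta.content
    if token:  # delta.content can be None on the final streamed chunk
        print(token, end="", flush=True)
print()
```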
@@ -367,7 +509,11 @@ def detect_language(prompt):
     print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
 
     return language
-
+
+
+# In[18]:
+
+
 def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
     gpt_cond_latent, speaker_embedding = latent_tuple
 
@@ -376,9 +522,9 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
         chunks = model.inference_stream(
             prompt,
             language,
-            gpt_cond_latent,
-            speaker_embedding,
-            #repetition_penalty=5.0,
+            gpt_cond_latent.to(device),   # Ensure gpt_cond_latent is on the same device
+            speaker_embedding.to(device), # Ensure speaker_embedding is on the same device
+            # repetition_penalty=5.0,
             temperature=0.85,
         )
 
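The streaming call now moves `gpt_cond_latent` and `speaker_embedding` onto `device`, a name this diff never defines; presumably it is set earlier in app.py. A minimal sketch of the assumed definition using the usual PyTorch idiom (the `model.to(device)` line is illustrative only; the commit itself merely hints at it with the commented-out `model.cuda()`):

```python
import torch

# Assumed to be defined before get_voice_streaming() is first called;
# the diff uses `device` without showing where it comes from.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Illustrative: the XTTS model and the conditioning latents must live on
# the same device (`model` is the Xtts instance loaded earlier in app.py).
model.to(device)
```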
@@ -388,9 +534,10 @@
                 first_chunk_time = time.time() - t0
                 metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
                 first_chunk = False
-            #print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
 
-            #
+            # print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
+            # Ensure chunk is on the same device and convert to numpy array
             chunk = chunk.detach().cpu().numpy().squeeze()
             chunk = (chunk * 32767).astype(np.int16)
 
@@ -398,26 +545,32 @@
 
     except RuntimeError as e:
         if "device-side assert" in str(e):
-            # cannot do anything on cuda device side error, need
-            print(
-                f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
-                flush=True,
-            )
+            # cannot do anything on cuda device side error, need to restart
+            print(f"Exit due to: Unrecoverable exception caused by prompt: {prompt}", flush=True)
             gr.Warning("Unhandled Exception encounter, please retry in a minute")
             print("Cuda device-assert Runtime encountered need restart")
 
-            # HF Space specific.. This error is unrecoverable need to restart space
+            # HF Space specific.. This error is unrecoverable; need to restart space
            api.restart_space(repo_id=repo_id)
         else:
             print("RuntimeError: non device-side assert error:", str(e))
-            # Does not require warning happens on empty chunk and at end
+            # Does not require warning; happens on empty chunk and at the end
             ###gr.Warning("Unhandled Exception encounter, please retry in a minute")
             return None
         return None
     except:
         return None
 
+
+# In[ ]:
+
+
+
+
+
+# In[19]:
+
+
 # Will be triggered on text submit (will send to generate_speech)
 def add_text(history, text):
     history = [] if history is None else history
@@ -522,31 +675,33 @@ def get_sentence(history, chatbot_role):
     print("ERROR on last sentence history is :", history)
 
 
+# In[19]:
+
+
 from scipy.io.wavfile import write
 from pydub import AudioSegment
 
 second_of_silence = AudioSegment.silent() # use default
 second_of_silence.export("sil.wav", format='wav')
 
-
+
+
+# In[20]:
+
+
 def generate_speech_from_history(history, chatbot_role, sentence):
     language = "autodetect"
-
     # total_wav_bytestream = b""
-
     if len(sentence)==0:
         print("EMPTY SENTENCE")
         return
-
     # Sometimes prompt </s> coming on output remove it
     # Some post process for speech only
     sentence = sentence.replace("</s>", "")
     # remove code from speech
     sentence = re.sub("```.*```", "", sentence, flags=re.DOTALL)
     sentence = re.sub("`.*`", "", sentence, flags=re.DOTALL)
-
     sentence = re.sub("\(.*\)", "", sentence, flags=re.DOTALL)
-
     sentence = sentence.replace("```", "")
     sentence = sentence.replace("...", " ")
     sentence = sentence.replace("(", " ")
@@ -555,8 +710,8 @@ def generate_speech_from_history(history, chatbot_role, sentence):
 
     if len(sentence)==0:
         print("EMPTY SENTENCE after processing")
-        return
-
+        return
+
     # A fast fix for last character, may produce weird sounds if it is with text
     #if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
     #    # just add a space
@@ -579,19 +734,16 @@ def generate_speech_from_history(history, chatbot_role, sentence):
     # Do whatever necessary, first break at hypens then spaces and then even split very long words
     # sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
     sentence_list = split_sentences(sentence, SENTENCE_SPLIT_LENGTH)
-
     print("detected sentences:", sentence_list)
-
     for sentence in sentence_list:
-
         print("- sentence = ", sentence)
-
         if any(c.isalnum() for c in sentence):
             if language=="autodetect":
                 #on first call autodetect, nexts sentence calls will use same language
                 language = detect_language(sentence)
-
-
+            #exists at least 1 alphanumeric (utf-8)
+
+            #print("Inserting data to get_voice_streaming:")
             audio_stream = get_voice_streaming(
                 sentence, language, latent_map[chatbot_role]
             )
@@ -604,7 +756,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
         # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
         if audio_stream is not None:
             sentence_wav_bytestream = b""
-
+
             # frame_length = 0
             for chunk in audio_stream:
                 try:
@@ -626,7 +778,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
                 sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
             except:
                 print("failed to remove noise")
-
+
             # Directly encode the WAV bytestream to base64
             base64_audio = base64.b64encode(pcm_to_wav(sentence_wav_bytestream)).decode('utf8')
 
@@ -653,18 +805,35 @@ def generate_speech_from_history(history, chatbot_role, sentence):
 
     return results
 
+
+# In[21]:
+
+
 latent_map = {}
-
-
-
-
+try:
+    # get the current working directory
+    path= os.getcwd()
+    name1="voices/cloee-1.wav"
+    name2="voices/julian-bedtime-style-1.wav"
+    name3="voices/pirate_by_coqui.wav"
+    name4="voices/thera-1.wav"
+    latent_map["Cloée"] = get_latents(os.path.join(path, name1))
+    latent_map["Julian"] = get_latents(os.path.join(path, name2))
+    latent_map["Pirate"] = get_latents(os.path.join(path, name3))
+    latent_map["Thera"] = get_latents(os.path.join(path, name4))
+
+except Exception as e:
+    print("Error:", str(e))
+
+
+# In[ ]:
+
 
 # Define the main function for the API endpoint that takes the input text and chatbot role
 def generate_story_and_speech(secret_token, input_text, chatbot_role):
     if secret_token != SECRET_TOKEN:
         raise gr.Error(
             f'Invalid secret token. Please fork the original space if you want to use it for yourself.')
-
     # Initialize a list of lists for history with the user input as the first entry
     history = [[input_text, None]]
     story_sentences = get_sentence(history, chatbot_role) # get_sentence function generates text
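`latent_map` now maps each role name to the `(gpt_cond_latent, speaker_embedding)` tuple returned by `get_latents`, and the whole block is wrapped in try/except so a missing voice WAV only logs an error instead of crashing startup. A hedged sketch of how one entry feeds the streaming path, mirroring the call in `generate_speech_from_history`; the sentence and the "en" language code are illustrative:

```python
# Illustrative only: synthesize one sentence with the "Pirate" voice.
latents = latent_map["Pirate"]  # (gpt_cond_latent, speaker_embedding)
audio_stream = get_voice_streaming("Ahoy there, matey!", "en", latents)
if audio_stream is not None:
    pcm = b""
    for chunk in audio_stream:
        pcm += np.asarray(chunk, dtype=np.int16).tobytes()
    wav_bytes = pcm_to_wav(pcm)  # pcm_to_wav() is defined earlier in app.py
```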
@@ -695,4 +864,5 @@ demo = gr.Interface(
 )
 
 demo.queue()
-demo.launch(debug=True)
+demo.launch(debug=True)
+
requirements.txt
CHANGED
@@ -19,4 +19,5 @@ asyncio
 noisereduce==3.0.0
 #deepspeed
 #deepspeed==0.12.6
-deepspeed==0.10.0
+deepspeed==0.10.0
+ipython