VanYsa committed
Commit 930e925 · 1 Parent(s): a6cdcf6

Update app.py

Changed LLM to test

Files changed (1)
  1. app.py +45 -79
app.py CHANGED
@@ -5,16 +5,11 @@ import os
import soundfile as sf
import tempfile
import uuid
- import os
+ import transformers
import torch
import time
- from transformers import GemmaTokenizer, AutoModelForCausalLM
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
- from threading import Thread

from nemo.collections.asr.models import ASRModel
- from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
- from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED

SAMPLE_RATE = 16000 # Hz
MAX_AUDIO_SECONDS = 40 # wont try to transcribe if longer than this
@@ -29,15 +24,14 @@ DESCRIPTION = '''
'''
PLACEHOLDER = """
<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
- <img src="MyAlexaLogo.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
+ <img src="https://huggingface.co/spaces/VanYsa/MyAlexa/blob/main/MyAlexaLogo.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
<p style="font-size: 28px; margin-bottom: 2px; opacity: 0.65;">What's on your mind?</p>
</div>
"""

- # Set an environment variable
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
-
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ ### ASR model
canary_model = ASRModel.from_pretrained("nvidia/canary-1b").to(device)
canary_model.eval()

@@ -47,29 +41,14 @@ decoding_cfg = canary_model.cfg.decoding
decoding_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decoding_cfg)

- # setup for buffered inference
- canary_model.cfg.preprocessor.dither = 0.0
- canary_model.cfg.preprocessor.pad_to = 0
-
- feature_stride = canary_model.cfg.preprocessor['window_stride']
- model_stride_in_secs = feature_stride * 8 # 8 = model stride, which is 8 for FastConformer
-
- frame_asr = FrameBatchMultiTaskAED(
-     asr_model=canary_model,
-     frame_len=40.0,
-     total_buffer=40.0,
-     batch_size=16,
+ ### LLM model
+ pipeline = transformers.pipeline(
+     "text-generation",
+     model="meta-llama/Meta-Llama-3-8B-Instruct",
+     model_kwargs={"torch_dtype": torch.bfloat16},
+     device=device
)

- amp_dtype = torch.float16
-
- tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
- llama3_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct").to(device)
- terminators = [
-     tokenizer.eos_token_id,
-     tokenizer.convert_tokens_to_ids("<|eot_id|>")
- ]
-
def convert_audio(audio_filepath, tmpdir, utt_id):
    """
    Convert all files to monochannel 16 kHz wav files.
@@ -142,62 +121,49 @@ def add_message(history, message):
    history.append((message, None))
    return history, gr.Textbox(value="", interactive=False)

- def bot(history, message):
+ def bot(history,message):
    """
    Prints the LLM's response in the chatbot
    """
-     response = chat_llama3_8b(message, history, 0.95, 512)
+     response = bot_response(history, message)
    history[-1][1] = ""
    for character in response:
        history[-1][1] += character
        time.sleep(0.05)
        yield history

- def chat_llama3_8b(message: str,
-                    history: list,
-                    temperature: float,
-                    max_new_tokens: int
-                    ) -> str: # type: ignore
-     """
-     Generate a streaming response using the llama3-8b model.
-     Args:
-         message (str): The input message.
-         history (list): The conversation history used by ChatInterface.
-         temperature (float): The temperature for generating the response.
-         max_new_tokens (int): The maximum number of new tokens to generate.
-     Returns:
-         str: The generated response.
-     """
-     conversation = []
-     for user, assistant in history:
-         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-     conversation.append({"role": "user", "content": message})
-
-     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(device)
-
-     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-
-     generate_kwargs = dict(
-         input_ids= input_ids,
-         streamer=streamer,
-         max_new_tokens=max_new_tokens,
-         do_sample=True,
-         temperature=temperature,
-         eos_token_id=terminators,
-     )
-     # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash.
-     if temperature == 0:
-         generate_kwargs['do_sample'] = False
-
-     t = Thread(target=llama3_model.generate, kwargs=generate_kwargs)
-     t.start()
-
-     outputs = []
-     for text in streamer:
-         outputs.append(text)
-         #print(outputs)
-         yield "".join(outputs)
-
+ def bot_response(history, message):
+     """
+     Generates a response from the LLM model.
+     Temperature and top_p are set to 0.6 and 0.9 respectively.
+     """
+     messages = [
+         {"role": "system", "content": "You are a helpful AI assistant."},
+         {"role": "user", "content": message},
+     ]
+
+     prompt = pipeline.tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+
+     terminators = [
+         pipeline.tokenizer.eos_token_id,
+         pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+     ]
+
+     outputs = pipeline(
+         prompt,
+         max_new_tokens=512,
+         eos_token_id=terminators,
+         do_sample=True,
+         temperature=0.6,
+         top_p=0.9,
+     )
+     print(outputs[0]["generated_text"][len(prompt):])
+     return outputs[0]["generated_text"][len(prompt):]
+
with gr.Blocks(
    title="MyAlexa",
    css="""
@@ -255,7 +221,7 @@ with gr.Blocks(
    )

    chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot, chat_input])
-     bot_msg = chat_msg.then(bot, [chatbot, chat_msg], chatbot, api_name="bot_response")
+     bot_msg = chat_msg.then(bot, [chatbot, chat_input], chatbot, api_name="bot_response")
    bot_msg.then(lambda: gr.Textbox(interactive=True), None, [chat_input])

    submit_button.click(
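
For reference, this commit swaps the hand-rolled AutoModelForCausalLM + TextIteratorStreamer setup for a single transformers text-generation pipeline. Below is a minimal standalone sketch of the new generation path; the generate_reply wrapper and the sample prompt are illustrative only and are not part of app.py, and the sketch assumes access to the gated meta-llama/Meta-Llama-3-8B-Instruct checkpoint (e.g. after huggingface-cli login) plus enough memory for bfloat16 weights.

# Minimal sketch of the pipeline-based generation path introduced in this commit.
# Assumption: the gated meta-llama/Meta-Llama-3-8B-Instruct checkpoint is accessible.
import torch
import transformers

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pipeline = transformers.pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=device,
)

def generate_reply(message: str) -> str:
    # Build a Llama-3 chat prompt for a single user turn, mirroring bot_response().
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": message},
    ]
    prompt = pipeline.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Stop on either the standard EOS token or Llama-3's <|eot_id|> turn delimiter.
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    outputs = pipeline(
        prompt,
        max_new_tokens=512,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    # The pipeline returns the prompt plus the completion; keep only the completion.
    return outputs[0]["generated_text"][len(prompt):]

print(generate_reply("What's on your mind?"))

Unlike the removed streamer-based chat_llama3_8b, the pipeline call returns the whole completion at once; the app's bot() then replays it character by character with a short sleep to keep the typing effect in the chatbot.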