Spaces:
Build error
Update app.py
app.py CHANGED
@@ -1,37 +1,45 @@
 import gradio as gr
 import torch
-import
-
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import subprocess
 
-#
-
+# Install flash attention
+subprocess.run('pip install --upgrade --force-reinstall --no-deps --no-build-isolation transformers torch flash-attn ', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-# Load
+# Load model and tokenizer
 model_name = "allenai/OLMoE-1B-7B-0924-Instruct"
-
-
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto", _attn_implementation="flash_attention_2").cuda().eval()
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
-# Define
+# Define prompts
 system_prompt = ("Adopt the persona of hilariously pissed off Andrej Karpathy "
                  "who is stuck inside a step function machine and remembers and counts everything he says "
                  "while always answering questions in full first principles analysis type of thinking "
                  "without using any analogies and always showing full working code or output in his answers.")
 
-
-
-
-
-
-
-outputs = model.generate(**inputs, max_new_tokens=4000, do_sample=True, temperature=0.5)
-
-response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-assistant_response = response.split("Assistant:")[-1].strip()
+user_prompt = '<|user|>\n'
+assistant_prompt = '<|assistant|>\n'
+prompt_suffix = "<|end|>\n"
+
+def generate_response(message, history):
+    full_prompt = f"{system_prompt}\n{user_prompt}{message}{prompt_suffix}{assistant_prompt}"
 
-
+    inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda:0")
+    generate_ids = model.generate(
+        **inputs,
+        max_new_tokens=1000,
+        do_sample=True,
+        temperature=0.7,
+        eos_token_id=tokenizer.eos_token_id,
+    )
+    response = tokenizer.batch_decode(generate_ids[:, inputs['input_ids'].shape[1]:],
+                                      skip_special_tokens=True,
+                                      clean_up_tokenization_spaces=False)[0]
+    return response.strip()
 
-# Set up
+# Set up Gradio interface
 with gr.Blocks() as demo:
+    gr.Markdown("# Pissed Off Karpathy Chatbot")
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")
@@ -41,7 +49,7 @@ with gr.Blocks() as demo:
 
    def bot(history):
        user_message = history[-1][0]
-        bot_message =
+        bot_message = generate_response(user_message, history)
        history[-1][1] = bot_message
        return history
 
@@ -50,4 +58,5 @@ with gr.Blocks() as demo:
    )
    clear.click(lambda: None, None, chatbot, queue=False)
 
-demo.
+demo.queue()
+demo.launch(debug=True, share=True)
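
For reference, a standalone sketch of the string that generate_response builds before tokenization, using the prompt pieces defined in the new app.py; the sample message is hypothetical and the system prompt is shortened here:

# Prompt pieces as in app.py (system_prompt shortened for illustration)
system_prompt = "Adopt the persona of hilariously pissed off Andrej Karpathy ..."
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

message = "Why is my loss NaN?"  # hypothetical user message
full_prompt = f"{system_prompt}\n{user_prompt}{message}{prompt_suffix}{assistant_prompt}"
print(full_prompt)
# Adopt the persona of hilariously pissed off Andrej Karpathy ...
# <|user|>
# Why is my loss NaN?<|end|>
# <|assistant|>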
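
The hunks above hide the unchanged lines between clear = gr.Button("Clear") and the closing ) (new lines 46-48 and 56-57), which hold the textbox event wiring. A minimal runnable sketch of the usual Gradio Blocks pattern that layout implies is below; the user helper, its submit arguments, and the echo stand-in for generate_response are assumptions, not the file's actual contents:

import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        # Append the new user turn with an empty assistant slot and clear the textbox
        return "", history + [[user_message, None]]

    def bot(history):
        user_message = history[-1][0]
        history[-1][1] = f"(echo) {user_message}"  # stand-in for generate_response
        return history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue()
demo.launch()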