Starling

Paused

App Files Community

Tonic committed on Nov 28, 2023

Commit

03c59e6

1 Parent(s): 90ffb86

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -14

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import optimum
 import transformers
 from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM
-# from optimum.bettertransformer import BetterTransformer
 import torch
 import gradio as gr
 import json
@@ -24,8 +24,6 @@ examples = [
 ]
 model_name = "berkeley-nest/Starling-LM-7B-alpha"
-# base_model = "meta-llama/Llama-2-7b-chat-hf"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 temperature=0.4
@@ -40,14 +38,14 @@ model = transformers.AutoModelForCausalLM.from_pretrained(model_name,
     torch_dtype=torch.bfloat16,
     load_in_4bit=True
 )
-# model = BetterTransformer.transform(model)
 model.eval()
 class StarlingBot:
-        def __init__(self, system_prompt="The following dialogue is a conversation"):
-            self.system_prompt = system_prompt
-        def predict(self, user_message, assistant_message, system_prompt, do_sample, temperature=0.4, max_new_tokens=700, top_p=0.99, repetition_penalty=1.9):
             conversation = f" <s> [INST] {self.system_prompt} [INST]  {assistant_message if assistant_message else ''} </s> [/INST]  {user_message}  </s> "
             input_ids = tokenizer.encode(conversation, return_tensors="pt", add_special_tokens=False)
             input_ids = input_ids.to(device)
@@ -56,7 +54,7 @@ class StarlingBot:
                 use_cache=False,
                 early_stopping=False,
                 bos_token_id=model.config.bos_token_id,
-                    eos_token_id=model.config.eos_token_id,
                 pad_token_id=model.config.eos_token_id,
                 temperature=temperature,
                 do_sample=True,
@@ -65,13 +63,12 @@ class StarlingBot:
                 repetition_penalty=repetition_penalty
             )
             response_text = tokenizer.decode(response[0], skip_special_tokens=True)
-            response_text = response.strip()
 #           response_text = response.split("<|assistant|>\n")[-1]
             return response_text
-        finally:
-            del input_ids, attention_mask, output_ids
-            gc.collect()
-            torch.cuda.empty_cache()
 starling_bot = StarlingBot()
@@ -79,7 +76,7 @@ iface = gr.Interface(
     fn=starling_bot.predict,
     title=title,
     description=description,
-#   examples=examples,
     inputs=[
         gr.Textbox(label="🌟🤩User Message", type="text", lines=5),
         gr.Textbox(label="💫🌠Starling Assistant Message or Instructions ", lines=2),

 import optimum
 import transformers
 from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM
 import torch
 import gradio as gr
 import json
 ]
 model_name = "berkeley-nest/Starling-LM-7B-alpha"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 temperature=0.4
     torch_dtype=torch.bfloat16,
     load_in_4bit=True
 )
 model.eval()
 class StarlingBot:
+    def __init__(self, system_prompt="The following dialogue is a conversation"):
+        self.system_prompt = system_prompt
+    def predict(self, user_message, assistant_message, system_prompt, do_sample, temperature=0.4, max_new_tokens=700, top_p=0.99, repetition_penalty=1.9):
+        try:
             conversation = f" <s> [INST] {self.system_prompt} [INST]  {assistant_message if assistant_message else ''} </s> [/INST]  {user_message}  </s> "
             input_ids = tokenizer.encode(conversation, return_tensors="pt", add_special_tokens=False)
             input_ids = input_ids.to(device)
                 use_cache=False,
                 early_stopping=False,
                 bos_token_id=model.config.bos_token_id,
+                eos_token_id=model.config.eos_token_id,
                 pad_token_id=model.config.eos_token_id,
                 temperature=temperature,
                 do_sample=True,
                 repetition_penalty=repetition_penalty
             )
             response_text = tokenizer.decode(response[0], skip_special_tokens=True)
 #           response_text = response.split("<|assistant|>\n")[-1]
             return response_text
+            finally:
+                del input_ids, attention_mask, output_ids
+                gc.collect()
+                torch.cuda.empty_cache()
 starling_bot = StarlingBot()
     fn=starling_bot.predict,
     title=title,
     description=description,
+    examples=examples,
     inputs=[
         gr.Textbox(label="🌟🤩User Message", type="text", lines=5),
         gr.Textbox(label="💫🌠Starling Assistant Message or Instructions ", lines=2),