halme committed
Commit bcb8b37 · 1 Parent(s): f7ea956

test inference on cpu

Files changed (1): app.py +11 -2
app.py CHANGED
@@ -1,6 +1,9 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
 from unsloth import FastLanguageModel
+from peft import AutoPeftModelForCausalLM
+from transformers import AutoTokenizer
+
 
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
@@ -20,7 +23,7 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
 
     messages.append({"role": "user", "content": message})
 
-    response = ""
+    #response = ""
 
     """ for message in client.chat_completion(messages, max_tokens=max_tokens, stream=True, temperature=temperature, top_p=top_p):
         token = message.choices[0].delta.content
@@ -28,12 +31,18 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
             response += token
             yield response """
 
-    model, tokenizer = FastLanguageModel.from_pretrained(
+    """ model, tokenizer = FastLanguageModel.from_pretrained(
         model_name = "halme/id2223_lora_model", # YOUR MODEL YOU USED FOR TRAINING
         max_seq_length = max_tokens,
         dtype = None,
         load_in_4bit = True,
-    )
+    ) """
+
+    model = AutoPeftModelForCausalLM.from_pretrained(
+        "halme/id2223_lora_model", # YOUR MODEL YOU USED FOR TRAINING
+        load_in_4bit = True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained("halme/id2223_lora_model")
 
     FastLanguageModel.for_inference(model) # Enable native 2x faster inference
 
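For context, below is a minimal sketch of the loading path this commit switches to, not the app's exact code. Assumptions: a CPU-only machine, so the load_in_4bit flag from the diff is dropped (bitsandbytes 4-bit quantization generally expects a GPU); the adapter repo halme/id2223_lora_model is accessible; and its tokenizer ships a chat template. The prompt content is a placeholder.

# Minimal sketch: load the LoRA adapter with PEFT and run one CPU generation.
# Assumes a public adapter repo and a tokenizer with a chat template;
# load_in_4bit is omitted for CPU-only inference.
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

ADAPTER = "halme/id2223_lora_model"  # adapter repo named in the diff

# AutoPeftModelForCausalLM reads adapter_config.json, downloads the base
# model it points at, and attaches the LoRA weights in one call.
model = AutoPeftModelForCausalLM.from_pretrained(ADAPTER, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(ADAPTER)
model.eval()

messages = [{"role": "user", "content": "Hello!"}]  # placeholder prompt
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
with torch.no_grad():
    output = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))

Swapping unsloth's FastLanguageModel.from_pretrained for peft's AutoPeftModelForCausalLM fits the commit's "test inference on cpu" goal, since unsloth's fast inference path requires a GPU.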