halme committed
Commit bcb8b37 · 1 Parent(s): f7ea956

test inference on cpu

Files changed (1): app.py +11 -2
app.py CHANGED
@@ -1,6 +1,9 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
 from unsloth import FastLanguageModel
+from peft import AutoPeftModelForCausalLM
+from transformers import AutoTokenizer
+
 
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
@@ -20,7 +23,7 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
 
     messages.append({"role": "user", "content": message})
 
-    response = ""
+    #response = ""
 
     """ for message in client.chat_completion(messages, max_tokens=max_tokens, stream=True, temperature=temperature, top_p=top_p):
         token = message.choices[0].delta.content
@@ -28,12 +31,18 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
             response += token
             yield response """
 
-    model, tokenizer = FastLanguageModel.from_pretrained(
+    """ model, tokenizer = FastLanguageModel.from_pretrained(
         model_name = "halme/id2223_lora_model", # YOUR MODEL YOU USED FOR TRAINING
         max_seq_length = max_tokens,
         dtype = None,
         load_in_4bit = True,
-    )
+    ) """
+
+    model = AutoPeftModelForCausalLM.from_pretrained(
+        "halme/id2223_lora_model", # YOUR MODEL YOU USED FOR TRAINING
+        load_in_4bit = True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained("halme/id2223_lora_model")
 
     FastLanguageModel.for_inference(model) # Enable native 2x faster inference
 
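For context, below is a minimal sketch of the loading path this commit switches to, not the app's exact code. Assumptions: a CPU-only machine, so the load_in_4bit flag from the diff is dropped (bitsandbytes 4-bit quantization generally expects a GPU); the adapter repo halme/id2223_lora_model is accessible; and its tokenizer ships a chat template. The prompt content is a placeholder.

# Minimal sketch: load the LoRA adapter with PEFT and run one CPU generation.
# Assumes a public adapter repo and a tokenizer with a chat template;
# load_in_4bit is omitted for CPU-only inference.
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

ADAPTER = "halme/id2223_lora_model"  # adapter repo named in the diff

# AutoPeftModelForCausalLM reads adapter_config.json, downloads the base
# model it points at, and attaches the LoRA weights in one call.
model = AutoPeftModelForCausalLM.from_pretrained(ADAPTER, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(ADAPTER)
model.eval()

messages = [{"role": "user", "content": "Hello!"}]  # placeholder prompt
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
with torch.no_grad():
    output = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))

Swapping unsloth's FastLanguageModel.from_pretrained for peft's AutoPeftModelForCausalLM fits the commit's "test inference on cpu" goal, since unsloth's fast inference path requires a GPU.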