syberWolf committed
Commit c1cb360 · 1 Parent(s): 53b72cd

push updates to handler

Phi-3-medium-128k-instruct-IQ2_XS.gguf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8c769c4137173dd434c070e116e4b0599af2b12752ba4c7188a1bf8bf5372a55
- size 4127405088

handler.py CHANGED
@@ -1,33 +1,30 @@
- from typing import Dict, List, Any
- from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch
+ from typing import Dict, List, Any
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+

  class EndpointHandler:
      def __init__(self, path=""):
-         # load model and processor from path
-         self.tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-4k-instruct", trust_remote_code=True)
-         self.model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-medium-4k-instruct", trust_remote_code=True)
+         # load the model and tokenizer
+         tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
+         model = AutoModelForCausalLM.from_pretrained(
+             "microsoft/Phi-3-mini-128k-instruct",
+             device_map="auto",
+             torch_dtype=torch.bfloat16,
+             trust_remote_code=True
+         )
+         # create inference pipeline
+         self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

-     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
-         """
-         Args:
-             data (:obj:):
-                 includes the deserialized image file as PIL.Image
-         """
-         # process input
+     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
          inputs = data.pop("inputs", data)
          parameters = data.pop("parameters", None)

-         # preprocess
-         input_ids = self.tokenizer(inputs, return_tensors="pt").input_ids
-
          # pass inputs with all kwargs in data
          if parameters is not None:
-             outputs = self.model.generate(input_ids, **parameters)
+             prediction = self.pipeline(inputs, **parameters)
          else:
-             outputs = self.model.generate(input_ids)
+             prediction = self.pipeline(inputs)

          # postprocess the prediction
-         prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-         return [{"generated_text": prediction}]
+         return prediction

requirements.txt DELETED
@@ -1,2 +0,0 @@
- transformers>=4.40
- flash-attn
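
For reference, a minimal local sketch of how the updated handler could be exercised. This is not part of the commit: the file name test_handler.py, the prompt, and the generation parameters are illustrative, and it assumes handler.py is importable and that torch, transformers, and accelerate (needed for device_map="auto") are available in the environment.

# test_handler.py: hypothetical smoke test for the updated EndpointHandler
from handler import EndpointHandler

handler = EndpointHandler(path="")

# Inference Endpoints-style payload: an "inputs" prompt plus optional
# generation "parameters" that are forwarded to the text-generation pipeline
payload = {
    "inputs": "Summarize what an inference endpoint handler does.",
    "parameters": {"max_new_tokens": 64, "do_sample": False},
}

prediction = handler(payload)

# a text-generation pipeline returns a list of dicts containing "generated_text"
print(prediction[0]["generated_text"])

Since requirements.txt was removed in this commit, the runtime image is presumably expected to provide these dependencies itself.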