syberWolf committed on
Commit
f96aa72
·
1 Parent(s): c676380

update handler and add flash attention

Browse files
Files changed (2) hide show
  1. handler.py +5 -2
  2. requirements.txt +1 -0
handler.py CHANGED
@@ -9,8 +9,6 @@ class EndpointHandler:
9
  tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
10
  model = AutoModelForCausalLM.from_pretrained(
11
  "microsoft/Phi-3-mini-128k-instruct",
12
- torch_dtype=torch.bfloat16,
13
- device_map="cuda",
14
  trust_remote_code=True
15
  )
16
  # create inference pipeline
@@ -18,6 +16,11 @@ class EndpointHandler:
18
 
19
  def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
20
  inputs = data.pop("inputs", data)
 
 
 
 
 
21
  parameters = data.pop("parameters", None)
22
 
23
  # pass inputs with all kwargs in data
 
9
  tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
10
  model = AutoModelForCausalLM.from_pretrained(
11
  "microsoft/Phi-3-mini-128k-instruct",
 
 
12
  trust_remote_code=True
13
  )
14
  # create inference pipeline
 
16
 
17
  def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
18
  inputs = data.pop("inputs", data)
19
+
20
+ for key in ['stop_sequences', 'watermark', 'stop']:
21
+ if key in inputs:
22
+ del inputs[key]
23
+
24
  parameters = data.pop("parameters", None)
25
 
26
  # pass inputs with all kwargs in data
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ flash-attn==latest