syberWolf committed
Commit 1aeb34c · 1 Parent(s): f345890

load new phi

Files changed (2)
  1. handler.py +23 -113
  2. requirements.txt +1 -3
handler.py CHANGED
@@ -1,123 +1,33 @@
-from llama_cpp import Llama
-from typing import Dict, List, Any, Union
-import os
-

 class EndpointHandler:
-    _instance = None  # Singleton instance
-    _model_loaded = False  # Flag to check if the model is loaded
-
-    def __new__(cls, *args, **kwargs):  # Allow arguments to pass through
-        if not cls._instance:
-            cls._instance = super(EndpointHandler, cls).__new__(cls)
-        return cls._instance
-
-    def __init__(self, model_path=""):
-        if not self._model_loaded:
-            # Construct the model path assuming the model is in the same directory as the handler file
-            script_dir = os.path.dirname(os.path.abspath(__file__))
-            model_filename = "Phi-3-medium-128k-instruct-IQ2_XS.gguf"
-            self.model_path = os.path.join(script_dir, model_filename)
-
-            # Check if the model file exists
-            if not os.path.exists(self.model_path):
-                raise ValueError(f"Model path does not exist: {self.model_path}")
-
-            # Load the GGUF model using llama_cpp
-            self.llm = Llama(
-                model_path=self.model_path,
-                n_ctx=5000,  # Set context length to 5000 tokens
-                # n_threads=12,  # Adjust the number of CPU threads as per your machine
-                n_gpu_layers=-1  # Adjust based on GPU availability
-            )

-            # Define generation kwargs for the model
-            self.generation_kwargs = {
-                "max_tokens": 400,  # Respond with up to 400 tokens
-                "stop": ["<|end|>", "<|user|>", "<|assistant|>"],
-                "top_k": 1  # Greedy decoding
-            }
-
-            self._model_loaded = True
-
-    @classmethod
-    def get_instance(cls, model_path=""):
-        """Provides access to the singleton instance."""
-        if not cls._instance:
-            cls._instance = cls(model_path)  # Create instance if it doesn't exist
-        return cls._instance
-
-    def __call__(self, data: Union[Dict[str, Any], str]) -> List[Dict[str, Any]]:
         """
-        Data args:
-            inputs (:obj:`dict`): The input prompts for the LLM including system instructions and user messages.
-            str: A string input which will create a chat completion.
-
-        Return:
-            A :obj:`list` | `dict`: will be serialized and returned.
         """
-        if isinstance(data, dict):
-            # Extract inputs
-            inputs = data.get("inputs", {})
-            system_instructions = inputs.get("system", "")
-            user_message = inputs.get("message", "")
-
-            if not user_message:
-                raise ValueError("No user message provided for the model.")

-            # Combine system instructions and user message
-            final_input = f"{system_instructions}\n{user_message}"
-
-            # Run inference with llama_cpp
-            response = self.llm.create_chat_completion(
-                messages=[
-                    {"role": "system", "content": system_instructions},
-                    {"role": "user", "content": user_message}
-                ],
-                **self.generation_kwargs
-            )
-
-        elif isinstance(data, str):
-            # Create a chat completion from the input string
-            response = self.llm.create_chat_completion(
-                messages=[
-                    {"role": "user", "content": data}
-                ],
-                **self.generation_kwargs
-            )

         else:
-            raise ValueError("Invalid input type. Expected dict or str, got {}".format(type(data)))
-
-        # Access generated text based on the response structure
-        try:
-            generated_text = response["choices"][0]["message"].get("content", "")
-        except (KeyError, IndexError):
-            raise ValueError("Unexpected response structure: missing 'content' in 'choices[0]['message']'")
-
-        # Return the generated text
-        return [{"generated_text": generated_text}]
-
-
-def main():
-    handler = EndpointHandler()  # assume Handler is the class that contains the __call__ method
-
-    # Test 1: Dictionary input
-    data_dict = {"inputs": {"system": "System instructions", "message": "Hello, how are you?"}}
-    result_dict = handler(data_dict)
-    print("Dictionary input result:", result_dict)
-
-    # Test 2: String input
-    data_str = "Hello, how are you?"
-    result_str = handler(data_str)
-    print("String input result:", result_str)

-    # Test 3: Invalid input type
-    data_invalid = 123
-    try:
-        handler(data_invalid)
-    except ValueError as e:
-        print("Invalid input type error:", e)

-if __name__ == "__main__":
-    main()
 
+from typing import Dict, List, Any
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch

 class EndpointHandler:
+    def __init__(self, path=""):
+        # load the tokenizer and model from the Hugging Face Hub
+        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-4k-instruct", trust_remote_code=True)
+        self.model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-medium-4k-instruct", trust_remote_code=True)

+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
         """
+        Args:
+            data (:obj:`dict`):
+                includes the prompt text under "inputs" and optional generation "parameters"
         """
+        # process input
+        inputs = data.pop("inputs", data)
+        parameters = data.pop("parameters", None)

+        # preprocess
+        input_ids = self.tokenizer(inputs, return_tensors="pt").input_ids

+        # generate, passing any user-supplied parameters as kwargs
+        if parameters is not None:
+            outputs = self.model.generate(input_ids, **parameters)
         else:
+            outputs = self.model.generate(input_ids)

+        # postprocess the prediction
+        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

+        return [{"generated_text": prediction}]
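For a quick sanity check of the new handler outside the endpoint, a minimal local sketch might look like the following (not part of this commit; the chat-style prompt markup and the generation parameters are illustrative assumptions):

# hypothetical local smoke test, not part of the commit
from handler import EndpointHandler

handler = EndpointHandler()

# the endpoint deserializes the request JSON into a dict like this
payload = {
    "inputs": "<|user|>\nHello, how are you?<|end|>\n<|assistant|>\n",  # assumed Phi-3 chat markup
    "parameters": {"max_new_tokens": 64, "do_sample": False},
}
print(handler(payload))  # -> [{"generated_text": "..."}]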
 
requirements.txt CHANGED
@@ -1,3 +1 @@
-llama-cpp-python -C cmake.args="-DLLAMA_BLAS=ON;-DLLAMA_BLAS_VENDOR=OpenBLAS"
-torch
-transformers
+transformers>=4.4.0
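Once deployed, the endpoint would be called with a JSON body using the same keys the handler pops; a hedged sketch, with placeholder URL and token:

import requests

API_URL = "https://<endpoint-name>.endpoints.huggingface.cloud"  # placeholder URL
HEADERS = {"Authorization": "Bearer <hf_token>", "Content-Type": "application/json"}  # placeholder token

body = {
    "inputs": "Write a haiku about mountains.",
    "parameters": {"max_new_tokens": 50},
}
resp = requests.post(API_URL, headers=HEADERS, json=body)
print(resp.json())  # expected shape: [{"generated_text": "..."}]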