handraise-dev
/

gguf-inference

Text Generation

Model card Files Files and versions Community

syberWolf committed on Jul 4, 2024

Commit

1e592e3

·

1 Parent(s): b47e2d8

try to actually use the GPU

Files changed (1) hide show

handler.py +20 -10

handler.py CHANGED Viewed

@@ -1,27 +1,37 @@
 from typing import Dict, List, Any
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 # Pre-commit version of the handler (left-hand side of the diff).
 # Lines whose first character is "-" are the ones this commit removed.
 class EndpointHandler:
     # Loads the tokenizer and model once and stores a text-generation pipeline
     # on self.pipeline. NOTE: the `path` argument is not used below; the model
     # id is hard-coded.
     def __init__(self, path=""):
         # load the model
         tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
         model = AutoModelForCausalLM.from_pretrained(
             "Qwen/Qwen2-1.5B-Instruct",
-            torch_dtype="auto",
             device_map="auto"
-        )
         # create inference pipeline
-        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
     # Handles one request: reads "inputs" and "parameters" from `data` and
     # returns whatever self.pipeline returns for them.
     def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
         # pop() mutates the caller's dict; when "inputs" is missing, the dict
         # itself is used as the pipeline input.
         inputs = data.pop("inputs", data)
-        parameters = data.pop("parameters", None)
         # pass inputs with all kwargs in data
-        if parameters is not None:
-            prediction = self.pipeline(inputs, **parameters)
-        else:
-            prediction = self.pipeline(inputs)
-        # postprocess the prediction
         return prediction

 from typing import Dict, List, Any
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import torch
 class EndpointHandler:
     """Custom inference handler wrapping a ``text-generation`` pipeline.

     The tokenizer and model are loaded once in ``__init__``; every call
     forwards the request's ``inputs`` and ``parameters`` to the pipeline.
     """

     # Hub id of the checkpoint served by this handler.
     MODEL_ID = "Qwen/Qwen2-1.5B-Instruct"

     def __init__(self, path=""):
         """Load the tokenizer and model and build the generation pipeline.

         Args:
             path: Accepted for interface compatibility. It is not used;
                 the checkpoint loaded is always ``MODEL_ID``.
         """
         # Half precision on GPU, full precision when only a CPU is available.
         dtype = torch.float16 if torch.cuda.is_available() else torch.float32
         tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
         # device_map="auto" already decides where the weights are placed, so
         # the model is NOT moved again with .to() and no `device` argument is
         # given to pipeline(): mixing those placement mechanisms conflicts.
         model = AutoModelForCausalLM.from_pretrained(
             self.MODEL_ID,
             torch_dtype=dtype,
             device_map="auto",
         )
         self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

     def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
         """Run the pipeline on one request payload.

         Args:
             data: Mapping that may contain ``"inputs"`` (a prompt or a list
                 of prompts) and ``"parameters"`` (keyword arguments for the
                 pipeline call). When ``"inputs"`` is absent, the remaining
                 payload itself is passed as the input. The mapping is not
                 modified.

         Returns:
             The value returned by ``self.pipeline``.
         """
         # Work on a shallow copy so the caller's payload is left untouched.
         request = dict(data)
         # A missing or explicit-None "parameters" both mean "no extra kwargs".
         parameters = request.pop("parameters", None) or {}
         inputs = request.pop("inputs", request)
         # A single prompt is sent as a one-element batch.
         if isinstance(inputs, str):
             inputs = [inputs]
         # The pipeline tokenizes and places tensors itself; prompts are passed
         # through as-is rather than being converted to tensors here.
         return self.pipeline(inputs, **parameters)