File size: 1,398 Bytes

c1cb360
 
1e592e3
fe371ad
 
1aeb34c
1e592e3
 
c1cb360
b47e2d8
c1cb360
b47e2d8
1e592e3
b47e2d8
e8628b3
1e592e3
e8628b3
 
b3aebd1
c1cb360
1aeb34c
1e592e3
 
 
 
 
 
 
 
 
 
 
b3aebd1
1aeb34c
1e592e3
 
b47e2d8

from typing import Dict, List, Any
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

class EndpointHandler:
    """Text-generation request handler backed by a Qwen2 instruct model.

    The instance builds a ``transformers`` text-generation pipeline once in
    ``__init__`` and is then called with one request payload at a time.
    """

    # Hub id of the checkpoint loaded for both the tokenizer and the model.
    MODEL_ID = "Qwen/Qwen2-1.5B-Instruct"

    def __init__(self, path=""):
        """Load the tokenizer and model and build the generation pipeline.

        Args:
            path: Accepted for interface compatibility with the caller that
                instantiates the handler; it is not used, the checkpoint is
                always loaded from ``MODEL_ID``.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"

        tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
        model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_ID,
            # Half precision only when a GPU is present; full precision on CPU.
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto",
        )

        # No `device` argument is passed to the pipeline: the model has
        # already been loaded with device_map="auto" above.
        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

    def __call__(self, data: Dict[str, Any]) -> Any:
        """Run text generation for one request payload.

        Args:
            data: Request dictionary. ``data["inputs"]`` holds the prompt(s);
                when the key is absent the whole dictionary is used as the
                input. ``data["parameters"]`` optionally holds keyword
                arguments forwarded to the pipeline call. Both keys are
                removed from ``data`` (the dictionary is mutated).

        Returns:
            Whatever the pipeline returns for the given inputs, unchanged.
        """
        inputs = data.pop("inputs", data)
        # An explicit `"parameters": null` in the request must behave like a
        # missing key instead of failing on `**None`.
        parameters = data.pop("parameters", None) or {}

        # A single prompt is wrapped so the pipeline always receives a batch.
        if isinstance(inputs, str):
            inputs = [inputs]

        # Inputs are forwarded as-is. They are deliberately NOT converted to
        # torch tensors: a text-generation pipeline tokenizes its own inputs,
        # and the previous best-effort conversion could only succeed for
        # numeric payloads (which the pipeline cannot consume) while hiding
        # every other error behind a bare `except`.
        return self.pipeline(inputs, **parameters)