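"""SageMaker inference handlers for a Phi-based binary sequence classifier.

Implements the standard handler contract (model_fn, input_fn, predict_fn,
output_fn): the base causal LM named by HF_MODEL_ID is wrapped with a linear
classification head and served, returning softmax class probabilities.
"""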
import os
import json
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import logging

logger = logging.getLogger(__name__)

# Test CUDA device availability and names with:
# python -c "import torch; print('\n'.join([f'{i}: {torch.cuda.get_device_name(i)}' for i in range(torch.cuda.device_count())]))"
# Can specify GPU device with:
# CUDA_VISIBLE_DEVICES="1" python script.py

class PhiForSequenceClassification(nn.Module):
    def __init__(self, base_model, num_labels=2):
        super().__init__()
        self.phi = base_model
        # Create classifier with same dtype as base model
        dtype = next(base_model.parameters()).dtype
        self.classifier = nn.Linear(self.phi.config.hidden_size, num_labels, dtype=dtype)
        
    def forward(self, **inputs):
        outputs = self.phi(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1]
        # Pool the last non-padding token; with max_length padding the final position is a pad token
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            last_idx = attention_mask.sum(dim=1) - 1
            pooled = hidden_states[torch.arange(hidden_states.size(0), device=last_idx.device), last_idx]
        else:
            pooled = hidden_states[:, -1, :]
        logits = self.classifier(pooled)
        return type('Outputs', (), {'logits': logits})()

def model_fn(model_dir, context=None):
    """Load the model for inference"""
    try:
        # Prefer HF_MODEL_ID if set, otherwise load from the SageMaker model directory
        model_id = os.getenv("HF_MODEL_ID", model_dir)
        
        # Set specific GPU device if available
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        if device.type == 'cuda':
            torch.cuda.empty_cache()
        logger.info(f"Using device: {device}")
        
        # Load tokenizer; ensure a pad token exists since predict_fn pads to max_length
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        
        # Load model config
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
        
        # Load base model using AutoModelForCausalLM
        base_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            config=config,
            torch_dtype=torch.bfloat16 if device.type == 'cuda' else torch.float32,
            trust_remote_code=True
        )
        
        # Create classification model
        model = PhiForSequenceClassification(base_model, num_labels=2)
        
        # Move model to device
        model = model.to(device)
        
        # Enable cuDNN autotuning for faster inference with fixed-size inputs
        if device.type == 'cuda':
            torch.backends.cudnn.benchmark = True
            
        # Ensure model is in eval mode
        model.eval()
            
        logger.info(f"Model loaded successfully on {device}")
        
        return {
            "model": model,
            "tokenizer": tokenizer,
            "device": device
        }
    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise

def predict_fn(data, model_dict):
    """Make a prediction"""
    try:
        logger.info("Starting prediction")
        model = model_dict["model"]
        tokenizer = model_dict["tokenizer"]
        device = model_dict["device"]
        
        # Parse input
        if isinstance(data, str):
            input_text = data
        elif isinstance(data, dict):
            input_text = data.get("inputs", data.get("text", str(data)))
        else:
            input_text = str(data)
            
        # Tokenize input
        inputs = tokenizer(
            input_text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        # Move inputs to device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Generate prediction
        with torch.no_grad():
            if device.type == 'cuda':
                torch.cuda.empty_cache()
            
            outputs = model(**inputs)
            predictions = torch.softmax(outputs.logits, dim=1)
        
        # Cast to float32 before numpy conversion (numpy has no bfloat16 support), then move to CPU
        predictions = predictions.float().cpu().numpy()
        
        return predictions
        
    except Exception as e:
        logger.error(f"Error during prediction: {str(e)}")
        raise

def input_fn(request_body, request_content_type):
    """Parse input request"""
    if request_content_type == "application/json":
        try:
            data = json.loads(request_body)
        except (json.JSONDecodeError, TypeError):
            data = request_body
        return data
    else:
        return request_body

def output_fn(prediction, response_content_type):
    """Format the output"""
    if response_content_type == "application/json":
        return json.dumps(prediction.tolist())
    else:
        raise ValueError(f"Unsupported content type: {response_content_type}")
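
# ---------------------------------------------------------------------------
# Optional local smoke test. This is a minimal sketch of how the SageMaker
# handler chain (model_fn -> input_fn -> predict_fn -> output_fn) fits
# together; it assumes HF_MODEL_ID points at an accessible Phi checkpoint
# and is not part of the SageMaker serving contract itself.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    model_dict = model_fn(model_dir=".")
    request_body = json.dumps({"inputs": "This is a quick local test sentence."})
    data = input_fn(request_body, "application/json")
    prediction = predict_fn(data, model_dict)
    print(output_fn(prediction, "application/json"))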