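"""SageMaker inference handlers for a Phi-based binary sequence classifier.

Implements the standard handler contract (model_fn, input_fn, predict_fn,
output_fn): the base causal LM named by HF_MODEL_ID is wrapped with a linear
classification head and served, returning softmax class probabilities.
"""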
import os
import json
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import logging

logger = logging.getLogger(__name__)

# Test CUDA device availability and names with:
# python -c "import torch; print('\n'.join([f'{i}: {torch.cuda.get_device_name(i)}' for i in range(torch.cuda.device_count())]))"
# Can specify GPU device with:
# CUDA_VISIBLE_DEVICES="1" python script.py

class PhiForSequenceClassification(nn.Module):
    def __init__(self, base_model, num_labels=2):
        super().__init__()
        self.phi = base_model
        # Create classifier with same dtype as base model
        dtype = next(base_model.parameters()).dtype
        self.classifier = nn.Linear(self.phi.config.hidden_size, num_labels, dtype=dtype)
        
    def forward(self, **inputs):
        outputs = self.phi(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1]
        # Pool the last non-padding token; with max_length padding the final position is a pad token
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            last_idx = attention_mask.sum(dim=1) - 1
            pooled = hidden_states[torch.arange(hidden_states.size(0), device=last_idx.device), last_idx]
        else:
            pooled = hidden_states[:, -1, :]
        logits = self.classifier(pooled)
        return type('Outputs', (), {'logits': logits})()

def model_fn(model_dir, context=None):
    """Load the model for inference"""
    try:
        # Prefer HF_MODEL_ID if set, otherwise load from the SageMaker model directory
        model_id = os.getenv("HF_MODEL_ID", model_dir)
        
        # Set specific GPU device if available
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        if device.type == 'cuda':
            torch.cuda.empty_cache()
        logger.info(f"Using device: {device}")
        
        # Load tokenizer; ensure a pad token exists since predict_fn pads to max_length
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        
        # Load model config
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
        
        # Load base model using AutoModelForCausalLM
        base_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            config=config,
            torch_dtype=torch.bfloat16 if device.type == 'cuda' else torch.float32,
            trust_remote_code=True
        )
        
        # Create classification model
        model = PhiForSequenceClassification(base_model, num_labels=2)
        
        # Move model to device
        model = model.to(device)
        
        # Enable cuDNN autotuning for faster inference with fixed-size inputs
        if device.type == 'cuda':
            torch.backends.cudnn.benchmark = True
            
        # Ensure model is in eval mode
        model.eval()
            
        logger.info(f"Model loaded successfully on {device}")
        
        return {
            "model": model,
            "tokenizer": tokenizer,
            "device": device
        }
    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise

def predict_fn(data, model_dict):
    """Make a prediction"""
    try:
        logger.info("Starting prediction")
        model = model_dict["model"]
        tokenizer = model_dict["tokenizer"]
        device = model_dict["device"]
        
        # Parse input
        if isinstance(data, str):
            input_text = data
        elif isinstance(data, dict):
            input_text = data.get("inputs", data.get("text", str(data)))
        else:
            input_text = str(data)
            
        # Tokenize input
        inputs = tokenizer(
            input_text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        # Move inputs to device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Generate prediction
        with torch.no_grad():
            if device.type == 'cuda':
                torch.cuda.empty_cache()
            
            outputs = model(**inputs)
            predictions = torch.softmax(outputs.logits, dim=1)
        
        # Cast to float32 before numpy conversion (numpy has no bfloat16 support), then move to CPU
        predictions = predictions.float().cpu().numpy()
        
        return predictions
        
    except Exception as e:
        logger.error(f"Error during prediction: {str(e)}")
        raise

def input_fn(request_body, request_content_type):
    """Parse input request"""
    if request_content_type == "application/json":
        try:
            data = json.loads(request_body)
        except (json.JSONDecodeError, TypeError):
            data = request_body
        return data
    else:
        return request_body

def output_fn(prediction, response_content_type):
    """Format the output"""
    if response_content_type == "application/json":
        return json.dumps(prediction.tolist())
    else:
        raise ValueError(f"Unsupported content type: {response_content_type}")
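
# ---------------------------------------------------------------------------
# Optional local smoke test. This is a minimal sketch of how the SageMaker
# handler chain (model_fn -> input_fn -> predict_fn -> output_fn) fits
# together; it assumes HF_MODEL_ID points at an accessible Phi checkpoint
# and is not part of the SageMaker serving contract itself.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    model_dict = model_fn(model_dir=".")
    request_body = json.dumps({"inputs": "This is a quick local test sentence."})
    data = input_fn(request_body, "application/json")
    prediction = predict_fn(data, model_dict)
    print(output_fn(prediction, "application/json"))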