import os
import json
import logging
from types import SimpleNamespace

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

logger = logging.getLogger(__name__)

# Test CUDA device availability and names with:
# python -c "import torch; print('\n'.join([f'{i}: {torch.cuda.get_device_name(i)}' for i in range(torch.cuda.device_count())]))"
# Can pin the script to a specific GPU with:
# CUDA_VISIBLE_DEVICES="1" python script.py


class PhiForSequenceClassification(nn.Module):
    def __init__(self, base_model, num_labels=2):
        super().__init__()
        self.phi = base_model
        # Create the classifier head with the same dtype as the base model
        dtype = next(base_model.parameters()).dtype
        self.classifier = nn.Linear(self.phi.config.hidden_size, num_labels, dtype=dtype)

    def forward(self, **inputs):
        outputs = self.phi(**inputs, output_hidden_states=True)
        last_hidden = outputs.hidden_states[-1]
        # With right padding the final position holds a pad token, so pool the hidden
        # state of the last *non-padded* token; fall back to position -1 without a mask.
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            last_idx = attention_mask.sum(dim=1) - 1
            batch_idx = torch.arange(last_hidden.size(0), device=last_hidden.device)
            pooled = last_hidden[batch_idx, last_idx]
        else:
            pooled = last_hidden[:, -1, :]
        logits = self.classifier(pooled)
        return SimpleNamespace(logits=logits)


def model_fn(model_dir, context=None):
    """Load the model for inference."""
    try:
        model_id = os.getenv("HF_MODEL_ID")

        # Use the first GPU if available, otherwise fall back to CPU
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        if device.type == 'cuda':
            torch.cuda.empty_cache()
        logger.info(f"Using device: {device}")

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        if tokenizer.pad_token is None:
            # Some Phi tokenizers define no pad token; reuse EOS so padding works
            tokenizer.pad_token = tokenizer.eos_token

        # Load the config (resolves to Phi3Config for Phi-3 checkpoints)
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

        # Load the base model using AutoModelForCausalLM
        base_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            config=config,
            torch_dtype=torch.bfloat16 if device.type == 'cuda' else torch.float32,
            trust_remote_code=True
        )

        # Wrap the base model with the classification head and move it to the device
        model = PhiForSequenceClassification(base_model, num_labels=2)
        model = model.to(device)

        # Enable cuDNN autotuning for the fixed-shape inputs used below
        if device.type == 'cuda':
            torch.backends.cudnn.benchmark = True

        # Ensure the model is in eval mode
        model.eval()

        logger.info(f"Model loaded successfully on {device}")
        return {
            "model": model,
            "tokenizer": tokenizer,
            "device": device
        }
    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise


def predict_fn(data, model_dict):
    """Make a prediction."""
    try:
        logger.info("Starting prediction")
        model = model_dict["model"]
        tokenizer = model_dict["tokenizer"]
        device = model_dict["device"]

        # Parse input
        if isinstance(data, str):
            input_text = data
        elif isinstance(data, dict):
            input_text = data.get("inputs", data.get("text", str(data)))
        else:
            input_text = str(data)

        # Tokenize input
        inputs = tokenizer(
            input_text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Move inputs to device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run the classifier
        with torch.no_grad():
            if device.type == 'cuda':
                torch.cuda.empty_cache()
            outputs = model(**inputs)
            predictions = torch.softmax(outputs.logits, dim=1)

        # Cast to float32 before .numpy(), since numpy has no bfloat16 dtype,
        # then move to CPU
        predictions = predictions.float().cpu().numpy()
        return predictions
    except Exception as e:
        logger.error(f"Error during prediction: {str(e)}")
        raise


def input_fn(request_body, request_content_type):
    """Parse the input request."""
    if request_content_type == "application/json":
        try:
            return json.loads(request_body)
        except (json.JSONDecodeError, TypeError):
            return request_body
    return request_body


def output_fn(prediction, response_content_type):
    """Format the output."""
    if response_content_type == "application/json":
        return json.dumps(prediction.tolist())
    raise ValueError(f"Unsupported content type: {response_content_type}")