import os
import json
import logging
from types import SimpleNamespace

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

logger = logging.getLogger(__name__)

# Test CUDA device availability and names with:
# python -c "import torch; print('\n'.join([f'{i}: {torch.cuda.get_device_name(i)}' for i in range(torch.cuda.device_count())]))"
# Can pin the script to a specific GPU with:
# CUDA_VISIBLE_DEVICES="1" python script.py


class PhiForSequenceClassification(nn.Module):
    def __init__(self, base_model, num_labels=2):
        super().__init__()
        self.phi = base_model
        # Create the classifier head with the same dtype as the base model
        dtype = next(base_model.parameters()).dtype
        self.classifier = nn.Linear(self.phi.config.hidden_size, num_labels, dtype=dtype)

    def forward(self, **inputs):
        outputs = self.phi(**inputs, output_hidden_states=True)
        last_hidden = outputs.hidden_states[-1]
        # With right padding the final position holds a pad token, so pool the hidden
        # state of the last *non-padded* token; fall back to position -1 without a mask.
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            last_idx = attention_mask.sum(dim=1) - 1
            batch_idx = torch.arange(last_hidden.size(0), device=last_hidden.device)
            pooled = last_hidden[batch_idx, last_idx]
        else:
            pooled = last_hidden[:, -1, :]
        logits = self.classifier(pooled)
        return SimpleNamespace(logits=logits)


def model_fn(model_dir, context=None):
    """Load the model for inference."""
    try:
        model_id = os.getenv("HF_MODEL_ID")

        # Use the first GPU if available, otherwise fall back to CPU
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        if device.type == 'cuda':
            torch.cuda.empty_cache()
        logger.info(f"Using device: {device}")

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        if tokenizer.pad_token is None:
            # Some Phi tokenizers define no pad token; reuse EOS so padding works
            tokenizer.pad_token = tokenizer.eos_token

        # Load the config (resolves to Phi3Config for Phi-3 checkpoints)
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

        # Load the base model using AutoModelForCausalLM
        base_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            config=config,
            torch_dtype=torch.bfloat16 if device.type == 'cuda' else torch.float32,
            trust_remote_code=True
        )

        # Wrap the base model with the classification head and move it to the device
        model = PhiForSequenceClassification(base_model, num_labels=2)
        model = model.to(device)

        # Enable cuDNN autotuning for the fixed-shape inputs used below
        if device.type == 'cuda':
            torch.backends.cudnn.benchmark = True

        # Ensure the model is in eval mode
        model.eval()

        logger.info(f"Model loaded successfully on {device}")
        return {
            "model": model,
            "tokenizer": tokenizer,
            "device": device
        }
    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise


def predict_fn(data, model_dict):
    """Make a prediction."""
    try:
        logger.info("Starting prediction")
        model = model_dict["model"]
        tokenizer = model_dict["tokenizer"]
        device = model_dict["device"]

        # Parse input
        if isinstance(data, str):
            input_text = data
        elif isinstance(data, dict):
            input_text = data.get("inputs", data.get("text", str(data)))
        else:
            input_text = str(data)

        # Tokenize input
        inputs = tokenizer(
            input_text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Move inputs to device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run the classifier
        with torch.no_grad():
            if device.type == 'cuda':
                torch.cuda.empty_cache()
            outputs = model(**inputs)
            predictions = torch.softmax(outputs.logits, dim=1)

        # Cast to float32 before .numpy(), since numpy has no bfloat16 dtype,
        # then move to CPU
        predictions = predictions.float().cpu().numpy()
        return predictions
    except Exception as e:
        logger.error(f"Error during prediction: {str(e)}")
        raise


def input_fn(request_body, request_content_type):
    """Parse the input request."""
    if request_content_type == "application/json":
        try:
            return json.loads(request_body)
        except (json.JSONDecodeError, TypeError):
            return request_body
    return request_body


def output_fn(prediction, response_content_type):
    """Format the output."""
    if response_content_type == "application/json":
        return json.dumps(prediction.tolist())
    raise ValueError(f"Unsupported content type: {response_content_type}")