velocity-ai
/

phi-3.5-address-validation-pretrained

@@ -1,7 +1,7 @@
 import os
 import json
 import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 import logging
 logger = logging.getLogger(__name__)
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
 # Can specify GPU device with:
 # CUDA_VISIBLE_DEVICES="1" python script.py
-def model_fn(model_dir):
     """Load the model for inference"""
     try:
         model_id = os.getenv("HF_MODEL_ID")
@@ -19,22 +19,14 @@ def model_fn(model_dir):
         # Set specific GPU device if available
         device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
         if device.type == 'cuda':
-            torch.cuda.set_device(device)
             torch.cuda.empty_cache()
         logger.info(f"Using device: {device}")
-        # Load tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-        # Load config
-        config = AutoConfig.from_pretrained(model_id,
-                                          num_labels=2,
-                                          trust_remote_code=True)
-        # Load model with sequence classification head
         model = AutoModelForSequenceClassification.from_pretrained(
             model_id,
-            config=config,
             torch_dtype=torch.bfloat16 if device.type == 'cuda' else torch.float32,
             trust_remote_code=True
         )
@@ -68,27 +60,15 @@ def predict_fn(data, model_dict):
         tokenizer = model_dict["tokenizer"]
         device = model_dict["device"]
-        logger.info(f"Model is on device: {device}")
-        # Parse input and format it like training data
         if isinstance(data, str):
             input_text = data
         elif isinstance(data, dict):
-            # Extract address components
-            addr1 = data.get('order_address1', data.get('address_line_1', ''))
-            addr2 = data.get('order_address2', data.get('address_line_2', ''))
-            city = data.get('order_city', data.get('city', ''))
-            state = data.get('order_state', data.get('state', ''))
-            pincode = str(data.get('order_pincode', data.get('pincode', '')))
-            # Format exactly like training data
-            input_text = f"Address_line_1: {addr1} Address_line_2: {addr2} City: {city} State: {state} Pincode: {pincode}"
         else:
             input_text = str(data)
-        logger.debug(f"Parsed input text: {input_text}")
-        # Create tensors directly on target device
         inputs = tokenizer(
             input_text,
             add_special_tokens=True,
@@ -99,43 +79,23 @@ def predict_fn(data, model_dict):
         )
         # Move inputs to device
-        if device.type == 'cuda':
-            inputs = {k: v.cuda() for k, v in inputs.items()}
-        logger.debug(f"Inputs moved to device: {device}")
-        # Log tensor devices and dtypes
-        for k, v in inputs.items():
-            logger.debug(f"Input '{k}' - Device: {v.device}, Shape: {v.shape}, Dtype: {v.dtype}")
         # Generate prediction
-        logger.info("Generating prediction")
         with torch.no_grad():
             if device.type == 'cuda':
                 torch.cuda.empty_cache()
-            try:
-                # Run inference
-                outputs = model(**inputs)
-                # Convert to float32 before softmax to ensure compatibility
-                logits = outputs.logits.to(dtype=torch.float32)
-                predictions = torch.softmax(logits, dim=1)
-            except RuntimeError as e:
-                logger.error("Error during inference:")
-                logger.error(f"Model device: {next(model.parameters()).device}")
-                logger.error(f"Input devices: {[f'{k}: {v.device}' for k, v in inputs.items()]}")
-                raise
-        # Move predictions to CPU and ensure float32
-        predictions = predictions.cpu().float().numpy()
         return predictions
     except Exception as e:
         logger.error(f"Error during prediction: {str(e)}")
-        logger.error(f"Model device: {next(model.parameters()).device}")
-        logger.error(f"Input devices: {[f'{k}: {v.device}' for k, v in inputs.items()]}")
         raise
 def input_fn(request_body, request_content_type):

 import os
 import json
 import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import logging
 logger = logging.getLogger(__name__)
 # Can specify GPU device with:
 # CUDA_VISIBLE_DEVICES="1" python script.py
+def model_fn(model_dir, context=None):
     """Load the model for inference"""
     try:
         model_id = os.getenv("HF_MODEL_ID")
         # Set specific GPU device if available
         device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
         if device.type == 'cuda':
             torch.cuda.empty_cache()
         logger.info(f"Using device: {device}")
+        # Load the tokenizer with AutoTokenizer and the model with AutoModelForSequenceClassification
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
         model = AutoModelForSequenceClassification.from_pretrained(
             model_id,
+            num_labels=2,
             torch_dtype=torch.bfloat16 if device.type == 'cuda' else torch.float32,
             trust_remote_code=True
         )
         tokenizer = model_dict["tokenizer"]
         device = model_dict["device"]
+        # Parse input
         if isinstance(data, str):
             input_text = data
         elif isinstance(data, dict):
+            input_text = data.get("inputs", data.get("text", str(data)))
         else:
             input_text = str(data)
+        # Tokenize input
         inputs = tokenizer(
             input_text,
             add_special_tokens=True,
         )
         # Move inputs to device
+        inputs = {k: v.to(device) for k, v in inputs.items()}
         # Generate prediction
         with torch.no_grad():
             if device.type == 'cuda':
                 torch.cuda.empty_cache()
+            outputs = model(**inputs)
+            predictions = torch.softmax(outputs.logits, dim=1)
+        # Move predictions to CPU and convert to numpy
+        predictions = predictions.cpu().numpy()
         return predictions
     except Exception as e:
         logger.error(f"Error during prediction: {str(e)}")
         raise
 def input_fn(request_body, request_content_type):