Spaces:

Hashii1729
/

Vestiq

Sleeping

App Files Files Community

Hashii1729 committed on Jun 24

Commit

c96ca80

1 Parent(s): 384c4a0

Enhance HuggingFaceFashionAnalyzer: optimize model loading, suppress warnings, and improve CPU performance settings

Browse files

Files changed (1) hide show

fast.py +71 -11

fast.py CHANGED Viewed

@@ -15,14 +15,28 @@ import torch.nn.functional as F
 import torchvision.transforms as v2
 from huggingface_hub import PyTorchModelHubMixin
 import numpy as np
 app = FastAPI(title="HuggingFace Fashion Analyzer API", version="1.0.0")
 # Fashion Image Encoder class for yainage90 model
 class ImageEncoder(nn.Module, PyTorchModelHubMixin):
-    def __init__(self, config):
         super(ImageEncoder, self).__init__()
-        self.swin = SwinModel(config=config)
         self.embedding_layer = nn.Linear(config.hidden_size, 128)
     def forward(self, image_tensor):
@@ -39,11 +53,27 @@ class HuggingFaceFashionAnalyzer:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {self.device}")
         # Initialize yainage90 fashion object detection model
         try:
             self.detection_ckpt = 'yainage90/fashion-object-detection'
-            self.detection_processor = AutoImageProcessor.from_pretrained(self.detection_ckpt)
-            self.detection_model = AutoModelForObjectDetection.from_pretrained(self.detection_ckpt).to(self.device)
             print("Fashion object detection model loaded successfully")
         except Exception as e:
             print(f"Error loading fashion detection model: {e}")
@@ -54,10 +84,21 @@ class HuggingFaceFashionAnalyzer:
         try:
             self.encoder_ckpt = "yainage90/fashion-image-feature-extractor"
             self.encoder_config = SwinConfig.from_pretrained(self.encoder_ckpt)
-            self.encoder_image_processor = AutoImageProcessor.from_pretrained(self.encoder_ckpt)
             # Create the encoder with proper configuration - use from_pretrained directly
-            self.feature_encoder = ImageEncoder.from_pretrained(self.encoder_ckpt).to(self.device)
             # Setup image transforms for feature extraction
             self.transform = v2.Compose([
@@ -71,12 +112,21 @@ class HuggingFaceFashionAnalyzer:
             self.feature_encoder = None
             self.transform = None
-        # Initialize basic image captioning as fallback
         try:
             self.image_to_text = pipeline(
                 "image-to-text",
                 model="Salesforce/blip-image-captioning-base",
-                device=0 if torch.cuda.is_available() else -1
             )
             print("Basic image captioning model loaded successfully")
         except Exception as e:
@@ -88,6 +138,12 @@ class HuggingFaceFashionAnalyzer:
             0: 'bag', 1: 'bottom', 2: 'dress', 3: 'hat', 4: 'shoes', 5: 'outer', 6: 'top'
         }
     def process_image_from_bytes(self, image_bytes):
         """Process image bytes and return PIL Image"""
         image = Image.open(io.BytesIO(image_bytes))
@@ -412,9 +468,12 @@ class HuggingFaceFashionAnalyzer:
             return {"error": "Fashion detection model not available"}
         try:
-            with torch.no_grad():
                 inputs = self.detection_processor(images=[image], return_tensors="pt")
-                outputs = self.detection_model(**inputs.to(self.device))
                 target_sizes = torch.tensor([[image.size[1], image.size[0]]])
                 results = self.detection_processor.post_process_object_detection(
                     outputs, threshold=0.4, target_sizes=target_sizes
@@ -445,7 +504,8 @@ class HuggingFaceFashionAnalyzer:
             # Transform image for feature extraction
             image_tensor = self.transform(image)
-            with torch.no_grad():
                 embedding = self.feature_encoder(image_tensor.unsqueeze(0).to(self.device))
             return {

 import torchvision.transforms as v2
 from huggingface_hub import PyTorchModelHubMixin
 import numpy as np
+import warnings
+# Suppress specific warnings for cleaner output
+warnings.filterwarnings("ignore", message=".*use_fast.*")
+warnings.filterwarnings("ignore", message=".*copying from a non-meta parameter.*")
+warnings.filterwarnings("ignore", message=".*slow image processor.*")
+warnings.filterwarnings("ignore", message=".*slow processor.*")
 app = FastAPI(title="HuggingFace Fashion Analyzer API", version="1.0.0")
 # Fashion Image Encoder class for yainage90 model
 class ImageEncoder(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, config=None):
         super(ImageEncoder, self).__init__()
+        if config is None:
+            # Create a default config if none provided
+            config = SwinConfig()
+        elif isinstance(config, dict):
+            # Convert dict to SwinConfig if needed
+            config = SwinConfig(**config)
+        self.swin = SwinModel(config)
         self.embedding_layer = nn.Linear(config.hidden_size, 128)
     def forward(self, image_tensor):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {self.device}")
+        # Set CPU optimization settings
+        if self.device == "cpu":
+            torch.set_num_threads(2)  # Limit CPU threads to reduce load
+            print("CPU optimization: Limited threads to 2 for better performance")
         # Initialize yainage90 fashion object detection model
         try:
             self.detection_ckpt = 'yainage90/fashion-object-detection'
+            # Use fast processor to avoid warnings
+            self.detection_processor = AutoImageProcessor.from_pretrained(
+                self.detection_ckpt,
+                use_fast=True
+            )
+            # Load model with proper parameter assignment to avoid warnings
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                self.detection_model = AutoModelForObjectDetection.from_pretrained(
+                    self.detection_ckpt,
+                    torch_dtype=torch.float32 if self.device == "cpu" else torch.float16,
+                    low_cpu_mem_usage=True if self.device == "cpu" else False
+                ).to(self.device)
             print("Fashion object detection model loaded successfully")
         except Exception as e:
             print(f"Error loading fashion detection model: {e}")
         try:
             self.encoder_ckpt = "yainage90/fashion-image-feature-extractor"
             self.encoder_config = SwinConfig.from_pretrained(self.encoder_ckpt)
+            # Use fast processor to avoid warnings
+            self.encoder_image_processor = AutoImageProcessor.from_pretrained(
+                self.encoder_ckpt,
+                use_fast=True
+            )
             # Create the encoder with proper configuration - use from_pretrained directly
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                self.feature_encoder = ImageEncoder.from_pretrained(self.encoder_ckpt).to(self.device)
+                # Set appropriate dtype after loading
+                if self.device == "cpu":
+                    self.feature_encoder = self.feature_encoder.float()
+                else:
+                    self.feature_encoder = self.feature_encoder.half()
             # Setup image transforms for feature extraction
             self.transform = v2.Compose([
             self.feature_encoder = None
             self.transform = None
+        # Initialize basic image captioning as fallback with CPU optimization
         try:
+            # Configure model kwargs for CPU optimization
+            model_kwargs = {}
+            if self.device == "cpu":
+                model_kwargs["low_cpu_mem_usage"] = True
+                model_kwargs["torch_dtype"] = torch.float32
+            else:
+                model_kwargs["torch_dtype"] = torch.float16
             self.image_to_text = pipeline(
                 "image-to-text",
                 model="Salesforce/blip-image-captioning-base",
+                device=0 if torch.cuda.is_available() else -1,
+                model_kwargs=model_kwargs
             )
             print("Basic image captioning model loaded successfully")
         except Exception as e:
             0: 'bag', 1: 'bottom', 2: 'dress', 3: 'hat', 4: 'shoes', 5: 'outer', 6: 'top'
         }
+        # Set models to evaluation mode for inference optimization
+        if self.detection_model:
+            self.detection_model.eval()
+        if self.feature_encoder:
+            self.feature_encoder.eval()
     def process_image_from_bytes(self, image_bytes):
         """Process image bytes and return PIL Image"""
         image = Image.open(io.BytesIO(image_bytes))
             return {"error": "Fashion detection model not available"}
         try:
+            # Use inference mode for better performance
+            with torch.inference_mode():
                 inputs = self.detection_processor(images=[image], return_tensors="pt")
+                # Move inputs to device efficiently
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                outputs = self.detection_model(**inputs)
                 target_sizes = torch.tensor([[image.size[1], image.size[0]]])
                 results = self.detection_processor.post_process_object_detection(
                     outputs, threshold=0.4, target_sizes=target_sizes
             # Transform image for feature extraction
             image_tensor = self.transform(image)
+            # Use inference mode for better performance
+            with torch.inference_mode():
                 embedding = self.feature_encoder(image_tensor.unsqueeze(0).to(self.device))
             return {