# Spaces: DawnC / VisionScout
# Running on Zero
# File size: 27,941 Bytes

import os
import numpy as np
import torch
import cv2
from PIL import Image
import tempfile
import uuid
from typing import Dict, List, Any, Optional, Tuple

from detection_model import DetectionModel
from color_mapper import ColorMapper
from visualization_helper import VisualizationHelper
from evaluation_metrics import EvaluationMetrics
from lighting_analyzer import LightingAnalyzer
from scene_analyzer import SceneAnalyzer
from places365_model import Places365Model

class ImageProcessor:
    """
    Class for handling image processing and object detection operations
    Separates processing logic from UI components
    """

    def __init__(self, use_llm=True, llm_model_path=None, enable_places365=True, places365_model_name='resnet50_places365'):
        """
        Initialize the image processor and its analysis components.

        ColorMapper and LightingAnalyzer are required. Places365Model and
        SceneAnalyzer are best-effort: if either constructor raises, the error
        is printed and the processor continues with that component set to None
        (Places365 is additionally switched off via ``enable_places365``).

        Args:
            use_llm: Forwarded to SceneAnalyzer as ``use_llm``.
            llm_model_path: Forwarded to SceneAnalyzer as ``llm_model_path``.
            enable_places365: Whether to try to construct the Places365 model.
            places365_model_name: ``model_name`` passed to Places365Model.

        Raises:
            RuntimeError: If a required component fails to initialize. The
                original exception is attached as ``__cause__``.
        """
        print(f"Initializing ImageProcessor with use_llm={use_llm}, enable_places365={enable_places365}")

        try:
            # Plain configuration first, so later steps can rely on it
            self.use_llm = use_llm
            self.llm_model_path = llm_model_path
            self.enable_places365 = enable_places365
            self.model_instances = {}  # model name -> cached DetectionModel

            self.color_mapper = ColorMapper()
            print("ColorMapper initialized successfully")

            self.lighting_analyzer = LightingAnalyzer()
            print("LightingAnalyzer initialized successfully")

            # Places365 is optional: fall back to "disabled" on any failure
            self.places365_model = None
            if self.enable_places365:
                try:
                    self.places365_model = Places365Model(
                        model_name=places365_model_name,
                        device=None
                    )
                    print(f"Places365 model initialized successfully with {places365_model_name}")
                except Exception as e:
                    print(f"Warning: Failed to initialize Places365 model: {e}")
                    print("Continuing without Places365 analysis")
                    self.enable_places365 = False
                    self.places365_model = None

            # SceneAnalyzer is optional too; analyze_scene() recreates it lazily
            self.scene_analyzer = None
            self.class_names = None  # Will be set when first model is loaded

            try:
                # class_names is unknown until a detection model has run
                self.scene_analyzer = SceneAnalyzer(
                    class_names=None,
                    use_llm=self.use_llm,
                    use_clip=True,
                    enable_landmark=True,
                    llm_model_path=self.llm_model_path
                )
                print("SceneAnalyzer initialized successfully")

                # Report which sub-components the analyzer exposes
                print(f"SceneAnalyzer status - spatial_analyzer: {hasattr(self.scene_analyzer, 'spatial_analyzer')}, "
                    f"descriptor: {hasattr(self.scene_analyzer, 'descriptor')}, "
                    f"scene_describer: {hasattr(self.scene_analyzer, 'scene_describer')}")

            except Exception as e:
                print(f"Error initializing SceneAnalyzer: {e}")
                import traceback
                traceback.print_exc()
                self.scene_analyzer = None

            print("ImageProcessor initialization completed successfully")

        except Exception as e:
            print(f"Critical error during ImageProcessor initialization: {e}")
            import traceback
            traceback.print_exc()
            # Chain explicitly so the root cause stays attached to the error
            raise RuntimeError(f"Failed to initialize ImageProcessor: {str(e)}") from e

    def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
        """
        Get or create a model instance based on model name

        Args:
            model_name: Name of the model to use
            confidence: Confidence threshold for detection
            iou: IoU threshold for non-maximum suppression

        Returns:
            DetectionModel instance
        """
        if model_name not in self.model_instances:
            print(f"Creating new model instance for {model_name}")
            self.model_instances[model_name] = DetectionModel(
                model_name=model_name,
                confidence=confidence,
                iou=iou
            )
        else:
            print(f"Using existing model instance for {model_name}")
            self.model_instances[model_name].confidence = confidence

        return self.model_instances[model_name]

    def analyze_scene(self, detection_result: Any, lighting_info: Optional[Dict] = None, enable_landmark=True, places365_info=None) -> Dict:
        """
        Perform scene analysis on detection results

        Args:
            detection_result: Object detection result from YOLOv8
            lighting_info: Lighting condition analysis results (optional)
            enable_landmark: Whether to enable landmark detection
            places365_info: Places365 analysis results (optional)

        Returns:
            Dictionary containing scene analysis results
        """
        print(f"DEBUG: analyze_scene received enable_landmark={enable_landmark}")
        try:
            # Check if detection_result has valid names
            class_names = getattr(detection_result, 'names', None) if detection_result else None

            # Initialize or reinitialize scene analyzer if needed
            if self.scene_analyzer is None:
                print("Scene analyzer not initialized, creating new instance")
                self.scene_analyzer = SceneAnalyzer(
                    class_names=class_names,
                    use_llm=self.use_llm,
                    use_clip=True,
                    enable_landmark=enable_landmark,
                    llm_model_path=self.llm_model_path
                )

                if self.scene_analyzer is None:
                    raise ValueError("Failed to create SceneAnalyzer instance")
            else:
                # Update existing scene analyzer settings
                self.scene_analyzer.enable_landmark = enable_landmark

                # Update class names if available and different
                if class_names and self.scene_analyzer.class_names != class_names:
                    self.scene_analyzer.class_names = class_names
                    if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
                        self.scene_analyzer.spatial_analyzer.class_names = class_names

                # Update landmark detection settings in child components
                if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
                    self.scene_analyzer.spatial_analyzer.enable_landmark = enable_landmark

            # Perform scene analysis with lighting info and Places365 context
            scene_analysis = self.scene_analyzer.analyze(
                detection_result=detection_result,
                lighting_info=lighting_info,
                class_confidence_threshold=0.35,
                scene_confidence_threshold=0.6,
                enable_landmark=enable_landmark,
                places365_info=places365_info
            )

            return scene_analysis

        except Exception as e:
            print(f"Error in scene analysis: {str(e)}")
            import traceback
            traceback.print_exc()

            # Return a valid default result
            return {
                "scene_type": "unknown",
                "confidence": 0.0,
                "description": f"Error during scene analysis: {str(e)}",
                "enhanced_description": "Scene analysis could not be completed due to an error.",
                "objects_present": [],
                "object_count": 0,
                "regions": {},
                "possible_activities": [],
                "safety_concerns": [],
                "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
            }

    def analyze_lighting_conditions(self, image, places365_info: Optional[Dict] = None):
        """
        分析光照條件並考慮 Places365 場景資訊。

        Args:
            image: 輸入圖像
            places365_info: Places365 場景分析結果，用於覆蓋邏輯

        Returns:
            Dict: 光照分析結果
        """
        return self.lighting_analyzer.analyze(image, places365_info=places365_info)

    def analyze_places365_scene(self, image):
        """
        Analyze scene using Places365 model.

        Args:
            image: Input image (PIL Image)

        Returns:
            Dict: Places365 analysis results or None if disabled/failed
        """
        if not self.enable_places365 or self.places365_model is None:
            return None

        try:
            if not isinstance(image, Image.Image):
                if isinstance(image, np.ndarray):
                    image = Image.fromarray(image)
                else:
                    print(f"Warning: Cannot process image of type {type(image)} for Places365")
                    return None

            places365_result = self.places365_model.predict(image)

            if places365_result and places365_result.get('confidence', 0) > 0.1:
                print(f"Places365 detected: {places365_result['scene_label']} "
                    f"(mapped: {places365_result['mapped_scene_type']}) "
                    f"confidence: {places365_result['confidence']:.3f}")
                return places365_result
            else:
                print("Places365 analysis failed or low confidence")
                return None

        except Exception as e:
            print(f"Error in Places365 analysis: {str(e)}")
            return None

    def process_image(self, image: Any, model_name: str, confidence_threshold: float, filter_classes: Optional[List[int]] = None,  enable_landmark: bool = True) -> Tuple[Any, str, Dict]:
        """
        Process an image for object detection and scene analysis.
        Args:
            image: Input image (numpy array or PIL Image).
            model_name: Name of the model to use.
            confidence_threshold: Confidence threshold for detection.
            filter_classes: Optional list of classes to filter results.
            enable_landmark: Whether to enable landmark detection for this run.
        Returns:
            Tuple of (result_image_pil, result_text, stats_data_with_scene_analysis).
        """
        model_instance = self.get_model_instance(model_name, confidence_threshold)
        if model_instance is None:
            return None, f"Failed to load model: {model_name}. Please check model configuration.", {}

        result = None
        stats_data = {}
        temp_path = None
        pil_image_for_processing = None # Use this to store the consistently processed PIL image

        try:
            if isinstance(image, np.ndarray):
                if image.ndim == 3 and image.shape[2] == 3: # RGB or BGR
                    # Assuming BGR from OpenCV, convert to RGB for PIL standard
                    image_rgb_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    pil_image_for_processing = Image.fromarray(image_rgb_np)
                elif image.ndim == 3 and image.shape[2] == 4: # RGBA or BGRA
                    image_rgba_np = cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA) # Ensure RGBA
                    pil_image_for_processing = Image.fromarray(image_rgba_np).convert("RGB") # Convert to RGB
                elif image.ndim == 2: # Grayscale
                    pil_image_for_processing = Image.fromarray(image).convert("RGB")
                else:
                    pil_image_for_processing = Image.fromarray(image) # Hope for the best
            elif isinstance(image, Image.Image):
                pil_image_for_processing = image.copy() # Use a copy
            elif image is None:
                return None, "No image provided. Please upload an image.", {}
            else:
                return None, f"Unsupported image type: {type(image)}. Please provide a NumPy array or PIL Image.", {}

            if pil_image_for_processing.mode != "RGB": # Ensure final image is RGB
                pil_image_for_processing = pil_image_for_processing.convert("RGB")

            # Add Places365 scene analysis parallel to lighting analysis
            places365_info = self.analyze_places365_scene(pil_image_for_processing)

            lighting_info = self.analyze_lighting_conditions(pil_image_for_processing, places365_info=places365_info)

            temp_dir = tempfile.gettempdir()
            temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
            temp_path = os.path.join(temp_dir, temp_filename)
            pil_image_for_processing.save(temp_path, format="JPEG")

            result = model_instance.detect(temp_path)

            if result is None or not hasattr(result, 'boxes'):
                scene_analysis_no_yolo = self.analyze_scene(result, lighting_info, enable_landmark=enable_landmark, places365_info=places365_info)
                desc_no_yolo = scene_analysis_no_yolo.get("enhanced_description", scene_analysis_no_yolo.get("description", "Detection failed, scene context analysis attempted."))
                stats_data["scene_analysis"] = scene_analysis_no_yolo
                if places365_info:
                    stats_data["places365_analysis"] = places365_info
                return pil_image_for_processing, desc_no_yolo, stats_data

            # 統計資訊
            stats_data = EvaluationMetrics.calculate_basic_stats(result)
            spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
            stats_data["spatial_metrics"] = spatial_metrics
            stats_data["lighting_conditions"] = lighting_info
            if places365_info:
                stats_data["places365_analysis"] = places365_info

            if filter_classes and len(filter_classes) > 0:
                classes = result.boxes.cls.cpu().numpy().astype(int)
                confs = result.boxes.conf.cpu().numpy()
                mask = np.isin(classes, filter_classes)
                filtered_stats_data = {
                    "total_objects": int(np.sum(mask)), "class_statistics": {},
                    "average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0.0,
                    "spatial_metrics": stats_data.get("spatial_metrics",{}),
                    "lighting_conditions": lighting_info
                }
                if places365_info:
                    filtered_stats_data["places365_analysis"] = places365_info
                names = result.names
                class_conf_sums = {}
                for cls_id_int, conf_val in zip(classes[mask], confs[mask]):
                    cls_name = names[cls_id_int]
                    if cls_name not in filtered_stats_data["class_statistics"]:
                        filtered_stats_data["class_statistics"][cls_name] = {"count": 0}
                        class_conf_sums[cls_name] = 0.0
                    filtered_stats_data["class_statistics"][cls_name]["count"] += 1 # 累計統計資訊
                    class_conf_sums[cls_name] += conf_val
                for cls_name_stat, data_stat in filtered_stats_data["class_statistics"].items():
                    data_stat["average_confidence"] = round(class_conf_sums[cls_name_stat] / data_stat["count"] if data_stat["count"] > 0 else 0.0, 4)
                stats_data = filtered_stats_data

            viz_data = EvaluationMetrics.generate_visualization_data(result, self.color_mapper.get_all_colors())

            result_image_pil = VisualizationHelper.visualize_detection(
                temp_path, result, color_mapper=self.color_mapper,
                figsize=(12, 12), return_pil=True, filter_classes=filter_classes
            )

            result_text_summary = EvaluationMetrics.format_detection_summary(viz_data)

            #  Pass the enable_landmark parameter from function signature
            # Initialize or update scene analyzer if needed
            if self.scene_analyzer is None:
                print("Creating SceneAnalyzer in process_image")
                self.scene_analyzer = SceneAnalyzer(
                    class_names=result.names if result else None,
                    use_llm=self.use_llm,
                    use_clip=True,
                    enable_landmark=enable_landmark,
                    llm_model_path=self.llm_model_path
                )

                if self.scene_analyzer is None:
                    print("ERROR: Failed to create SceneAnalyzer in process_image")
            else:
                # Update existing scene analyzer with current settings
                if result and hasattr(result, 'names'):
                    self.scene_analyzer.class_names = result.names
                    if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
                        self.scene_analyzer.spatial_analyzer.class_names = result.names

                self.scene_analyzer.enable_landmark = enable_landmark
                if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
                    self.scene_analyzer.spatial_analyzer.enable_landmark = enable_landmark

            # Perform scene analysis using the existing analyze_scene method
            scene_analysis_result = self.analyze_scene(
                detection_result=result,
                lighting_info=lighting_info,
                enable_landmark=enable_landmark,
                places365_info=places365_info
            )

            stats_data["scene_analysis"] = scene_analysis_result

            final_result_text = result_text_summary

            # Use enable_landmark parameter for landmark block
            if enable_landmark and "detected_landmarks" in scene_analysis_result:
                landmarks_detected = scene_analysis_result.get("detected_landmarks", [])
                if not landmarks_detected and scene_analysis_result.get("primary_landmark"):
                    primary_lm = scene_analysis_result.get("primary_landmark")
                    if isinstance(primary_lm, dict): landmarks_detected = [primary_lm]

                if landmarks_detected:
                    final_result_text += "\n\n--- Detected Landmarks ---\n"
                    # Ensure drawing on the correct PIL image
                    img_to_draw_on = result_image_pil.copy() # Draw on a copy
                    img_for_drawing_cv2 = cv2.cvtColor(np.array(img_to_draw_on), cv2.COLOR_RGB2BGR)

                    for landmark_item in landmarks_detected:
                        if not isinstance(landmark_item, dict): continue

                        # Use .get() for all potentially missing keys 比較保險
                        landmark_name_disp = landmark_item.get("class_name", landmark_item.get("name", "N/A"))
                        landmark_loc_disp = landmark_item.get("location", "N/A")
                        landmark_conf_disp = landmark_item.get("confidence", 0.0)

                        final_result_text += f"• {landmark_name_disp} ({landmark_loc_disp}, confidence: {landmark_conf_disp:.2f})\n"

                        if "box" in landmark_item:
                            box = landmark_item["box"]
                            pt1 = (int(box[0]), int(box[1])); pt2 = (int(box[2]), int(box[3]))
                            color_lm = (255, 0, 255); thickness_lm = 3 # Magenta BGR
                            cv2.rectangle(img_for_drawing_cv2, pt1, pt2, color_lm, thickness_lm)

                            label_lm = f"{landmark_name_disp} ({landmark_conf_disp:.2f})"
                            font_scale_lm = 0.6; font_thickness_lm = 1
                            (w_text, h_text), baseline = cv2.getTextSize(label_lm, cv2.FONT_HERSHEY_SIMPLEX, font_scale_lm, font_thickness_lm)

                            # Label position logic (simplified from your extensive one for brevity)
                            label_y_pos = pt1[1] - baseline - 3
                            if label_y_pos < h_text : # If label goes above image, put it below box
                                label_y_pos = pt2[1] + h_text + baseline + 3

                            label_bg_pt1 = (pt1[0], label_y_pos - h_text - baseline)
                            label_bg_pt2 = (pt1[0] + w_text, label_y_pos + baseline)

                            cv2.rectangle(img_for_drawing_cv2, label_bg_pt1, label_bg_pt2, color_lm, -1)
                            cv2.putText(img_for_drawing_cv2, label_lm, (pt1[0], label_y_pos),
                                        cv2.FONT_HERSHEY_SIMPLEX, font_scale_lm, (255,255,255), font_thickness_lm, cv2.LINE_AA)

                    result_image_pil = Image.fromarray(cv2.cvtColor(img_for_drawing_cv2, cv2.COLOR_BGR2RGB))

            return result_image_pil, final_result_text, stats_data

        except Exception as e:
            error_message = f"Error in ImageProcessor.process_image: {str(e)}"
            import traceback
            traceback.print_exc()
            return pil_image_for_processing if pil_image_for_processing else None, error_message, {}
        finally:
            if temp_path and os.path.exists(temp_path):
                try: os.remove(temp_path)
                except Exception as e: print(f"Warning: Cannot delete temp file {temp_path}: {str(e)}")

    def format_result_text(self, stats: Dict) -> str:
        """
        Format detection statistics into readable text with improved spacing

        Args:
            stats: Dictionary containing detection statistics

        Returns:
            Formatted text summary
        """
        if not stats or "total_objects" not in stats:
            return "No objects detected."

        # 減少不必要的空行
        lines = [
            f"Detected {stats['total_objects']} objects.",
            f"Average confidence: {stats.get('average_confidence', 0):.2f}",
            "Objects by class:"
        ]

        if "class_statistics" in stats and stats["class_statistics"]:
            # 按計數排序類別
            sorted_classes = sorted(
                stats["class_statistics"].items(),
                key=lambda x: x[1]["count"],
                reverse=True
            )

            for cls_name, cls_stats in sorted_classes:
                count = cls_stats["count"]
                conf = cls_stats.get("average_confidence", 0)

                item_text = "item" if count == 1 else "items"
                lines.append(f"• {cls_name}: {count} {item_text} (avg conf: {conf:.2f})")
        else:
            lines.append("No class information available.")

        # 添加空間資訊
        if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
            lines.append("Object Distribution:")

            dist = stats["spatial_metrics"]["spatial_distribution"]
            x_mean = dist.get("x_mean", 0)
            y_mean = dist.get("y_mean", 0)

            # 描述物體的大致位置
            if x_mean < 0.33:
                h_pos = "on the left side"
            elif x_mean < 0.67:
                h_pos = "in the center"
            else:
                h_pos = "on the right side"

            if y_mean < 0.33:
                v_pos = "in the upper part"
            elif y_mean < 0.67:
                v_pos = "in the middle"
            else:
                v_pos = "in the lower part"

            lines.append(f"• Most objects appear {h_pos} {v_pos} of the image")

        return "\n".join(lines)

    def format_json_for_display(self, stats: Dict) -> Dict:
        """
        Format statistics JSON for better display

        Args:
            stats: Raw statistics dictionary

        Returns:
            Formatted statistics structure for display
        """
        # Create a cleaner copy of the stats for display
        display_stats = {}

        # Add summary section
        display_stats["summary"] = {
            "total_objects": stats.get("total_objects", 0),
            "average_confidence": round(stats.get("average_confidence", 0), 3)
        }

        # Add class statistics in a more organized way
        if "class_statistics" in stats and stats["class_statistics"]:
            # Sort classes by count (descending)
            sorted_classes = sorted(
                stats["class_statistics"].items(),
                key=lambda x: x[1].get("count", 0),
                reverse=True
            )

            class_stats = {}
            for cls_name, cls_data in sorted_classes:
                class_stats[cls_name] = {
                    "count": cls_data.get("count", 0),
                    "average_confidence": round(cls_data.get("average_confidence", 0), 3)
                }

            display_stats["detected_objects"] = class_stats

        # Simplify spatial metrics
        if "spatial_metrics" in stats:
            spatial = stats["spatial_metrics"]

            # Simplify spatial distribution
            if "spatial_distribution" in spatial:
                dist = spatial["spatial_distribution"]
                display_stats["spatial"] = {
                    "distribution": {
                        "x_mean": round(dist.get("x_mean", 0), 3),
                        "y_mean": round(dist.get("y_mean", 0), 3),
                        "x_std": round(dist.get("x_std", 0), 3),
                        "y_std": round(dist.get("y_std", 0), 3)
                    }
                }

            # Add simplified size information
            if "size_distribution" in spatial:
                size = spatial["size_distribution"]
                display_stats["spatial"]["size"] = {
                    "mean_area": round(size.get("mean_area", 0), 3),
                    "min_area": round(size.get("min_area", 0), 3),
                    "max_area": round(size.get("max_area", 0), 3)
                }

        return display_stats

    def prepare_visualization_data(self, stats: Dict, available_classes: Dict[int, str]) -> Dict:
        """
        Prepare data for visualization based on detection statistics

        Args:
            stats: Detection statistics
            available_classes: Dictionary of available class IDs and names

        Returns:
            Visualization data dictionary
        """
        if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
            return {"error": "No detection data available"}

        # Prepare visualization data
        viz_data = {
            "total_objects": stats.get("total_objects", 0),
            "average_confidence": stats.get("average_confidence", 0),
            "class_data": []
        }

        # Class data
        for cls_name, cls_stats in stats.get("class_statistics", {}).items():
            # Search class ID
            class_id = -1
            for id, name in available_classes.items():
                if name == cls_name:
                    class_id = id
                    break

            cls_data = {
                "name": cls_name,
                "class_id": class_id,
                "count": cls_stats.get("count", 0),
                "average_confidence": cls_stats.get("average_confidence", 0),
                "color": self.color_mapper.get_color(class_id if class_id >= 0 else cls_name)
            }

            viz_data["class_data"].append(cls_data)

        # Descending order
        viz_data["class_data"].sort(key=lambda x: x["count"], reverse=True)

        return viz_data