Spaces:

DawnC
/

VisionScout

Running on Zero

File size: 62,938 Bytes

4d1f920


import torch
import clip
from PIL import Image
import numpy as np
from typing import List, Dict, Tuple, Optional, Union, Any

from landmark_data import ALL_LANDMARKS, get_all_landmark_prompts

class CLIPZeroShotClassifier:
    """
    Zero-shot classifier built on a CLIP model, focused on recognising
    world-famous landmarks.

    Intended as a complement to YOLO detection: it handles landmark buildings
    that standard object detection cannot identify.
    """
    def __init__(self, model_name: str = "ViT-L/14", device: Optional[str] = None):
        """
        Initialise the CLIP zero-shot landmark classifier.

        Loads the CLIP model, sets the tunable parameters and the results
        cache, then loads the landmark prompts and pre-computes their
        normalised text features.

        Args:
            model_name: Name of the CLIP model to load. Defaults to "ViT-L/14".
            device: Device to run on. When None, "cuda" is used if
                torch.cuda.is_available() returns True, otherwise "cpu".

        Raises:
            Exception: Any error raised by clip.load is printed and re-raised.
        """
        # Select the execution device
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        print(f"Initializing CLIP Zero-Shot Landmark Classifier ({model_name}) on {self.device}")
        try:
            self.model, self.preprocess = clip.load(model_name, device=self.device)
            print("Successfully loaded CLIP model")
        except Exception as e:
            print(f"Error loading CLIP model: {e}")
            raise

        # Tunables and cache are set BEFORE the landmark data is loaded so
        # that the fallback branch below still leaves a fully usable object
        # (e.g. _manage_cache needs cache_max_size).
        self.batch_size = 16  # default batch size
        self.confidence_threshold_multipliers = {
            "close_up": 0.9,     # close-up views
            "partial": 0.6,      # partially visible landmarks
            "distant": 0.5,      # distant views
            "full_image": 0.7    # whole-image search
        }

        self.landmark_type_thresholds = {
            "tower": 0.5,         # tower-type structures
            "skyscraper": 0.4,    # skyscrapers
            "building": 0.55,     # generic buildings
            "monument": 0.5,      # monuments
            "natural": 0.6        # natural landmarks
        }

        # Results cache, keyed by tuples that start with an image hash
        self.results_cache = {}
        self.cache_max_size = 100  # maximum number of cached entries

        # Load landmark data
        try:
            self.landmark_data = ALL_LANDMARKS
            self.landmark_prompts = get_all_landmark_prompts()
            print(f"Loaded {len(self.landmark_prompts)} landmark prompts for classification")

            # Pre-compute the text features of all landmark prompts
            self.landmark_text_features = self._precompute_text_features(self.landmark_prompts)

            # Map landmark id -> position for fast lookup
            self.landmark_id_to_index = {landmark_id: i for i, landmark_id in enumerate(ALL_LANDMARKS.keys())}

        except ImportError:
            print("Warning: landmark_data.py not found. Landmark classification will be limited")
            self.landmark_data = {}
            self.landmark_prompts = []
            self.landmark_text_features = None
            self.landmark_id_to_index = {}

    def _get_image_hash(self, image):
        """
        為圖像生成簡單的 hash 值用於快取

        Args:
            image: PIL Image 或 numpy 數組

        Returns:
            str: 圖像的 hash 值
        """
        if isinstance(image, np.ndarray):
            # 對於 numpy 數組，降採樣並計算簡單 hash
            small_img = image[::10, ::10] if image.ndim == 3 else image
            return hash(small_img.tobytes())
        else:
            # 對於 PIL 圖像，調整大小後轉換為 bytes
            small_img = image.resize((32, 32))
            return hash(small_img.tobytes())

    def _manage_cache(self):
        """
        管理結果快取大小
        """
        if len(self.results_cache) > self.cache_max_size:
            oldest_key = next(iter(self.results_cache))
            del self.results_cache[oldest_key]

    def set_batch_size(self, batch_size: int):
        """
        設置批處理大小

        Args:
            batch_size: 新的批處理大小
        """
        self.batch_size = max(1, batch_size)
        print(f"Batch size set to {self.batch_size}")


    def adjust_confidence_threshold(self, detection_type: str, multiplier: float):
        """
        調整特定檢測類型的置信度閾值乘數

        Args:
            detection_type: 檢測類型 ('close_up', 'partial', 'distant', 'full_image')
            multiplier: 置信度閾值乘數
        """
        if detection_type in self.confidence_threshold_multipliers:
            self.confidence_threshold_multipliers[detection_type] = max(0.1, min(1.5, multiplier))
            print(f"Adjusted confidence threshold multiplier for {detection_type} to {multiplier}")
        else:
            print(f"Unknown detection type: {detection_type}")


    def _precompute_text_features(self, text_prompts: List[str]) -> torch.Tensor:
        """
        預計算文本提示的CLIP特徵，提高批處理效率

        Args:
            text_prompts: 文本提示列表

        Returns:
            torch.Tensor: 預計算的文本特徵
        """
        if not text_prompts:
            return None

        with torch.no_grad():
            # Process in batches to avoid CUDA memory issues
            batch_size = 128  # Adjust based on GPU memory
            features_list = []

            for i in range(0, len(text_prompts), batch_size):
                batch_prompts = text_prompts[i:i+batch_size]
                text_tokens = clip.tokenize(batch_prompts).to(self.device)
                batch_features = self.model.encode_text(text_tokens)
                batch_features = batch_features / batch_features.norm(dim=-1, keepdim=True)
                features_list.append(batch_features)

            # Concatenate all batches
            if len(features_list) > 1:
                text_features = torch.cat(features_list, dim=0)
            else:
                text_features = features_list[0]

        return text_features

    def _perform_pyramid_analysis(self,
                         image: Union[Image.Image, np.ndarray],
                         levels: int = 4,
                         base_threshold: float = 0.25,
                         aspect_ratios: List[float] = [1.0, 0.75, 1.5]) -> Dict[str, Any]:
        """
        Performs multi-scale pyramid analysis on the image to improve landmark detection.

        Args:
            image: Input image
            levels: Number of pyramid levels
            base_threshold: Base confidence threshold
            aspect_ratios: Different aspect ratios to try (for tall buildings vs wide landscapes)

        Returns:
            Dict: Results of pyramid analysis
        """
        # Ensure image is PIL format
        if not isinstance(image, Image.Image):
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            else:
                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

        width, height = image.size
        pyramid_results = []

        # 對每個縮放和縱橫比組合進行處理
        for level in range(levels):
            # 計算縮放因子
            scale_factor = 1.0 - (level * 0.2)

            for aspect_ratio in aspect_ratios:
                # 計算新尺寸，保持面積近似不變
                if aspect_ratio != 1.0:
                    # 保持面積近似不變的情況下調整縱橫比
                    new_width = int(width * scale_factor * (1/aspect_ratio)**0.5)
                    new_height = int(height * scale_factor * aspect_ratio**0.5)
                else:
                    new_width = int(width * scale_factor)
                    new_height = int(height * scale_factor)

                # 調整圖像大小
                scaled_image = image.resize((new_width, new_height), Image.LANCZOS)

                # 預處理圖像
                image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)

                # 獲取圖像特徵
                with torch.no_grad():
                    image_features = self.model.encode_image(image_input)
                    image_features = image_features / image_features.norm(dim=-1, keepdim=True)

                    # 計算相似度
                    similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
                    similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]

                # 找到最佳匹配
                best_idx = similarity.argmax().item()
                best_score = similarity[best_idx]

                if best_score >= base_threshold:
                    landmark_id = list(self.landmark_data.keys())[best_idx]
                    landmark_info = self.landmark_data[landmark_id]

                    pyramid_results.append({
                        "landmark_id": landmark_id,
                        "landmark_name": landmark_info["name"],
                        "confidence": float(best_score),
                        "scale_factor": scale_factor,
                        "aspect_ratio": aspect_ratio,
                        "location": landmark_info["location"]
                    })

        # 按置信度排序
        pyramid_results.sort(key=lambda x: x["confidence"], reverse=True)

        return {
            "is_landmark": len(pyramid_results) > 0,
            "results": pyramid_results,
            "best_result": pyramid_results[0] if pyramid_results else None
        }

    def _enhance_features(self, image: Union[Image.Image, np.ndarray]) -> Image.Image:
        """
        Enhances image features to improve landmark detection.

        The image is converted to LAB colour space, the L (lightness) channel
        is contrast-stretched between its 2nd and 98th percentiles, and the
        result is converted back to RGB. Grayscale input is returned
        unchanged. If scikit-image is not installed, or any processing step
        fails, a warning is printed and the input image is returned.

        Args:
            image: Input image (PIL Image or numpy array)

        Returns:
            PIL.Image: Enhanced image

        Raises:
            ValueError: If image is neither a PIL Image nor a numpy array.
        """
        # Ensure image is PIL format
        if not isinstance(image, Image.Image):
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            else:
                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

        # Convert to numpy for processing
        img_array = np.array(image)

        # Skip processing for grayscale images
        if img_array.ndim < 3:
            return image

        try:
            # Imported inside the try block so that a missing scikit-image
            # falls through to the ImportError handler below.
            from skimage import color, exposure

            # Keep only the first three channels (drops alpha for RGBA)
            lab = color.rgb2lab(img_array[:, :, :3] / 255.0)
            l_channel = lab[:, :, 0]

            # Stretch the L channel between its 2nd and 98th percentiles.
            # out_range is given explicitly: LAB lightness spans 0..100,
            # whereas the default out_range for float input is 0..1, which
            # would darken the whole image.
            p2, p98 = np.percentile(l_channel, (2, 98))
            lab[:, :, 0] = exposure.rescale_intensity(
                l_channel, in_range=(p2, p98), out_range=(0.0, 100.0)
            )

            # Convert back to 8-bit RGB
            enhanced_img = (color.lab2rgb(lab) * 255.0).astype(np.uint8)

            return Image.fromarray(enhanced_img)
        except ImportError:
            print("Warning: skimage not available for feature enhancement")
            return image
        except Exception as e:
            print(f"Error in feature enhancement: {e}")
            return image

    def _determine_landmark_type(self, landmark_id):
        """
        自動判斷地標類型，基於地標數據和命名

        Returns:
            str: 地標類型，用於調整閾值
        """
        if not landmark_id:
            return "building"  # 預設類型

        # 獲取地標詳細數據
        landmark_data = self.landmark_data if hasattr(self, 'landmark_data') else {}
        landmark_info = landmark_data.get(landmark_id, {})

        # 獲取地標相關文本
        landmark_id_lower = landmark_id.lower()
        landmark_name = landmark_info.get("name", "").lower()
        landmark_location = landmark_info.get("location", "").lower()
        landmark_aliases = [alias.lower() for alias in landmark_info.get("aliases", [])]

        # 合併所有文本數據用於特徵判斷
        combined_text = " ".join([landmark_id_lower, landmark_name] + landmark_aliases)

        # 地標類型的特色特徵
        type_features = {
            "skyscraper": ["skyscraper", "tall", "tower", "高樓", "摩天", "大厦", "タワー"],
            "tower": ["tower", "bell", "clock", "塔", "鐘樓", "タワー", "campanile"],
            "monument": ["monument", "memorial", "statue", "紀念", "雕像", "像", "memorial"],
            "natural": ["mountain", "lake", "canyon", "falls", "beach", "山", "湖", "峽谷", "瀑布", "海灘"],
            "temple": ["temple", "shrine", "寺", "神社", "廟"],
            "palace": ["palace", "castle", "宮", "城", "皇宮", "宫殿"],
            "distinctive": ["unique", "leaning", "slanted", "傾斜", "斜", "獨特", "傾く"]
        }

        # 檢查是否位於亞洲地區
        asian_regions = ["china", "japan", "korea", "taiwan", "singapore", "vietnam", "thailand",
                        "hong kong", "中國", "日本", "韓國", "台灣", "新加坡", "越南", "泰國", "香港"]
        is_asian = any(region in landmark_location for region in asian_regions)

        # 判斷地標類型
        best_type = None
        max_matches = 0

        for type_name, features in type_features.items():
            # 計算特徵詞匹配數量
            matches = sum(1 for feature in features if feature in combined_text)
            if matches > max_matches:
                max_matches = matches
                best_type = type_name

        # 處理亞洲地區特例
        if is_asian and best_type == "tower":
            best_type = "skyscraper"  # 亞洲地區的塔型建築閾值較低

        # 特例處理：檢測傾斜建築
        if any(term in combined_text for term in ["leaning", "slanted", "tilt", "inclined", "斜", "傾斜"]):
            return "distinctive"  # 傾斜建築需要特殊處理

        return best_type if best_type and max_matches > 0 else "building"  # 預設為一般建築

    def classify_image_region(self,
                    image: Union[Image.Image, np.ndarray],
                    box: List[float],
                    threshold: float = 0.25,
                    detection_type: str = "close_up") -> Dict[str, Any]:
        """
        對圖像的特定區域進行地標分類，具有增強的多尺度和部分識別能力

        Args:
            image: 原始圖像 (PIL Image 或 numpy數組)
            box: 邊界框 [x1, y1, x2, y2]
            threshold: 基礎分類置信度閾值
            detection_type: 檢測類型，影響置信度調整

        Returns:
            Dict: 地標分類結果
        """
        # 確保圖像是PIL格式
        if not isinstance(image, Image.Image):
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            else:
                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

        # 生成圖像區域的hash用於快取
        region_key = (self._get_image_hash(image), tuple(box), detection_type)
        if region_key in self.results_cache:
            return self.results_cache[region_key]

        # 裁剪區域
        x1, y1, x2, y2 = map(int, box)
        cropped_image = image.crop((x1, y1, x2, y2))
        enhanced_image = self._enhance_features(cropped_image)

        # 分析視角信息
        viewpoint_info = self._analyze_viewpoint(enhanced_image)
        dominant_viewpoint = viewpoint_info["dominant_viewpoint"]

        # 計算區域信息
        region_width = x2 - x1
        region_height = y2 - y1
        image_width, image_height = image.size

        # 根據區域大小判斷可能的檢測類型
        region_area_ratio = (region_width * region_height) / (image_width * image_height)
        if detection_type == "auto":
            if region_area_ratio > 0.5:
                detection_type = "close_up"
            elif region_area_ratio > 0.2:
                detection_type = "partial"
            else:
                detection_type = "distant"

        # 根據視角調整檢測類型
        if dominant_viewpoint == "close_up" and detection_type != "close_up":
            detection_type = "close_up"
        elif dominant_viewpoint == "distant" and detection_type != "distant":
            detection_type = "distant"
        elif dominant_viewpoint == "angled_view":
            detection_type = "partial"  # 角度視圖可能是部分可見

        # 調整置信度閾值
        base_multiplier = self.confidence_threshold_multipliers.get(detection_type, 1.0)
        adjusted_threshold = threshold * base_multiplier

        # 調整多尺度處理的尺度範圍和縱橫比 - 增強對傾斜建築的支持
        scales = [1.0]  # 默認尺度

        # 基於視角選擇合適的尺度和縱橫比
        if detection_type in ["partial", "distant"]:
            scales = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3]  # 標準範圍

        # 如果是特殊視角，進一步調整尺度和縱橫比 - 新增
        if dominant_viewpoint in ["angled_view", "low_angle"]:
            scales = [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4]  # 更寬的範圍

        # 準備縱橫比 - 同時支持水平和垂直地標
        aspect_ratios = [1.0, 0.8, 1.2]  # 標準縱橫比

        # 針對可能的傾斜建築增加更多縱橫比 - 新增
        if dominant_viewpoint in ["angled_view", "unique_feature"]:
            aspect_ratios = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5]  # 更多樣的縱橫比

        best_result = {
            "landmark_id": None,
            "landmark_name": None,
            "confidence": 0.0,
            "is_landmark": False
        }

        # 多尺度和縱橫比分析
        for scale in scales:
            for aspect_ratio in aspect_ratios:
                # 縮放裁剪區域
                current_width, current_height = cropped_image.size

                # 計算新尺寸，保持面積不變但調整縱橫比
                if aspect_ratio != 1.0:
                    new_width = int(current_width * scale * (1/aspect_ratio)**0.5)
                    new_height = int(current_height * scale * aspect_ratio**0.5)
                else:
                    new_width = int(current_width * scale)
                    new_height = int(current_height * scale)

                # 確保尺寸至少為1像素
                new_width = max(1, new_width)
                new_height = max(1, new_height)

                # 縮放圖像
                try:
                    scaled_image = cropped_image.resize((new_width, new_height), Image.LANCZOS)
                except Exception as e:
                    print(f"Failed to resize image to {new_width}x{new_height}: {e}")
                    continue

                # 預處理裁剪圖像
                try:
                    image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
                except Exception as e:
                    print(f"Failed to preprocess image: {e}")
                    continue

                # 獲取圖像特徵
                with torch.no_grad():
                    try:
                        image_features = self.model.encode_image(image_input)
                        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

                        # 計算與地標提示的相似度
                        similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
                        similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]

                        # 找到最佳匹配
                        best_idx = similarity.argmax().item()
                        best_score = similarity[best_idx]

                        # 如果當前尺度結果更好，則更新
                        if best_score > best_result["confidence"]:
                            landmark_id = list(self.landmark_data.keys())[best_idx]
                            landmark_info = self.landmark_data[landmark_id]

                            best_result = {
                                "landmark_id": landmark_id,
                                "landmark_name": landmark_info["name"],
                                "location": landmark_info["location"],
                                "confidence": float(best_score),
                                "is_landmark": best_score >= adjusted_threshold,
                                "scale_used": scale,
                                "aspect_ratio_used": aspect_ratio,
                                "viewpoint": dominant_viewpoint
                            }

                            # 添加額外可用信息
                            for key in ["year_built", "architectural_style", "significance"]:
                                if key in landmark_info:
                                    best_result[key] = landmark_info[key]
                    except Exception as e:
                        print(f"Error in calculating similarity: {e}")
                        continue

        # 只有在有識別出地標ID且信心度足夠高時才應用地標類型閾值調整
        if best_result["landmark_id"]:
            landmark_type = self._determine_landmark_type(best_result["landmark_id"])

            # 檢測是否為特殊類型的建築如斜塔
            if landmark_type == "distinctive":
                # 特殊建築的閾值降低25%
                type_multiplier = 0.75
            else:
                # 使用已有的類型閾值
                type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5

            # 更新判斷是否為地標的標準
            final_threshold = adjusted_threshold * type_multiplier
            best_result["is_landmark"] = best_result["confidence"] >= final_threshold
            best_result["landmark_type"] = landmark_type  # 添加地標類型信息
            best_result["threshold_applied"] = final_threshold  # 記錄應用的閾值

        # 快取結果
        self.results_cache[region_key] = best_result
        self._manage_cache()

        return best_result

    def classify_batch_regions(self,
                              image: Union[Image.Image, np.ndarray],
                              boxes: List[List[float]],
                              threshold: float = 0.28) -> List[Dict[str, Any]]:
        """
        批量處理多個圖像區域，提高效率

        Args:
            image: 原始圖像
            boxes: 邊界框列表
            threshold: 置信度閾值

        Returns:
            List[Dict]: 分類結果列表
        """
        if not self.landmark_text_features is not None:
            return [{"is_landmark": False, "confidence": 0.0} for _ in boxes]

        # 確保圖像是PIL格式
        if not isinstance(image, Image.Image):
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            else:
                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

        # 無框可處理時
        if not boxes:
            return []

        # 裁剪並預處理所有區域
        cropped_inputs = []
        for box in boxes:
            x1, y1, x2, y2 = map(int, box)
            cropped_image = image.crop((x1, y1, x2, y2))
            processed_image = self.preprocess(cropped_image).unsqueeze(0)
            cropped_inputs.append(processed_image)

        # batch process
        batch_tensor = torch.cat(cropped_inputs).to(self.device)

        # batch encoding
        with torch.no_grad():
            image_features = self.model.encode_image(batch_tensor)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)

            # 計算相似度
            similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
            similarity = similarity.cpu().numpy() if self.device == "cuda" else similarity.numpy()

        # 處理每個區域的結果
        results = []
        for i, sim in enumerate(similarity):
            best_idx = sim.argmax().item()
            best_score = sim[best_idx]

            if best_score >= threshold:
                landmark_id = list(self.landmark_data.keys())[best_idx]
                landmark_info = self.landmark_data[landmark_id]

                results.append({
                    "landmark_id": landmark_id,
                    "landmark_name": landmark_info["name"],
                    "location": landmark_info["location"],
                    "confidence": float(best_score),
                    "is_landmark": True,
                    "box": boxes[i]
                })
            else:
                results.append({
                    "landmark_id": None,
                    "landmark_name": None,
                    "confidence": float(best_score),
                    "is_landmark": False,
                    "box": boxes[i]
                })

        return results

    def search_entire_image(self,
                        image: Union[Image.Image, np.ndarray],
                        threshold: float = 0.35,
                        detailed_analysis: bool = False) -> Dict[str, Any]:
        """
        檢查整張圖像是否包含地標，具有增強的分析能力

        Args:
            image: 原始圖像
            threshold: 置信度閾值
            detailed_analysis: 是否進行詳細分析，包括多區域檢測

        Returns:
            Dict: 地標分類結果
        """
        # 確保圖像是PIL格式
        if not isinstance(image, Image.Image):
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            else:
                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

        # 檢查快取
        image_key = (self._get_image_hash(image), "entire_image", detailed_analysis)
        if image_key in self.results_cache:
            return self.results_cache[image_key]

        # 調整閾值
        adjusted_threshold = threshold * self.confidence_threshold_multipliers.get("full_image", 1.0)

        # 預處理圖像
        image_input = self.preprocess(image).unsqueeze(0).to(self.device)

        # 獲取圖像特徵
        with torch.no_grad():
            image_features = self.model.encode_image(image_input)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)

            # 計算與地標提示的相似度
            similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
            similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]

        # 找到最佳匹配
        best_idx = similarity.argmax().item()
        best_score = similarity[best_idx]

        # top3 landmark
        top_indices = similarity.argsort()[-3:][::-1]
        top_landmarks = []

        for idx in top_indices:
            score = similarity[idx]
            landmark_id = list(self.landmark_data.keys())[idx]
            landmark_info = self.landmark_data[landmark_id]

            landmark_result = {
                "landmark_id": landmark_id,
                "landmark_name": landmark_info["name"],
                "location": landmark_info["location"],
                "confidence": float(score)
            }

            # 添加額外可用信息
            if "year_built" in landmark_info:
                landmark_result["year_built"] = landmark_info["year_built"]
            if "architectural_style" in landmark_info:
                landmark_result["architectural_style"] = landmark_info["architectural_style"]
            if "significance" in landmark_info:
                landmark_result["significance"] = landmark_info["significance"]

            top_landmarks.append(landmark_result)

        # main result
        result = {}
        if best_score >= adjusted_threshold:
            landmark_id = list(self.landmark_data.keys())[best_idx]
            landmark_info = self.landmark_data[landmark_id]

            # 應用地標類型特定閾值
            landmark_type = self._determine_landmark_type(landmark_id)
            type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5
            final_threshold = adjusted_threshold * type_multiplier

            if best_score >= final_threshold:
                result = {
                    "landmark_id": landmark_id,
                    "landmark_name": landmark_info["name"],
                    "location": landmark_info["location"],
                    "confidence": float(best_score),
                    "is_landmark": True,
                    "landmark_type": landmark_type,
                    "top_landmarks": top_landmarks
                }

                # 添加額外可用信息
                if "year_built" in landmark_info:
                    result["year_built"] = landmark_info["year_built"]
                if "architectural_style" in landmark_info:
                    result["architectural_style"] = landmark_info["architectural_style"]
                if "significance" in landmark_info:
                    result["significance"] = landmark_info["significance"]
            else:
                result = {
                    "landmark_id": None,
                    "landmark_name": None,
                    "confidence": float(best_score),
                    "is_landmark": False,
                    "top_landmarks": top_landmarks
                }

        # 如果請求詳細分析且是地標，進一步分析圖像區域
        if detailed_analysis and result.get("is_landmark", False):
            # 創建不同區域進行更深入分析
            width, height = image.size
            regions = [
                # 中心區域
                [width * 0.25, height * 0.25, width * 0.75, height * 0.75],
                # 左半部
                [0, 0, width * 0.5, height],
                # 右半部
                [width * 0.5, 0, width, height],
                # 上半部
                [0, 0, width, height * 0.5],
                # 下半部
                [0, height * 0.5, width, height]
            ]

            region_results = []
            for i, box in enumerate(regions):
                region_result = self.classify_image_region(
                    image,
                    box,
                    threshold=threshold * 0.9,
                    detection_type="partial"
                )
                if region_result["is_landmark"]:
                    region_result["region_name"] = ["center", "left", "right", "top", "bottom"][i]
                    region_results.append(region_result)

            # 添加區域分析結果
            if region_results:
                result["region_analyses"] = region_results

        # 快取結果
        self.results_cache[image_key] = result
        self._manage_cache()

        return result

    def enhanced_landmark_detection(self,
                              image: Union[Image.Image, np.ndarray],
                              threshold: float = 0.3) -> Dict[str, Any]:
        """
        Enhanced landmark detection using multiple analysis techniques.

        Args:
            image: Input image
            threshold: Base confidence threshold

        Returns:
            Dict: Comprehensive landmark detection results
        """
        # Ensure image is PIL format
        if not isinstance(image, Image.Image):
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            else:
                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

        # Phase 1: Analyze viewpoint to adjust detection parameters
        viewpoint_info = self._analyze_viewpoint(image)
        viewpoint = viewpoint_info["dominant_viewpoint"]

        # Adjust threshold based on viewpoint
        if viewpoint == "distant":
            adjusted_threshold = threshold * 0.7  # Lower threshold for distant views
        elif viewpoint == "close_up":
            adjusted_threshold = threshold * 1.1  # Higher threshold for close-ups
        else:
            adjusted_threshold = threshold

        # Phase 2: Perform multi-scale pyramid analysis
        pyramid_results = self._perform_pyramid_analysis(image, levels=3, base_threshold=adjusted_threshold)

        # Phase 3: Perform grid-based region analysis
        grid_results = []
        width, height = image.size

        # Create adaptive grid based on viewpoint
        if viewpoint == "distant":
            grid_size = 3  # Coarser grid for distant views
        elif viewpoint == "close_up":
            grid_size = 5  # Finer grid for close-ups
        else:
            grid_size = 4  # Default grid size

        # Generate grid regions
        for i in range(grid_size):
            for j in range(grid_size):
                box = [
                    width * (j/grid_size),
                    height * (i/grid_size),
                    width * ((j+1)/grid_size),
                    height * ((i+1)/grid_size)
                ]

                # Apply feature enhancement
                region_result = self.classify_image_region(
                    image,
                    box,
                    threshold=adjusted_threshold,
                    detection_type="auto"
                )

                if region_result["is_landmark"]:
                    region_result["grid_position"] = (i, j)
                    grid_results.append(region_result)

        # Phase 4: Cross-validate and combine results
        all_detections = []

        # Add pyramid results
        if pyramid_results["is_landmark"] and pyramid_results["best_result"]:
            all_detections.append({
                "source": "pyramid",
                "landmark_id": pyramid_results["best_result"]["landmark_id"],
                "landmark_name": pyramid_results["best_result"]["landmark_name"],
                "confidence": pyramid_results["best_result"]["confidence"],
                "scale_factor": pyramid_results["best_result"].get("scale_factor", 1.0)
            })

        # Add grid results
        for result in grid_results:
            all_detections.append({
                "source": "grid",
                "landmark_id": result["landmark_id"],
                "landmark_name": result["landmark_name"],
                "confidence": result["confidence"],
                "grid_position": result.get("grid_position", (0, 0))
            })

        # Search entire image
        full_image_result = self.search_entire_image(image, threshold=adjusted_threshold)
        if full_image_result and full_image_result.get("is_landmark", False):
            all_detections.append({
                "source": "full_image",
                "landmark_id": full_image_result["landmark_id"],
                "landmark_name": full_image_result["landmark_name"],
                "confidence": full_image_result["confidence"]
            })

        # Group by landmark_id and calculate aggregate confidence
        landmark_groups = {}
        for detection in all_detections:
            landmark_id = detection["landmark_id"]
            if landmark_id not in landmark_groups:
                landmark_groups[landmark_id] = {
                    "landmark_id": landmark_id,
                    "landmark_name": detection["landmark_name"],
                    "detections": [],
                    "sources": set()
                }

            landmark_groups[landmark_id]["detections"].append(detection)
            landmark_groups[landmark_id]["sources"].add(detection["source"])

        # Calculate aggregate confidence for each landmark
        for landmark_id, group in landmark_groups.items():
            detections = group["detections"]

            # Base confidence is the maximum confidence from any source
            max_confidence = max(d["confidence"] for d in detections)

            # Bonus for detection from multiple sources
            source_count = len(group["sources"])
            source_bonus = min(0.15, (source_count - 1) * 0.05)  # Up to 15% bonus

            # Consistency bonus for multiple detections of the same landmark
            detection_count = len(detections)
            consistency_bonus = min(0.1, (detection_count - 1) * 0.02)  # Up to 10% bonus

            # Calculate final confidence
            aggregate_confidence = min(1.0, max_confidence + source_bonus + consistency_bonus)

            group["confidence"] = aggregate_confidence
            group["detection_count"] = detection_count
            group["source_count"] = source_count

        # Sort landmarks by confidence
        sorted_landmarks = sorted(
            landmark_groups.values(),
            key=lambda x: x["confidence"],
            reverse=True
        )

        return {
            "is_landmark_scene": len(sorted_landmarks) > 0,
            "detected_landmarks": sorted_landmarks,
            "viewpoint_info": viewpoint_info,
            "primary_landmark": sorted_landmarks[0] if sorted_landmarks else None
        }

    def _analyze_architectural_features(self, image):
        """
        Analyzes the architectural features of a structure in the image without hardcoding specific landmarks.

        Args:
            image: Input image

        Returns:
            Dict: Architectural feature analysis results
        """
        # Define universal architectural feature prompts that apply to all types of landmarks
        architecture_prompts = {
            "tall_structure": "a tall vertical structure standing alone",
            "tiered_building": "a building with multiple stacked tiers or segments",
            "historical_structure": "a building with historical architectural elements",
            "modern_design": "a modern structure with contemporary architectural design",
            "segmented_exterior": "a structure with visible segmented or sectioned exterior",
            "viewing_platform": "a tall structure with observation area at the top",
            "time_display": "a structure with timepiece features",
            "glass_facade": "a building with prominent glass exterior surfaces",
            "memorial_structure": "a monument or memorial structure",
            "ancient_construction": "ancient constructed elements or archaeological features",
            "natural_landmark": "a natural geographic formation or landmark",
            "slanted_design": "a structure with non-vertical or leaning profile"
        }

        # Calculate similarity scores against universal architectural patterns
        context_scores = self.calculate_similarity_scores(image, architecture_prompts)

        # Determine most relevant architectural features
        top_features = sorted(context_scores.items(), key=lambda x: x[1], reverse=True)[:3]

        # Calculate feature confidence
        context_confidence = sum(score for _, score in top_features) / 3

        # Determine primary architectural category based on top features
        architectural_categories = {
            "tower": ["tall_structure", "viewing_platform", "time_display"],
            "skyscraper": ["tall_structure", "modern_design", "glass_facade"],
            "historical": ["historical_structure", "ancient_construction", "memorial_structure"],
            "natural": ["natural_landmark"],
            "distinctive": ["tiered_building", "segmented_exterior", "slanted_design"]
        }

        # Score each category based on the top features
        category_scores = {}
        for category, features in architectural_categories.items():
            category_score = 0
            for feature, score in context_scores.items():
                if feature in features:
                    category_score += score
            category_scores[category] = category_score

        primary_category = max(category_scores.items(), key=lambda x: x[1])[0]

        return {
            "architectural_features": top_features,
            "context_confidence": context_confidence,
            "primary_category": primary_category,
            "category_scores": category_scores
        }

    def intelligent_landmark_search(self,
                                image: Union[Image.Image, np.ndarray],
                                yolo_boxes: Optional[List[List[float]]] = None,
                                base_threshold: float = 0.25) -> Dict[str, Any]:
        """
        Search an image for landmarks by combining whole-image, YOLO-box and grid analysis.

        Steps, in order:
            1. Classify the whole image with ``search_entire_image``. When no YOLO boxes
               were supplied and nothing was found, fall back to ``_perform_pyramid_analysis``.
            2. When the two best whole-image candidates are within 0.1 confidence of each
               other, re-rank them with ``_analyze_architectural_features``.
            3. Classify every YOLO box with ``classify_image_region``.
            4. When the best confidence so far is below 0.5, classify a 5x5 grid of regions.

        Errors raised by the individual analysis steps are printed and that step is skipped.

        Args:
            image: PIL image or numpy array (arrays are converted with ``Image.fromarray``).
            yolo_boxes: Optional bounding boxes from YOLO; each box is passed unchanged to
                ``classify_image_region``. Any sized sequence is accepted.
            base_threshold: Base confidence threshold. It is scaled by 0.85 for the
                whole-image pass when there are no YOLO boxes, and by 0.9 for the grid pass.

        Returns:
            Dict with the keys:
                "full_image_analysis": the whole-image result dict, or {} if there is none.
                "is_landmark_scene": True when at least one landmark was detected.
                "detected_landmarks": detections sorted by confidence, highest first.
                "primary_landmark": the first detection (only when any exist).
                "primary_landmark_activities": activities of the whole-image landmark
                    (only when ``_extract_landmark_specific_info`` reports some).
                "clip_analysis_on_full_image": the whole-image "clip_analysis" entry
                    (only when present and a landmark was detected).

        Raises:
            ValueError: If ``image`` is neither a PIL image nor a numpy array.
        """
        import traceback

        # Make sure we work on a PIL image from here on.
        if not isinstance(image, Image.Image):
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            else:
                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

        # Explicit None/len test so array-like box containers do not hit ambiguous truthiness.
        has_yolo_boxes = yolo_boxes is not None and len(yolo_boxes) > 0

        # Without YOLO boxes, lower the threshold slightly to improve recall.
        actual_threshold = base_threshold if has_yolo_boxes else base_threshold * 0.85

        # Step 1: whole-image analysis, with a pyramid fallback when it finds nothing.
        try:
            full_image_result = self.search_entire_image(
                image,
                threshold=actual_threshold,
                detailed_analysis=True
            )

            found_on_full_image = bool(full_image_result) and full_image_result.get("is_landmark", False)
            if not has_yolo_boxes and not found_on_full_image:
                print("No YOLO boxes provided, attempting multi-scale pyramid analysis")
                pyramid_hit = self._pyramid_fallback_result(image, actual_threshold)
                if pyramid_hit is not None:
                    full_image_result = pyramid_hit
        except Exception as e:
            print(f"Error in search_entire_image: {e}")
            traceback.print_exc()
            full_image_result = None

        result = {
            "full_image_analysis": full_image_result if full_image_result else {},
            "is_landmark_scene": False,
            "detected_landmarks": []
        }
        detected_landmarks = result["detected_landmarks"]

        # Step 2: context-aware comparison when the two best candidates are nearly tied.
        top_landmarks = (full_image_result or {}).get("top_landmarks") or []
        if len(top_landmarks) >= 2 and abs(top_landmarks[0]["confidence"] - top_landmarks[1]["confidence"]) < 0.1:
            try:
                self._rerank_close_candidates(image, full_image_result, top_landmarks)
            except Exception as e:
                print(f"Error in architectural feature analysis: {e}")
                traceback.print_exc()

        # Record the whole-image landmark, enriched with its stored details.
        if full_image_result and full_image_result.get("is_landmark", False):
            result["is_landmark_scene"] = True
            landmark_id = full_image_result.get("landmark_id", "unknown")
            landmark_specific_info = self._extract_landmark_specific_info(landmark_id)

            landmark_info = {
                "landmark_id": landmark_id,
                "landmark_name": full_image_result.get("landmark_name", "Unknown Landmark"),
                "confidence": full_image_result.get("confidence", 0.0),
                "location": full_image_result.get("location", "Unknown Location"),
                "region_type": "full_image",
                "box": [0, 0, image.width, image.height]
            }
            # Stored details (including the canonical name, when present) take precedence.
            landmark_info.update(landmark_specific_info)
            detected_landmarks.append(landmark_info)

            if landmark_specific_info.get("has_specific_activities", False):
                result["primary_landmark_activities"] = landmark_specific_info.get("landmark_specific_activities", [])
                print(f"Set primary landmark activities: {len(result['primary_landmark_activities'])} activities for {landmark_info['landmark_name']}")

        # Step 3: analyse each YOLO box; a better score replaces an existing entry's score/box.
        if has_yolo_boxes:
            for box in yolo_boxes:
                try:
                    box_result = self.classify_image_region(
                        image,
                        box,
                        threshold=base_threshold,
                        detection_type="auto"
                    )
                    self._merge_region_detection(detected_landmarks, box_result, box, "yolo_box", True)
                except Exception as e:
                    print(f"Error in analyzing YOLO box: {e}")
                    continue

        # Step 4: grid search, only when nothing (or only low-confidence results) was found.
        best_confidence = max((landmark.get("confidence", 0) for landmark in detected_landmarks), default=0)
        if best_confidence < 0.5:
            try:
                width, height = image.size
                if width > 0 and height > 0:
                    # 5x5 grid, row by row.
                    grid_boxes = [
                        [width * (j/5), height * (i/5), width * ((j+1)/5), height * ((i+1)/5)]
                        for i in range(5)
                        for j in range(5)
                    ]
                    for box in grid_boxes:
                        try:
                            grid_result = self.classify_image_region(
                                image,
                                box,
                                threshold=base_threshold * 0.9,  # slightly lower threshold for grid cells
                                detection_type="partial"
                            )
                            # Grid hits only add new landmarks; existing entries are left untouched.
                            self._merge_region_detection(detected_landmarks, grid_result, box, "grid", False)
                        except Exception as e:
                            print(f"Error in analyzing grid region: {e}")
                            continue
            except Exception as e:
                print(f"Error in grid search: {e}")
                traceback.print_exc()

        # Highest confidence first.
        detected_landmarks.sort(key=lambda x: x.get("confidence", 0), reverse=True)

        if detected_landmarks:
            result["is_landmark_scene"] = True
            result["primary_landmark"] = detected_landmarks[0]

            # Pass the whole-image CLIP analysis along as extra context for downstream consumers.
            if full_image_result and "clip_analysis" in full_image_result:
                result["clip_analysis_on_full_image"] = full_image_result["clip_analysis"]

        return result

    def _pyramid_fallback_result(self, image, threshold):
        """Run pyramid analysis; return a whole-image style result dict, or None if nothing beat ``threshold``."""
        try:
            pyramid_results = self._perform_pyramid_analysis(
                image,
                levels=4,
                base_threshold=threshold,
                aspect_ratios=[1.0, 0.75, 1.5, 0.5, 2.0]
            )
            if not pyramid_results or not pyramid_results.get("is_landmark", False):
                return None

            # "best_result" may be present but None, so fall back to an empty dict.
            best = pyramid_results.get("best_result") or {}
            if best.get("confidence", 0) > threshold:
                replacement = {
                    "is_landmark": True,
                    "landmark_id": best["landmark_id"],
                    "landmark_name": best["landmark_name"],
                    "confidence": best["confidence"],
                    "location": best.get("location", "Unknown Location")
                }
                print(f"Pyramid analysis detected landmark: {best['landmark_name']} with confidence {best['confidence']:.3f}")
                return replacement
        except Exception as e:
            print(f"Error in pyramid analysis: {e}")
        return None

    def _rerank_close_candidates(self, image, full_image_result, top_landmarks):
        """Boost the two best candidates that match the image's architectural profile, then re-sort in place."""
        analysis = self._analyze_architectural_features(image)
        top_features = analysis.get("architectural_features", [])
        primary_category = analysis.get("primary_category", "")

        # Generic terms looked for inside the landmark id, per architectural category / feature.
        category_terms = {
            "tower": ("tower", "spire", "needle"),
            "skyscraper": ("building", "skyscraper", "tall"),
            "historical": ("monument", "castle", "palace", "temple"),
            "distinctive": ("unusual", "unique", "special", "famous")
        }
        feature_terms = {
            "time_display": "clock",
            "segmented_exterior": "segmented",
            "slanted_design": "leaning"
        }

        boosted = []
        for landmark in top_landmarks[:2]:
            landmark_id = landmark.get("landmark_id", "").lower()
            confidence_boost = 0

            if any(term in landmark_id for term in category_terms.get(primary_category, ())):
                confidence_boost += 0.05

            for feature, _score in top_features:
                term = feature_terms.get(feature)
                if term is not None and term in landmark_id:
                    confidence_boost += 0.03

            if confidence_boost > 0:
                landmark["confidence"] += confidence_boost
                boosted.append(landmark)
                print(f"Boosted {landmark['landmark_name']} confidence by {confidence_boost:.2f} based on architectural features ({primary_category})")

        # Sort on the raw boosted values first so clamping cannot create artificial ties.
        top_landmarks.sort(key=lambda x: x["confidence"], reverse=True)
        for landmark in boosted:
            landmark["confidence"] = min(1.0, landmark["confidence"])

        full_image_result["top_landmarks"] = top_landmarks
        best = top_landmarks[0]
        full_image_result["landmark_id"] = best["landmark_id"]
        full_image_result["landmark_name"] = best["landmark_name"]
        full_image_result["confidence"] = best["confidence"]
        full_image_result["location"] = best.get("location", "Unknown Location")

    @staticmethod
    def _merge_region_detection(detected_landmarks, region_result, box, region_type, update_existing):
        """Add a positive region result to ``detected_landmarks``, de-duplicating by landmark id."""
        if not region_result or not region_result.get("is_landmark", False):
            return

        for existing in detected_landmarks:
            if existing.get("landmark_id") == region_result.get("landmark_id"):
                # Same landmark already listed: optionally keep the better score and its box.
                if update_existing and region_result.get("confidence", 0) > existing.get("confidence", 0):
                    existing.update({
                        "confidence": region_result.get("confidence", 0),
                        "region_type": region_type,
                        "box": box
                    })
                return

        detected_landmarks.append({
            "landmark_id": region_result.get("landmark_id", "unknown"),
            "landmark_name": region_result.get("landmark_name", "Unknown Landmark"),
            "confidence": region_result.get("confidence", 0.0),
            "location": region_result.get("location", "Unknown Location"),
            "region_type": region_type,
            "box": box
        })

    def _extract_landmark_specific_info(self, landmark_id: str) -> Dict[str, Any]:
        """
        提取特定地標的詳細信息，包括特色模板和活動建議

        Args:
            landmark_id: 地標ID

        Returns:
            Dict: 地標特定信息
        """
        if not landmark_id or landmark_id == "unknown":
            return {"has_specific_activities": False}

        specific_info = {"has_specific_activities": False}

        # 從 ALL_LANDMARKS 或 self.landmark_data 中提取基本信息
        landmark_data_source = None

        # 優先嘗試從類屬性獲取
        if hasattr(self, 'landmark_data') and self.landmark_data and landmark_id in self.landmark_data:
            landmark_data_source = self.landmark_data[landmark_id]
            print(f"Using landmark data from class attribute for {landmark_id}")
        else:
            try:
                if landmark_id in ALL_LANDMARKS:
                    landmark_data_source = ALL_LANDMARKS[landmark_id]
                    print(f"Using landmark data from ALL_LANDMARKS for {landmark_id}")
            except ImportError:
                print("Warning: Could not import ALL_LANDMARKS from landmark_data")
            except Exception as e:
                print(f"Error accessing ALL_LANDMARKS: {e}")

        # 處理地標基本數據
        if landmark_data_source:
            # 提取正確的地標名稱
            if "name" in landmark_data_source:
                specific_info["landmark_name"] = landmark_data_source["name"]

            # 提取所有可用的 prompts 作為特色模板
            if "prompts" in landmark_data_source:
                specific_info["feature_templates"] = landmark_data_source["prompts"][:5]
                specific_info["primary_template"] = landmark_data_source["prompts"][0]

            # 提取別名info
            if "aliases" in landmark_data_source:
                specific_info["aliases"] = landmark_data_source["aliases"]

            # 提取位置信息
            if "location" in landmark_data_source:
                specific_info["location"] = landmark_data_source["location"]

            # 提取其他相關信息
            for key in ["year_built", "architectural_style", "significance", "description"]:
                if key in landmark_data_source:
                    specific_info[key] = landmark_data_source[key]

        # 嘗試從 LANDMARK_ACTIVITIES 中提取活動建議
        try:
            if landmark_id in LANDMARK_ACTIVITIES:
                activities = LANDMARK_ACTIVITIES[landmark_id]
                specific_info["landmark_specific_activities"] = activities
                specific_info["has_specific_activities"] = True
                print(f"Found {len(activities)} specific activities for landmark {landmark_id}")
            else:
                print(f"No specific activities found for landmark {landmark_id} in LANDMARK_ACTIVITIES")
                specific_info["has_specific_activities"] = False
        except ImportError:
            print("Warning: Could not import LANDMARK_ACTIVITIES from landmark_activities")
            specific_info["has_specific_activities"] = False
        except Exception as e:
            print(f"Error loading landmark activities for {landmark_id}: {e}")
            specific_info["has_specific_activities"] = False

        return specific_info

    def _analyze_viewpoint(self, image: Union[Image.Image, np.ndarray]) -> Dict[str, float]:
        """
        Analyzes the image viewpoint to adjust detection parameters.

        Args:
            image: Input image

        Returns:
            Dict: Viewpoint analysis results
        """
        viewpoint_prompts = {
            "aerial_view": "an aerial view from above looking down",
            "street_level": "a street level view looking up at a tall structure",
            "eye_level": "an eye-level horizontal view of a landmark",
            "distant": "a distant view of a landmark on the horizon",
            "close_up": "a close-up detailed view of architectural features",
            "interior": "an interior view inside a structure"
        }

        # Calculate similarity scores
        viewpoint_scores = self.calculate_similarity_scores(image, viewpoint_prompts)

        # Find dominant viewpoint
        dominant_viewpoint = max(viewpoint_scores.items(), key=lambda x: x[1])

        return {
            "viewpoint_scores": viewpoint_scores,
            "dominant_viewpoint": dominant_viewpoint[0],
            "confidence": dominant_viewpoint[1]
        }

    def calculate_similarity_scores(self, image: Union[Image.Image, np.ndarray],
                                prompts: Dict[str, str]) -> Dict[str, float]:
        """
        Compute the softmax-normalised CLIP similarity between an image and named prompts.

        Image and prompt embeddings are L2-normalised, their dot products are scaled
        by 100 and passed through a softmax across the prompts.

        Args:
            image: PIL image or numpy array (arrays are converted with ``Image.fromarray``).
            prompts: Mapping of {name: prompt text}.

        Returns:
            Dict[str, float]: Score for each prompt name, in the order of ``prompts``.
            An empty ``prompts`` mapping yields an empty dict without running the model.

        Raises:
            ValueError: If ``image`` is neither a PIL image nor a numpy array.
        """
        # Make sure we work on a PIL image.
        if not isinstance(image, Image.Image):
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            else:
                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

        # Nothing to compare against: skip the model forward passes entirely.
        if not prompts:
            return {}

        image_input = self.preprocess(image).unsqueeze(0).to(self.device)
        prompt_tokens = clip.tokenize(list(prompts.values())).to(self.device)

        with torch.no_grad():
            image_features = self.model.encode_image(image_input)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)

            prompt_features = self.model.encode_text(prompt_tokens)
            prompt_features = prompt_features / prompt_features.norm(dim=-1, keepdim=True)

            similarity = (100.0 * image_features @ prompt_features.T).softmax(dim=-1)

        # Always move to CPU before converting: .cpu() is a no-op for CPU tensors and is
        # required for every other device ("cuda:0", "mps", torch.device objects, ...).
        probabilities = similarity.cpu().numpy()[0]

        return {name: float(probabilities[i]) for i, name in enumerate(prompts)}