Spaces:
Sleeping
Sleeping
Andy Lee
commited on
Commit
·
749ea04
1
Parent(s):
d140f7d
feat: use keyboards to interact
Browse files- config.py +7 -55
- geo_bot.py +178 -71
- main.py +120 -201
- mapcrunch_controller.py +110 -141
config.py
CHANGED
@@ -8,54 +8,13 @@ MAPCRUNCH_URL = "https://www.mapcrunch.com"
|
|
8 |
# UI element selectors
|
9 |
SELECTORS = {
|
10 |
"go_button": "#go-button",
|
11 |
-
"options_button": "#options-button",
|
12 |
-
"stealth_checkbox": "#stealth",
|
13 |
-
"urban_checkbox": "#cities",
|
14 |
-
"indoor_checkbox": "#inside",
|
15 |
-
"tour_checkbox": "#tour",
|
16 |
-
"auto_checkbox": "#auto",
|
17 |
"pano_container": "#pano",
|
18 |
-
"map_container": "#map",
|
19 |
"address_element": "#address",
|
20 |
-
"confirm_button": "#confirm-button", # Will be determined dynamically
|
21 |
-
"country_list": "#countrylist",
|
22 |
-
"continent_links": "#continents a",
|
23 |
-
}
|
24 |
-
|
25 |
-
# MapCrunch collection options
|
26 |
-
MAPCRUNCH_OPTIONS = {
|
27 |
-
"urban_only": True, # Show urban areas only
|
28 |
-
"exclude_indoor": True, # Exclude indoor views
|
29 |
-
"stealth_mode": False, # Hide location info during gameplay
|
30 |
-
"tour_mode": False, # 360 degree tour
|
31 |
-
"auto_mode": False, # Automatic slideshow
|
32 |
-
"selected_countries": None, # None means all, or list like ['us', 'gb', 'jp']
|
33 |
-
"selected_continents": None, # None means all, or list like [1, 2] # 1=N.America, 2=Europe, etc
|
34 |
}
|
35 |
|
36 |
# Data collection settings
|
37 |
DATA_COLLECTION_CONFIG = {
|
38 |
-
"
|
39 |
-
"thumbnail_size": (320, 240), # Thumbnail dimensions
|
40 |
-
"save_full_screenshots": False, # Save full resolution screenshots (storage intensive)
|
41 |
-
"extract_address": True, # Extract address/location name
|
42 |
-
"wait_after_go": 3, # Seconds to wait after clicking Go
|
43 |
-
"retry_on_failure": True, # Retry if location fails
|
44 |
-
"max_retries": 3, # Max retries per location
|
45 |
-
}
|
46 |
-
|
47 |
-
# Reference points for coordinate calibration (used in pyautogui coordinate system)
|
48 |
-
REFERENCE_POINTS = {
|
49 |
-
"kodiak": {"lat": 57.7916, "lon": -152.4083},
|
50 |
-
"hobart": {"lat": -42.8833, "lon": 147.3355},
|
51 |
-
}
|
52 |
-
|
53 |
-
# Selenium settings
|
54 |
-
SELENIUM_CONFIG = {
|
55 |
-
"headless": False,
|
56 |
-
"window_size": (1920, 1080),
|
57 |
-
"implicit_wait": 10,
|
58 |
-
"page_load_timeout": 30,
|
59 |
}
|
60 |
|
61 |
# Model configurations
|
@@ -66,27 +25,20 @@ MODELS_CONFIG = {
|
|
66 |
},
|
67 |
"claude-3.5-sonnet": {
|
68 |
"class": "ChatAnthropic",
|
69 |
-
"model_name": "claude-3-5-sonnet-
|
70 |
},
|
71 |
"gemini-1.5-pro": {
|
72 |
"class": "ChatGoogleGenerativeAI",
|
73 |
-
"model_name": "gemini-1.5-pro",
|
|
|
|
|
|
|
|
|
74 |
},
|
75 |
-
}
|
76 |
-
|
77 |
-
# Benchmark settings
|
78 |
-
BENCHMARK_CONFIG = {
|
79 |
-
"rounds_per_model": 50,
|
80 |
-
"data_collection_samples": 200,
|
81 |
-
"screenshot_delay": 2,
|
82 |
-
"click_delay": 1,
|
83 |
}
|
84 |
|
85 |
# Data paths
|
86 |
DATA_PATHS = {
|
87 |
"golden_labels": "data/golden_labels.json",
|
88 |
-
"screenshots": "data/screenshots/",
|
89 |
-
"thumbnails": "data/thumbnails/",
|
90 |
"results": "results/",
|
91 |
-
"screen_regions": "screen_regions.yaml", # Keep for backward compatibility
|
92 |
}
|
|
|
8 |
# UI element selectors
|
9 |
SELECTORS = {
|
10 |
"go_button": "#go-button",
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
"pano_container": "#pano",
|
|
|
12 |
"address_element": "#address",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
}
|
14 |
|
15 |
# Data collection settings
|
16 |
DATA_COLLECTION_CONFIG = {
|
17 |
+
"wait_after_go": 3,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
}
|
19 |
|
20 |
# Model configurations
|
|
|
25 |
},
|
26 |
"claude-3.5-sonnet": {
|
27 |
"class": "ChatAnthropic",
|
28 |
+
"model_name": "claude-3-5-sonnet-20240620",
|
29 |
},
|
30 |
"gemini-1.5-pro": {
|
31 |
"class": "ChatGoogleGenerativeAI",
|
32 |
+
"model_name": "gemini-1.5-pro-latest",
|
33 |
+
},
|
34 |
+
"gemini-2.5-pro": {
|
35 |
+
"class": "ChatGoogleGenerativeAI",
|
36 |
+
"model_name": "gemini-2.5-pro-preview-06-05",
|
37 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
}
|
39 |
|
40 |
# Data paths
|
41 |
DATA_PATHS = {
|
42 |
"golden_labels": "data/golden_labels.json",
|
|
|
|
|
43 |
"results": "results/",
|
|
|
44 |
}
|
geo_bot.py
CHANGED
@@ -1,11 +1,10 @@
|
|
1 |
-
# geo_bot.py (Final Streamlined Version)
|
2 |
-
|
3 |
-
from io import BytesIO
|
4 |
import base64
|
|
|
5 |
import re
|
6 |
-
from
|
7 |
-
from
|
8 |
|
|
|
9 |
from langchain_core.messages import HumanMessage, BaseMessage
|
10 |
from langchain_openai import ChatOpenAI
|
11 |
from langchain_anthropic import ChatAnthropic
|
@@ -13,24 +12,52 @@ from langchain_google_genai import ChatGoogleGenerativeAI
|
|
13 |
|
14 |
from mapcrunch_controller import MapCrunchController
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
First describe the relevant details in the image to do it.
|
19 |
-
List some regions and places where it could be.
|
20 |
-
Choose the most likely Country and City or Specific Location.
|
21 |
-
At the end, in the last line apart from the previous reasoning, write the Latitude and Longitude from that guessed location
|
22 |
-
using the following format, making sure that the coords are valid floats, without anything else and making sure to be consistent with the format:
|
23 |
-
Lat: XX.XXXX, Lon: XX.XXXX
|
24 |
-
"""
|
25 |
|
|
|
|
|
|
|
26 |
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
|
|
|
|
32 |
def __init__(
|
33 |
-
self,
|
|
|
|
|
|
|
|
|
34 |
):
|
35 |
self.model = model(model=model_name)
|
36 |
self.model_name = model_name
|
@@ -42,87 +69,167 @@ class GeoBot:
|
|
42 |
@staticmethod
|
43 |
def pil_to_base64(image: Image) -> str:
|
44 |
buffered = BytesIO()
|
|
|
45 |
image.save(buffered, format="PNG")
|
46 |
return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
for
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
content.append(
|
53 |
{
|
54 |
"type": "image_url",
|
55 |
-
"image_url": {"url": f"data:image/png;base64,{
|
56 |
}
|
57 |
)
|
58 |
-
return HumanMessage(content=content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
-
def
|
61 |
-
|
62 |
-
|
63 |
-
"""
|
64 |
try:
|
65 |
content = response.content.strip()
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
|
76 |
-
|
|
|
77 |
|
78 |
-
|
|
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
84 |
return None
|
85 |
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
return None
|
91 |
|
92 |
-
|
93 |
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
return None
|
107 |
|
108 |
def analyze_image(self, image: Image) -> Optional[Tuple[float, float]]:
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
|
118 |
-
|
|
|
119 |
|
120 |
-
|
121 |
-
|
122 |
-
|
|
|
|
|
123 |
|
124 |
def close(self):
|
125 |
-
"""Cleans up resources."""
|
126 |
if self.controller:
|
127 |
self.controller.close()
|
128 |
|
|
|
|
|
|
|
|
|
1 |
import base64
|
2 |
+
import json
|
3 |
import re
|
4 |
+
from io import BytesIO
|
5 |
+
from typing import Tuple, List, Optional, Dict, Any, Type
|
6 |
|
7 |
+
from PIL import Image
|
8 |
from langchain_core.messages import HumanMessage, BaseMessage
|
9 |
from langchain_openai import ChatOpenAI
|
10 |
from langchain_anthropic import ChatAnthropic
|
|
|
12 |
|
13 |
from mapcrunch_controller import MapCrunchController
|
14 |
|
15 |
+
AGENT_PROMPT_TEMPLATE = """
|
16 |
+
**Mission:** You are an expert geo-location agent. Your goal is to find clues to determine your location within a limited number of steps.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
+
**Current Status:**
|
19 |
+
- **Remaining Steps: {remaining_steps}**
|
20 |
+
- **Available Actions This Turn: {available_actions}**
|
21 |
|
22 |
+
---
|
23 |
+
**Core Principles of an Expert Player:**
|
24 |
+
|
25 |
+
1. **Final Step Rule:** If `remaining_steps` is **exactly 1**, this is your last action and it **MUST be `GUESS`**. Do not use your final step for exploration.
|
26 |
+
2. **Be Decisive:** If you find a key clue (a specific address, a unique landmark, or text identifying a city/region), make a `GUESS` immediately. Don't waste steps.
|
27 |
+
3. **Efficient Exploration:**
|
28 |
+
- At intersections or when the view is unpromising, **pan first** to see all directions before moving.
|
29 |
+
- If a path looks barren, don't get stuck moving forward. It's often smarter to turn around (using `PAN` or `MOVE_BACKWARD`).
|
30 |
+
4. **Understand Your Path (The Arrow Heuristic):** The navigation arrows on the ground show the two directions of the **road**. `MOVE_FORWARD` follows the arrow that appears **physically higher on your screen**. `MOVE_BACKWARD` follows the lower arrow. Use this to navigate predictably.
|
31 |
+
|
32 |
+
---
|
33 |
+
**Context & Task:**
|
34 |
+
You will receive a sequence of images from your journey. The last image is your **CURRENT** view. Analyze the full history and your current view, apply the Core Principles, and decide your next action.
|
35 |
+
|
36 |
+
**Action History:**
|
37 |
+
{history_text}
|
38 |
|
39 |
+
**JSON Output Format:**
|
40 |
+
Your response MUST be a valid JSON object wrapped in ```json ... ```.
|
41 |
+
- For exploration: `{{"reasoning": "...", "action_details": {{"action": "ACTION_NAME"}} }}`
|
42 |
+
- For the final guess: `{{"reasoning": "...", "action_details": {{"action": "GUESS", "lat": <float>, "lon": <float>}} }}`
|
43 |
+
"""
|
44 |
+
|
45 |
+
BENCHMARK_PROMPT = """
|
46 |
+
Analyze the image and determine its geographic coordinates.
|
47 |
+
1. Describe visual clues.
|
48 |
+
2. Suggest potential regions.
|
49 |
+
3. State your most probable location.
|
50 |
+
4. Provide coordinates in the last line in this exact format: `Lat: XX.XXXX, Lon: XX.XXXX`
|
51 |
+
"""
|
52 |
|
53 |
+
|
54 |
+
class GeoBot:
|
55 |
def __init__(
|
56 |
+
self,
|
57 |
+
model: Type,
|
58 |
+
model_name: str,
|
59 |
+
use_selenium: bool = True,
|
60 |
+
headless: bool = False,
|
61 |
):
|
62 |
self.model = model(model=model_name)
|
63 |
self.model_name = model_name
|
|
|
69 |
@staticmethod
|
70 |
def pil_to_base64(image: Image) -> str:
|
71 |
buffered = BytesIO()
|
72 |
+
image.thumbnail((1024, 1024))
|
73 |
image.save(buffered, format="PNG")
|
74 |
return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
75 |
|
76 |
+
def _create_message_with_history(
|
77 |
+
self, prompt: str, image_b64_list: List[str]
|
78 |
+
) -> List[HumanMessage]:
|
79 |
+
"""Creates a message for the LLM that includes text and a sequence of images."""
|
80 |
+
content = [{"type": "text", "text": prompt}]
|
81 |
+
# Add the JSON format instructions right after the main prompt text
|
82 |
+
content.append(
|
83 |
+
{
|
84 |
+
"type": "text",
|
85 |
+
"text": '\n**JSON Output Format:**\nYour response MUST be a valid JSON object wrapped in ```json ... ```.\n- For exploration: `{{"reasoning": "...", "action_details": {{"action": "ACTION_NAME"}} }}`\n- For the final guess: `{{"reasoning": "...", "action_details": {{"action": "GUESS", "lat": <float>, "lon": <float>}} }}`',
|
86 |
+
}
|
87 |
+
)
|
88 |
+
|
89 |
+
for b64_string in image_b64_list:
|
90 |
content.append(
|
91 |
{
|
92 |
"type": "image_url",
|
93 |
+
"image_url": {"url": f"data:image/png;base64,{b64_string}"},
|
94 |
}
|
95 |
)
|
96 |
+
return [HumanMessage(content=content)]
|
97 |
+
|
98 |
+
def _create_llm_message(self, prompt: str, image_b64: str) -> List[HumanMessage]:
|
99 |
+
"""Original method for single-image analysis (benchmark)."""
|
100 |
+
return [
|
101 |
+
HumanMessage(
|
102 |
+
content=[
|
103 |
+
{"type": "text", "text": prompt},
|
104 |
+
{
|
105 |
+
"type": "image_url",
|
106 |
+
"image_url": {"url": f"data:image/png;base64,{image_b64}"},
|
107 |
+
},
|
108 |
+
]
|
109 |
+
)
|
110 |
+
]
|
111 |
|
112 |
+
def _parse_agent_response(self, response: BaseMessage) -> Optional[Dict[str, Any]]:
|
113 |
+
"""
|
114 |
+
Robustly parses JSON from the LLM response, handling markdown code blocks.
|
115 |
+
"""
|
116 |
try:
|
117 |
content = response.content.strip()
|
118 |
+
match = re.search(r"```json\s*(\{.*?\})\s*```", content, re.DOTALL)
|
119 |
+
if match:
|
120 |
+
json_str = match.group(1)
|
121 |
+
else:
|
122 |
+
json_str = content
|
123 |
+
return json.loads(json_str)
|
124 |
+
except (json.JSONDecodeError, AttributeError) as e:
|
125 |
+
print(f"Invalid JSON from LLM: {e}\nFull response was:\n{response.content}")
|
126 |
+
return None
|
127 |
|
128 |
+
def run_agent_loop(self, max_steps: int = 10) -> Optional[Tuple[float, float]]:
|
129 |
+
history: List[Dict[str, Any]] = []
|
130 |
|
131 |
+
for step in range(max_steps, 0, -1):
|
132 |
+
print(f"\n--- Step {max_steps - step + 1}/{max_steps} ---")
|
133 |
|
134 |
+
self.controller.setup_clean_environment()
|
135 |
+
|
136 |
+
screenshot_bytes = self.controller.take_street_view_screenshot()
|
137 |
+
if not screenshot_bytes:
|
138 |
+
print("Failed to take screenshot. Ending agent loop.")
|
139 |
return None
|
140 |
|
141 |
+
current_screenshot_b64 = self.pil_to_base64(
|
142 |
+
Image.open(BytesIO(screenshot_bytes))
|
143 |
+
)
|
144 |
+
available_actions = self.controller.get_available_actions()
|
145 |
+
print(f"Available actions: {available_actions}")
|
146 |
+
|
147 |
+
history_text = ""
|
148 |
+
image_b64_for_prompt = []
|
149 |
+
if not history:
|
150 |
+
history_text = "No history yet. This is the first step."
|
151 |
+
else:
|
152 |
+
for i, h in enumerate(history):
|
153 |
+
history_text += f"--- History Step {i + 1} ---\n"
|
154 |
+
history_text += f"Reasoning: {h.get('reasoning', 'N/A')}\n"
|
155 |
+
history_text += f"Action: {h.get('action_details', {}).get('action', 'N/A')}\n\n"
|
156 |
+
image_b64_for_prompt.append(h["screenshot_b64"])
|
157 |
+
|
158 |
+
image_b64_for_prompt.append(current_screenshot_b64)
|
159 |
+
|
160 |
+
prompt = AGENT_PROMPT_TEMPLATE.format(
|
161 |
+
remaining_steps=step,
|
162 |
+
history_text=history_text,
|
163 |
+
available_actions=json.dumps(available_actions),
|
164 |
+
)
|
165 |
|
166 |
+
message = self._create_message_with_history(prompt, image_b64_for_prompt)
|
167 |
+
response = self.model.invoke(message)
|
|
|
168 |
|
169 |
+
decision = self._parse_agent_response(response)
|
170 |
|
171 |
+
if not decision:
|
172 |
+
print(
|
173 |
+
"Response parsing failed. Using default recovery action: PAN_RIGHT."
|
174 |
+
)
|
175 |
+
decision = {
|
176 |
+
"reasoning": "Recovery due to parsing failure.",
|
177 |
+
"action_details": {"action": "PAN_RIGHT"},
|
178 |
+
}
|
179 |
|
180 |
+
decision["screenshot_b64"] = current_screenshot_b64
|
181 |
+
history.append(decision)
|
182 |
+
|
183 |
+
action_details = decision.get("action_details", {})
|
184 |
+
action = action_details.get("action")
|
185 |
+
print(f"AI Reasoning: {decision.get('reasoning', 'N/A')}")
|
186 |
+
print(f"AI Action: {action}")
|
187 |
+
|
188 |
+
if action == "GUESS":
|
189 |
+
lat, lon = action_details.get("lat"), action_details.get("lon")
|
190 |
+
if lat is not None and lon is not None:
|
191 |
+
return lat, lon
|
192 |
+
elif action == "MOVE_FORWARD":
|
193 |
+
self.controller.move("forward")
|
194 |
+
elif action == "MOVE_BACKWARD":
|
195 |
+
self.controller.move("backward")
|
196 |
+
elif action == "PAN_LEFT":
|
197 |
+
self.controller.pan_view("left")
|
198 |
+
elif action == "PAN_RIGHT":
|
199 |
+
self.controller.pan_view("right")
|
200 |
+
|
201 |
+
print("Max steps reached. Agent did not make a final guess.")
|
202 |
return None
|
203 |
|
204 |
def analyze_image(self, image: Image) -> Optional[Tuple[float, float]]:
|
205 |
+
image_b64 = self.pil_to_base64(image)
|
206 |
+
message = self._create_llm_message(BENCHMARK_PROMPT, image_b64)
|
207 |
+
response = self.model.invoke(message)
|
208 |
+
print(f"\nLLM Response:\n{response.content}")
|
209 |
+
|
210 |
+
content = response.content.strip()
|
211 |
+
last_line = ""
|
212 |
+
for line in reversed(content.split("\n")):
|
213 |
+
if "lat" in line.lower() and "lon" in line.lower():
|
214 |
+
last_line = line
|
215 |
+
break
|
216 |
+
if not last_line:
|
217 |
+
return None
|
218 |
|
219 |
+
numbers = re.findall(r"[-+]?\d*\.\d+|\d+", last_line)
|
220 |
+
if len(numbers) < 2:
|
221 |
+
return None
|
222 |
|
223 |
+
lat, lon = float(numbers[0]), float(numbers[1])
|
224 |
+
return lat, lon
|
225 |
|
226 |
+
def take_screenshot(self) -> Optional[Image.Image]:
|
227 |
+
screenshot_bytes = self.controller.take_street_view_screenshot()
|
228 |
+
if screenshot_bytes:
|
229 |
+
return Image.open(BytesIO(screenshot_bytes))
|
230 |
+
return None
|
231 |
|
232 |
def close(self):
|
|
|
233 |
if self.controller:
|
234 |
self.controller.close()
|
235 |
|
main.py
CHANGED
@@ -1,254 +1,173 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
"""
|
3 |
-
Main entry point for MapCrunch geo-location testing
|
4 |
-
|
5 |
-
Usage:
|
6 |
-
python main.py --mode data --samples 50 --urban --no-indoor # Collect filtered data
|
7 |
-
python main.py --mode benchmark --models gpt-4o claude-3.5-sonnet # Run benchmark
|
8 |
-
python main.py --mode interactive --model gpt-4o # Interactive testing
|
9 |
-
"""
|
10 |
-
|
11 |
import argparse
|
12 |
-
import
|
13 |
-
|
14 |
-
from typing import Dict
|
15 |
|
16 |
from langchain_openai import ChatOpenAI
|
17 |
from langchain_anthropic import ChatAnthropic
|
18 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
19 |
|
20 |
from geo_bot import GeoBot
|
21 |
-
from data_collector import DataCollector
|
22 |
from benchmark import MapGuesserBenchmark
|
23 |
-
from config import MODELS_CONFIG, SUCCESS_THRESHOLD_KM
|
24 |
|
25 |
|
26 |
-
def
|
27 |
-
"""
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
34 |
return
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
# Create bot with Selenium integration
|
41 |
-
with GeoBot(model=model_class, model_name=model_instance, use_selenium=True) as bot:
|
42 |
-
# Setup clean environment
|
43 |
-
if bot.controller:
|
44 |
-
bot.controller.setup_clean_environment()
|
45 |
-
|
46 |
-
for turn in range(turns):
|
47 |
-
print(f"\n{'=' * 50}")
|
48 |
-
print(f"🎯 Turn {turn + 1}/{turns}")
|
49 |
-
print(f"{'=' * 50}")
|
50 |
-
|
51 |
-
try:
|
52 |
-
# Get new location (click Go button)
|
53 |
-
if bot.controller:
|
54 |
-
if not bot.controller.click_go_button():
|
55 |
-
print("❌ Failed to get new location")
|
56 |
-
continue
|
57 |
-
else:
|
58 |
-
print("⚠️ Manual mode: Please click Go button and press Enter")
|
59 |
-
input()
|
60 |
-
|
61 |
-
# Take screenshot and analyze
|
62 |
-
screenshot = bot.take_screenshot()
|
63 |
-
location = bot.analyze_image(screenshot)
|
64 |
-
|
65 |
-
if location is not None:
|
66 |
-
bot.select_map_location(*location, plot=plot)
|
67 |
-
print("✅ Location selected successfully")
|
68 |
-
else:
|
69 |
-
print("❌ Could not determine location")
|
70 |
-
# Select a default location
|
71 |
-
bot.select_map_location(
|
72 |
-
x=bot.map_x + bot.map_w // 2,
|
73 |
-
y=bot.map_y + bot.map_h // 2,
|
74 |
-
plot=plot,
|
75 |
-
)
|
76 |
-
|
77 |
-
# Brief pause between turns
|
78 |
-
sleep(2)
|
79 |
-
|
80 |
-
except KeyboardInterrupt:
|
81 |
-
print(f"\n⏹️ Game stopped by user after {turn + 1} turns")
|
82 |
-
break
|
83 |
-
except Exception as e:
|
84 |
-
print(f"❌ Error in turn {turn + 1}: {e}")
|
85 |
-
continue
|
86 |
|
|
|
|
|
|
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
96 |
|
97 |
-
|
98 |
-
data = collector.collect_samples(samples)
|
99 |
-
print(f"✅ Collected {len(data)} samples successfully")
|
100 |
|
|
|
101 |
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
"""Benchmark mode"""
|
106 |
-
if models is None:
|
107 |
-
models = ["gpt-4o"] # Default model
|
108 |
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
|
|
113 |
|
114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
)
|
|
|
|
|
|
|
120 |
|
121 |
-
|
122 |
|
123 |
-
if summary:
|
124 |
-
print(f"\n📊 Results Summary:")
|
125 |
-
for model, stats in summary.items():
|
126 |
-
print(f"\n🤖 {model}:")
|
127 |
-
print(
|
128 |
-
f" Success Rate (under {SUCCESS_THRESHOLD_KM}km): {stats.get('success_rate', 0) * 100:.1f}%"
|
129 |
-
)
|
130 |
-
print(f" 📏 Average Distance: {stats['average_distance_km']:.1f} km")
|
131 |
-
print(f" 📊 Median Distance: {stats['median_distance_km']:.1f} km")
|
132 |
-
print(f" 🎯 Best: {stats['min_distance_km']:.1f} km")
|
133 |
-
print(f" 📈 Worst: {stats['max_distance_km']:.1f} km")
|
134 |
|
135 |
-
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
|
139 |
def main():
|
140 |
-
parser = argparse.ArgumentParser(
|
141 |
-
description="MapCrunch Geo-Location AI Benchmark",
|
142 |
-
formatter_class=argparse.RawDescriptionHelpFormatter,
|
143 |
-
epilog="""
|
144 |
-
Examples:
|
145 |
-
# Collect training data with filters
|
146 |
-
python main.py --mode data --samples 100 --urban --no-indoor
|
147 |
-
|
148 |
-
# Collect from specific countries
|
149 |
-
python main.py --mode data --samples 50 --countries us gb jp --urban
|
150 |
-
|
151 |
-
# Run benchmark on saved data
|
152 |
-
python main.py --mode benchmark --models gpt-4o claude-3.5-sonnet --samples 20
|
153 |
-
|
154 |
-
# Interactive testing
|
155 |
-
python main.py --mode interactive --model gpt-4o --turns 5 --plot
|
156 |
-
|
157 |
-
# Live benchmark (uses MapCrunch website directly)
|
158 |
-
python main.py --mode benchmark --live --models gpt-4o
|
159 |
-
""",
|
160 |
-
)
|
161 |
-
|
162 |
parser.add_argument(
|
163 |
"--mode",
|
164 |
-
choices=["
|
165 |
-
default="
|
166 |
-
help="Operation mode",
|
167 |
)
|
168 |
-
|
169 |
-
# Interactive mode options
|
170 |
parser.add_argument(
|
171 |
"--model",
|
172 |
choices=list(MODELS_CONFIG.keys()),
|
173 |
default="gpt-4o",
|
174 |
-
help="Model
|
175 |
-
)
|
176 |
-
parser.add_argument(
|
177 |
-
"--turns", type=int, default=5, help="Number of turns in interactive mode"
|
178 |
-
)
|
179 |
-
parser.add_argument(
|
180 |
-
"--plot", action="store_true", help="Generate plots of predictions"
|
181 |
)
|
182 |
-
|
183 |
-
# Data collection options
|
184 |
parser.add_argument(
|
185 |
-
"--
|
186 |
)
|
187 |
parser.add_argument(
|
188 |
-
"--
|
|
|
|
|
|
|
189 |
)
|
190 |
-
parser.add_argument("--no-indoor", action="store_true", help="Exclude indoor views")
|
191 |
parser.add_argument(
|
192 |
-
"--
|
193 |
-
nargs="+",
|
194 |
-
help="Specific countries to collect from (e.g., us gb jp)",
|
195 |
)
|
196 |
-
|
197 |
-
# Benchmark options
|
198 |
parser.add_argument(
|
199 |
"--models",
|
200 |
nargs="+",
|
201 |
choices=list(MODELS_CONFIG.keys()),
|
202 |
-
help="Models to benchmark",
|
203 |
-
)
|
204 |
-
parser.add_argument(
|
205 |
-
"--live", action="store_true", help="Use live MapCrunch website for benchmark"
|
206 |
-
)
|
207 |
-
|
208 |
-
# General options
|
209 |
-
parser.add_argument(
|
210 |
-
"--headless", action="store_true", help="Run browser in headless mode"
|
211 |
)
|
212 |
|
213 |
args = parser.parse_args()
|
214 |
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
if args.urban:
|
229 |
-
options["urban_only"] = True
|
230 |
-
if args.no_indoor:
|
231 |
-
options["exclude_indoor"] = True
|
232 |
-
if args.countries:
|
233 |
-
options["selected_countries"] = args.countries
|
234 |
-
|
235 |
-
data_collection_mode(
|
236 |
-
samples=args.samples, headless=args.headless, options=options
|
237 |
-
)
|
238 |
-
|
239 |
-
elif args.mode == "benchmark":
|
240 |
-
benchmark_mode(
|
241 |
-
models=args.models,
|
242 |
-
samples=args.samples,
|
243 |
-
live=args.live,
|
244 |
-
headless=args.headless,
|
245 |
-
)
|
246 |
-
|
247 |
-
except KeyboardInterrupt:
|
248 |
-
print(f"\n⏹️ Operation interrupted by user")
|
249 |
-
except Exception as e:
|
250 |
-
print(f"❌ Error: {e}")
|
251 |
-
raise
|
252 |
|
253 |
|
254 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import argparse
|
2 |
+
import json
|
3 |
+
import random
|
4 |
+
from typing import Dict, Optional, List
|
5 |
|
6 |
from langchain_openai import ChatOpenAI
|
7 |
from langchain_anthropic import ChatAnthropic
|
8 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
9 |
|
10 |
from geo_bot import GeoBot
|
|
|
11 |
from benchmark import MapGuesserBenchmark
|
12 |
+
from config import MODELS_CONFIG, DATA_PATHS, SUCCESS_THRESHOLD_KM
|
13 |
|
14 |
|
15 |
+
def agent_mode(model_name: str, steps: int, headless: bool, samples: int):
|
16 |
+
"""
|
17 |
+
Runs the AI Agent in a benchmark loop over multiple samples,
|
18 |
+
using multi-step exploration for each.
|
19 |
+
"""
|
20 |
+
print(
|
21 |
+
f"Starting Agent Mode (as a benchmark): model={model_name}, steps={steps}, samples={samples}"
|
22 |
+
)
|
23 |
|
24 |
+
try:
|
25 |
+
with open(DATA_PATHS["golden_labels"], "r", encoding="utf-8") as f:
|
26 |
+
golden_labels = json.load(f).get("samples", [])
|
27 |
+
except FileNotFoundError:
|
28 |
+
print(f"Error: Golden labels file not found at {DATA_PATHS['golden_labels']}.")
|
29 |
return
|
30 |
|
31 |
+
if not golden_labels:
|
32 |
+
print("Error: No samples found in golden_labels.json.")
|
33 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
+
num_to_test = min(samples, len(golden_labels))
|
36 |
+
test_samples = golden_labels[:num_to_test]
|
37 |
+
print(f"Will run on {len(test_samples)} samples.")
|
38 |
|
39 |
+
config = MODELS_CONFIG.get(model_name)
|
40 |
+
model_class = globals()[config["class"]]
|
41 |
+
model_instance_name = config["model_name"]
|
42 |
+
|
43 |
+
benchmark_helper = MapGuesserBenchmark(headless=True)
|
44 |
+
all_results = []
|
45 |
+
|
46 |
+
with GeoBot(
|
47 |
+
model=model_class, model_name=model_instance_name, headless=headless
|
48 |
+
) as bot:
|
49 |
+
for i, sample in enumerate(test_samples):
|
50 |
+
print(
|
51 |
+
f"\n--- Running Sample {i + 1}/{len(test_samples)} (ID: {sample.get('id')}) ---"
|
52 |
+
)
|
53 |
|
54 |
+
# **FIXED**: Correct sequence: Load Data -> Clean Environment -> Run Loop
|
55 |
+
if not bot.controller.load_location_from_data(sample):
|
56 |
+
print(
|
57 |
+
f" ❌ Failed to load location for sample {sample.get('id')}. Skipping."
|
58 |
+
)
|
59 |
+
continue
|
60 |
|
61 |
+
bot.controller.setup_clean_environment()
|
|
|
|
|
62 |
|
63 |
+
final_guess = bot.run_agent_loop(max_steps=steps)
|
64 |
|
65 |
+
true_coords = {"lat": sample.get("lat"), "lng": sample.get("lng")}
|
66 |
+
distance_km = None
|
67 |
+
is_success = False
|
|
|
|
|
|
|
68 |
|
69 |
+
if final_guess:
|
70 |
+
distance_km = benchmark_helper.calculate_distance(
|
71 |
+
true_coords, final_guess
|
72 |
+
)
|
73 |
+
if distance_km is not None:
|
74 |
+
is_success = distance_km <= SUCCESS_THRESHOLD_KM
|
75 |
|
76 |
+
print(f"\nResult for Sample ID: {sample.get('id')}")
|
77 |
+
print(
|
78 |
+
f" Ground Truth: Lat={true_coords['lat']:.4f}, Lon={true_coords['lng']:.4f}"
|
79 |
+
)
|
80 |
+
print(
|
81 |
+
f" Final Guess: Lat={final_guess[0]:.4f}, Lon={final_guess[1]:.4f}"
|
82 |
+
)
|
83 |
+
dist_str = f"{distance_km:.1f} km" if distance_km is not None else "N/A"
|
84 |
+
print(f" Distance: {dist_str}, Success: {is_success}")
|
85 |
+
else:
|
86 |
+
print("Agent did not make a final guess for this sample.")
|
87 |
+
|
88 |
+
all_results.append(
|
89 |
+
{
|
90 |
+
"sample_id": sample.get("id"),
|
91 |
+
"model": bot.model_name,
|
92 |
+
"true_coordinates": true_coords,
|
93 |
+
"predicted_coordinates": final_guess,
|
94 |
+
"distance_km": distance_km,
|
95 |
+
"success": is_success,
|
96 |
+
}
|
97 |
+
)
|
98 |
|
99 |
+
summary = benchmark_helper.generate_summary(all_results)
|
100 |
+
if summary:
|
101 |
+
print("\n\n--- Agent Benchmark Complete! Summary ---")
|
102 |
+
for model, stats in summary.items():
|
103 |
+
print(f"Model: {model}")
|
104 |
+
print(f" Success Rate: {stats['success_rate'] * 100:.1f}%")
|
105 |
+
print(f" Avg Distance: {stats['average_distance_km']:.1f} km")
|
106 |
|
107 |
+
print("\nAgent Mode finished.")
|
108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
+
def benchmark_mode(models: list, samples: int, headless: bool):
|
111 |
+
"""Runs the benchmark on pre-collected data."""
|
112 |
+
print(f"Starting Benchmark Mode: models={models}, samples={samples}")
|
113 |
+
benchmark = MapGuesserBenchmark(headless=headless)
|
114 |
+
summary = benchmark.run_benchmark(models=models, max_samples=samples)
|
115 |
+
if summary:
|
116 |
+
print("\n--- Benchmark Complete! Summary ---")
|
117 |
+
for model, stats in summary.items():
|
118 |
+
print(f"Model: {model}")
|
119 |
+
print(f" Success Rate: {stats['success_rate'] * 100:.1f}%")
|
120 |
+
print(f" Avg Distance: {stats['average_distance_km']:.1f} km")
|
121 |
|
122 |
|
123 |
def main():
|
124 |
+
parser = argparse.ArgumentParser(description="MapCrunch AI Agent & Benchmark")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
parser.add_argument(
|
126 |
"--mode",
|
127 |
+
choices=["agent", "benchmark"],
|
128 |
+
default="agent",
|
129 |
+
help="Operation mode.",
|
130 |
)
|
|
|
|
|
131 |
parser.add_argument(
|
132 |
"--model",
|
133 |
choices=list(MODELS_CONFIG.keys()),
|
134 |
default="gpt-4o",
|
135 |
+
help="Model to use.",
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
)
|
|
|
|
|
137 |
parser.add_argument(
|
138 |
+
"--steps", type=int, default=10, help="[Agent] Number of exploration steps."
|
139 |
)
|
140 |
parser.add_argument(
|
141 |
+
"--samples",
|
142 |
+
type=int,
|
143 |
+
default=50,
|
144 |
+
help="Number of samples to process for the selected mode.",
|
145 |
)
|
|
|
146 |
parser.add_argument(
|
147 |
+
"--headless", action="store_true", help="Run browser in headless mode."
|
|
|
|
|
148 |
)
|
|
|
|
|
149 |
parser.add_argument(
|
150 |
"--models",
|
151 |
nargs="+",
|
152 |
choices=list(MODELS_CONFIG.keys()),
|
153 |
+
help="[Benchmark] Models to benchmark.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
)
|
155 |
|
156 |
args = parser.parse_args()
|
157 |
|
158 |
+
if args.mode == "agent":
|
159 |
+
agent_mode(
|
160 |
+
model_name=args.model,
|
161 |
+
steps=args.steps,
|
162 |
+
headless=args.headless,
|
163 |
+
samples=args.samples,
|
164 |
+
)
|
165 |
+
elif args.mode == "benchmark":
|
166 |
+
benchmark_mode(
|
167 |
+
models=args.models or [args.model],
|
168 |
+
samples=args.samples,
|
169 |
+
headless=args.headless,
|
170 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
|
172 |
|
173 |
if __name__ == "__main__":
|
mapcrunch_controller.py
CHANGED
@@ -1,14 +1,12 @@
|
|
1 |
-
|
|
|
2 |
|
3 |
from selenium import webdriver
|
4 |
-
from selenium.webdriver.common.by import By
|
5 |
from selenium.webdriver.support.ui import WebDriverWait
|
6 |
from selenium.webdriver.support import expected_conditions as EC
|
7 |
-
from selenium.common.
|
8 |
-
from typing import Dict, Optional
|
9 |
-
import time
|
10 |
|
11 |
-
from config import MAPCRUNCH_URL, SELECTORS, DATA_COLLECTION_CONFIG
|
12 |
|
13 |
|
14 |
class MapCrunchController:
|
@@ -17,155 +15,126 @@ class MapCrunchController:
|
|
17 |
if headless:
|
18 |
options.add_argument("--headless")
|
19 |
options.add_argument("--window-size=1920,1080")
|
20 |
-
options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
21 |
self.driver = webdriver.Chrome(options=options)
|
22 |
self.wait = WebDriverWait(self.driver, 10)
|
23 |
self.driver.get(MAPCRUNCH_URL)
|
24 |
time.sleep(3)
|
25 |
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
)
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
).
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
)
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
)
|
55 |
-
if options.get("exclude_indoor", True) == indoor_checkbox.is_selected():
|
56 |
-
indoor_checkbox.click()
|
57 |
-
print(
|
58 |
-
f"✅ Indoor views excluded: {options.get('exclude_indoor', True)}"
|
59 |
-
)
|
60 |
-
|
61 |
-
# 关闭面板
|
62 |
-
options_button.click()
|
63 |
-
time.sleep(0.5)
|
64 |
-
print("✅ Collection options configured.")
|
65 |
-
return True
|
66 |
-
except Exception as e:
|
67 |
-
print(f"❌ Error configuring options: {e}")
|
68 |
-
return False
|
69 |
|
70 |
-
# ... 其他所有函数 (click_go_button, get_live_location_identifiers, 等) 保持我们上一版的最终形态,无需改动 ...
|
71 |
def click_go_button(self) -> bool:
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
time.sleep(DATA_COLLECTION_CONFIG.get("wait_after_go", 5))
|
78 |
-
return True
|
79 |
-
except Exception as e:
|
80 |
-
print(f"❌ Error clicking Go button: {e}")
|
81 |
-
return False
|
82 |
-
|
83 |
-
def get_live_location_identifiers(self) -> Dict:
|
84 |
-
try:
|
85 |
-
return self.driver.execute_script("""
|
86 |
-
try {
|
87 |
-
const pov = window.panorama.getPov();
|
88 |
-
return {
|
89 |
-
panoId: window.panorama ? window.panorama.getPano() : null,
|
90 |
-
pov: { heading: pov.heading, pitch: pov.pitch, zoom: pov.zoom }
|
91 |
-
};
|
92 |
-
} catch (e) { return { error: e.toString() }; }
|
93 |
-
""")
|
94 |
-
except Exception as e:
|
95 |
-
print(f"❌ Error getting live identifiers via JS: {e}")
|
96 |
-
return {}
|
97 |
-
|
98 |
-
def get_current_address(self) -> Optional[str]:
|
99 |
-
try:
|
100 |
-
address_element = self.wait.until(
|
101 |
-
EC.visibility_of_element_located(
|
102 |
-
(By.CSS_SELECTOR, SELECTORS["address_element"])
|
103 |
-
)
|
104 |
-
)
|
105 |
-
return address_element.get_attribute("title") or address_element.text
|
106 |
-
except TimeoutException:
|
107 |
-
return "Address not found"
|
108 |
|
109 |
-
def
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
)
|
114 |
-
|
115 |
-
|
116 |
-
elementsToHide.forEach(sel => { const el = document.querySelector(sel); if (el) el.style.display = 'none'; });
|
117 |
-
const panoBox = document.querySelector('#pano-box'); if (panoBox) panoBox.style.height = '100vh';
|
118 |
-
""")
|
119 |
-
except Exception as e:
|
120 |
-
print(f"⚠️ Warning: Could not fully configure clean environment: {e}")
|
121 |
|
122 |
def load_location_from_data(self, location_data: Dict) -> bool:
|
123 |
-
"""
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
pov = location_data.get("pov")
|
130 |
-
|
131 |
-
# 策略B:优先尝试通过JS直接设置场景,速度最快
|
132 |
-
if pano_id and pov:
|
133 |
-
# print(f"✅ Loading location via JS Call: PanoID {pano_id[:10]}...")
|
134 |
-
self.driver.execute_script(
|
135 |
-
"window.panorama.setPano(arguments[0]);"
|
136 |
-
"window.panorama.setPov(arguments[1]);",
|
137 |
-
pano_id,
|
138 |
-
pov,
|
139 |
-
)
|
140 |
-
time.sleep(2) # 等待新瓦片图加载
|
141 |
-
return True
|
142 |
-
|
143 |
-
# 策略A:如果数据不完整,回退到URL加载的方式
|
144 |
-
url_slug = location_data.get("url_slug")
|
145 |
-
if url_slug:
|
146 |
-
url_to_load = f"{MAPCRUNCH_URL}/p/{url_slug}"
|
147 |
-
print(f"⚠️ JS load failed, falling back to URL Slug: {url_to_load}")
|
148 |
-
self.driver.get(url_to_load)
|
149 |
-
time.sleep(4)
|
150 |
-
return True
|
151 |
-
|
152 |
-
print("❌ Cannot load location: No valid pano_id/pov or url_slug in data.")
|
153 |
-
return False
|
154 |
-
|
155 |
-
except Exception as e:
|
156 |
-
print(f"❌ Error loading location: {e}")
|
157 |
-
return False
|
158 |
-
|
159 |
-
def take_street_view_screenshot(self) -> Optional[bytes]:
|
160 |
-
try:
|
161 |
-
pano_element = self.wait.until(
|
162 |
-
EC.presence_of_element_located(
|
163 |
-
(By.CSS_SELECTOR, SELECTORS["pano_container"])
|
164 |
-
)
|
165 |
)
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
|
170 |
def close(self):
|
171 |
if self.driver:
|
|
|
1 |
+
import time
|
2 |
+
from typing import Dict, Optional, List
|
3 |
|
4 |
from selenium import webdriver
|
|
|
5 |
from selenium.webdriver.support.ui import WebDriverWait
|
6 |
from selenium.webdriver.support import expected_conditions as EC
|
7 |
+
from selenium.webdriver.common.by import By
|
|
|
|
|
8 |
|
9 |
+
from config import MAPCRUNCH_URL, SELECTORS, DATA_COLLECTION_CONFIG
|
10 |
|
11 |
|
12 |
class MapCrunchController:
|
|
|
15 |
if headless:
|
16 |
options.add_argument("--headless")
|
17 |
options.add_argument("--window-size=1920,1080")
|
|
|
18 |
self.driver = webdriver.Chrome(options=options)
|
19 |
self.wait = WebDriverWait(self.driver, 10)
|
20 |
self.driver.get(MAPCRUNCH_URL)
|
21 |
time.sleep(3)
|
22 |
|
23 |
+
def setup_clean_environment(self):
|
24 |
+
"""
|
25 |
+
Minimal environment setup using hideLoc() and hiding major UI.
|
26 |
+
"""
|
27 |
+
self.driver.execute_script("if(typeof hideLoc === 'function') hideLoc();")
|
28 |
+
self.driver.execute_script("""
|
29 |
+
const topBar = document.querySelector('#topbar');
|
30 |
+
if (topBar) topBar.style.display = 'none';
|
31 |
+
|
32 |
+
const bottomBox = document.querySelector('#bottom-box');
|
33 |
+
if (bottomBox) bottomBox.style.display = 'none';
|
34 |
+
|
35 |
+
const infoFirstView = document.querySelector('#info-firstview');
|
36 |
+
if (infoFirstView) infoFirstView.style.display = 'none';
|
37 |
+
""")
|
38 |
+
|
39 |
+
def get_available_actions(self) -> List[str]:
|
40 |
+
"""
|
41 |
+
Checks for movement links via JavaScript.
|
42 |
+
FIXED: Removed PAN_UP and PAN_DOWN as they are not very useful.
|
43 |
+
"""
|
44 |
+
base_actions = ["PAN_LEFT", "PAN_RIGHT", "GUESS"]
|
45 |
+
links = self.driver.execute_script("return window.panorama.getLinks();")
|
46 |
+
if links and len(links) > 0:
|
47 |
+
base_actions.extend(["MOVE_FORWARD", "MOVE_BACKWARD"])
|
48 |
+
return base_actions
|
49 |
+
|
50 |
+
def pan_view(self, direction: str, degrees: int = 45):
|
51 |
+
"""Pans the view using a direct JS call."""
|
52 |
+
pov = self.driver.execute_script("return window.panorama.getPov();")
|
53 |
+
if direction == "left":
|
54 |
+
pov["heading"] -= degrees
|
55 |
+
elif direction == "right":
|
56 |
+
pov["heading"] += degrees
|
57 |
+
# UP/DOWN panning logic removed as actions are no longer available.
|
58 |
+
self.driver.execute_script("window.panorama.setPov(arguments[0]);", pov)
|
59 |
+
time.sleep(0.5)
|
60 |
+
|
61 |
+
def move(self, direction: str):
|
62 |
+
"""Moves by finding the best panorama link and setting it via JS."""
|
63 |
+
pov = self.driver.execute_script("return window.panorama.getPov();")
|
64 |
+
links = self.driver.execute_script("return window.panorama.getLinks();")
|
65 |
+
if not links:
|
66 |
+
return
|
67 |
+
|
68 |
+
current_heading = pov["heading"]
|
69 |
+
best_link = None
|
70 |
+
|
71 |
+
if direction == "forward":
|
72 |
+
min_diff = 360
|
73 |
+
for link in links:
|
74 |
+
diff = 180 - abs(abs(link["heading"] - current_heading) - 180)
|
75 |
+
if diff < min_diff:
|
76 |
+
min_diff = diff
|
77 |
+
best_link = link
|
78 |
+
elif direction == "backward":
|
79 |
+
target_heading = (current_heading + 180) % 360
|
80 |
+
min_diff = 360
|
81 |
+
for link in links:
|
82 |
+
diff = 180 - abs(abs(link["heading"] - target_heading) - 180)
|
83 |
+
if diff < min_diff:
|
84 |
+
min_diff = diff
|
85 |
+
best_link = link
|
86 |
+
|
87 |
+
if best_link:
|
88 |
+
self.driver.execute_script(
|
89 |
+
"window.panorama.setPano(arguments[0]);", best_link["pano"]
|
90 |
)
|
91 |
+
time.sleep(2.5)
|
92 |
+
|
93 |
+
# ... the remaining methods are unchanged ...
|
94 |
+
def select_map_location_and_guess(self, lat: float, lon: float):
    """Confirm a guess by re-showing the bottom bar and clicking Go/confirm.

    NOTE(review): *lat* and *lon* are currently unused — this method does not
    place a pin on the guess map; it only clicks through the confirmation
    buttons. Verify against the caller whether the pin is set elsewhere.
    """
    # Re-show the bottom UI bar that setup_clean_environment() hides.
    self.driver.execute_script(
        "document.querySelector('#bottom-box').style.display = 'block';"
    )
    self.wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, SELECTORS["go_button"]))
    ).click()
    time.sleep(0.5)
    # FIX: "confirm_button" was removed from SELECTORS in config.py, so a
    # direct SELECTORS["confirm_button"] lookup raises KeyError. Fall back to
    # the literal selector the old config used.
    confirm_selector = SELECTORS.get("confirm_button", "#confirm-button")
    self.wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, confirm_selector))
    ).click()
    time.sleep(3)
|
107 |
|
108 |
+
def get_ground_truth_location(self) -> Optional[Dict[str, float]]:
|
109 |
+
"""Directly gets location from JS object."""
|
110 |
+
return self.driver.execute_script("return window.loc;")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
|
|
112 |
def click_go_button(self) -> bool:
    """Click the Go button to load a new random location.

    Returns True after the post-click wait; any Selenium error propagates
    to the caller.
    """
    locator = (By.CSS_SELECTOR, SELECTORS["go_button"])
    button = self.wait.until(EC.element_to_be_clickable(locator))
    button.click()
    # Give the new panorama time to load before the caller interacts with it.
    time.sleep(DATA_COLLECTION_CONFIG.get("wait_after_go", 3))
    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
+
def take_street_view_screenshot(self) -> Optional[bytes]:
    """Capture the street-view panorama element as PNG bytes."""
    locator = (By.CSS_SELECTOR, SELECTORS["pano_container"])
    pano = self.wait.until(EC.presence_of_element_located(locator))
    return pano.screenshot_as_png
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
def load_location_from_data(self, location_data: Dict) -> bool:
|
128 |
+
pano_id, pov = location_data.get("pano_id"), location_data.get("pov")
|
129 |
+
if pano_id and pov:
|
130 |
+
self.driver.execute_script(
|
131 |
+
"window.panorama.setPano(arguments[0]); window.panorama.setPov(arguments[1]);",
|
132 |
+
pano_id,
|
133 |
+
pov,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
)
|
135 |
+
time.sleep(2)
|
136 |
+
return True
|
137 |
+
return False
|
138 |
|
139 |
def close(self):
|
140 |
if self.driver:
|