Spaces:

Omniscient001
/

Omniscient

Running

App Files Files Community

LRU1 committed 29 days ago

Commit

1a948ca

1 Parent(s): 772ca75

add test mode to huggingface UI

Browse files

add test mode

Update config.py

add test mode

add test mode

Files changed (5) hide show

app.py +307 -207
config.py +2 -2
experiment_runner.py +0 -0
geo_bot.py +165 -0
mapcrunch_controller.py +10 -0

app.py CHANGED Viewed

@@ -2,6 +2,8 @@ import streamlit as st
 import json
 import os
 import time
 import re
 from pathlib import Path
@@ -67,7 +69,7 @@ with st.sidebar:
     st.header("Configuration")
     # Mode selection
-    mode = st.radio("Mode", ["Dataset Mode", "Online Mode"], index=0)
     if mode == "Dataset Mode":
         # Get available datasets and ensure we have a valid default
@@ -114,6 +116,43 @@ with st.sidebar:
         num_samples = st.slider(
             "Samples to Test", 1, len(golden_labels), min(3, len(golden_labels))
         )
     else:  # Online Mode
         st.info("Enter a URL to analyze a specific location")
@@ -211,221 +250,282 @@ with st.sidebar:
             help="Controls randomness in AI responses. 0.0 = deterministic, higher = more creative",
         )
     start_button = st.button("🚀 Start", type="primary")
 # Main Logic
 if start_button:
-    test_samples = golden_labels[:num_samples]
-    config = MODELS_CONFIG[model_choice]
-    model_class = get_model_class(config["class"])
-    benchmark_helper = MapGuesserBenchmark(
-        dataset_name=dataset_choice if mode == "Dataset Mode" else "online"
-    )
-    all_results = []
-    progress_bar = st.progress(0)
-    with GeoBot(
-        model=model_class,
-        model_name=config["model_name"],
-        headless=True,
-        temperature=temperature,
-    ) as bot:
-        for i, sample in enumerate(test_samples):
-            st.divider()
-            st.header(f"Sample {i + 1}/{num_samples}")
-            if mode == "Online Mode":
-                # Load the MapCrunch URL directly
-                bot.controller.load_url(sample["url"])
-            else:
-                # Load from dataset as before
-                bot.controller.load_location_from_data(sample)
-            bot.controller.setup_clean_environment()
-            # Create containers for UI updates
-            sample_container = st.container()
-            # Initialize UI state for this sample
-            step_containers = {}
-            sample_steps_data = []
-            def ui_step_callback(step_info):
-                """Callback function to update UI after each step"""
-                step_num = step_info["step_num"]
-                # Store step data
-                sample_steps_data.append(step_info)
-                with sample_container:
-                    # Create step container if it doesn't exist
-                    if step_num not in step_containers:
-                        step_containers[step_num] = st.container()
-                    with step_containers[step_num]:
-                        st.subheader(f"Step {step_num}/{step_info['max_steps']}")
-                        col1, col2 = st.columns([1, 2])
-                        with col1:
-                            # Display screenshot
-                            st.image(
-                                step_info["screenshot_bytes"],
-                                caption=f"What AI sees - Step {step_num}",
-                                use_column_width=True,
-                            )
-                        with col2:
-                            # Show available actions
-                            st.write("**Available Actions:**")
-                            st.code(
-                                json.dumps(step_info["available_actions"], indent=2)
-                            )
-                            # Show history context - use the history from step_info
-                            current_history = step_info.get("history", [])
-                            history_text = bot.generate_history_text(current_history)
-                            st.write("**AI Context:**")
-                            st.text_area(
-                                "History",
-                                history_text,
-                                height=100,
-                                disabled=True,
-                                key=f"history_{i}_{step_num}",
-                            )
-                            # Show AI reasoning and action
-                            action = step_info.get("action_details", {}).get(
-                                "action", "N/A"
-                            )
-                            if step_info.get("is_final_step") and action != "GUESS":
-                                st.warning("Max steps reached. Forcing GUESS.")
-                            st.write("**AI Reasoning:**")
-                            st.info(step_info.get("reasoning", "N/A"))
-                            if step_info.get("debug_message") != "N/A":
-                                st.write("**AI Debug Message:**")
-                                st.code(step_info.get("debug_message"), language="json")
-                            st.write("**AI Action:**")
-                            if action == "GUESS":
-                                lat = step_info.get("action_details", {}).get("lat")
-                                lon = step_info.get("action_details", {}).get("lon")
-                                st.success(f"`{action}` - {lat:.4f}, {lon:.4f}")
-                            else:
-                                st.success(f"`{action}`")
-                            # Show decision details for debugging
-                            with st.expander("Decision Details"):
-                                decision_data = {
-                                    "reasoning": step_info.get("reasoning"),
-                                    "action_details": step_info.get("action_details"),
-                                    "remaining_steps": step_info.get("remaining_steps"),
-                                }
-                                st.json(decision_data)
-                # Force UI refresh
-                time.sleep(0.5)  # Small delay to ensure UI updates are visible
-            # Run the agent loop with UI callback
-            try:
-                final_guess = bot.run_agent_loop(
-                    max_steps=steps_per_sample, step_callback=ui_step_callback
-                )
-            except Exception as e:
-                st.error(f"Error during agent execution: {e}")
-                final_guess = None
-            # Sample Results
-            with sample_container:
-                st.subheader("Sample Result")
-                true_coords = {"lat": sample.get("lat"), "lng": sample.get("lng")}
-                distance_km = None
-                is_success = False
-                if final_guess:
-                    distance_km = benchmark_helper.calculate_distance(
-                        true_coords, final_guess
-                    )
-                    if distance_km is not None:
-                        is_success = distance_km <= SUCCESS_THRESHOLD_KM
-                    col1, col2, col3 = st.columns(3)
-                    col1.metric(
-                        "Final Guess", f"{final_guess[0]:.3f}, {final_guess[1]:.3f}"
-                    )
-                    col2.metric(
-                        "Ground Truth",
-                        f"{true_coords['lat']:.3f}, {true_coords['lng']:.3f}",
                     )
-                    col3.metric(
-                        "Distance",
-                        f"{distance_km:.1f} km",
-                        delta="Success" if is_success else "Failed",
                     )
-                else:
-                    st.error("No final guess made")
-                all_results.append(
-                    {
-                        "sample_id": sample.get("id"),
-                        "model": model_choice,
-                        "steps_taken": len(sample_steps_data),
-                        "max_steps": steps_per_sample,
-                        "temperature": temperature,
-                        "true_coordinates": true_coords,
-                        "predicted_coordinates": final_guess,
-                        "distance_km": distance_km,
-                        "success": is_success,
-                    }
-                )
-            progress_bar.progress((i + 1) / num_samples)
-    # Final Summary
-    st.divider()
-    st.header("🏁 Final Results")
-    # Calculate summary stats
-    successes = [r for r in all_results if r["success"]]
-    success_rate = len(successes) / len(all_results) if all_results else 0
-    valid_distances = [
-        r["distance_km"] for r in all_results if r["distance_km"] is not None
-    ]
-    avg_distance = sum(valid_distances) / len(valid_distances) if valid_distances else 0
-    # Overall metrics
-    col1, col2, col3 = st.columns(3)
-    col1.metric("Success Rate", f"{success_rate * 100:.1f}%")
-    col2.metric("Average Distance", f"{avg_distance:.1f} km")
-    col3.metric("Total Samples", len(all_results))
-    # Detailed results table
-    st.subheader("Detailed Results")
-    st.dataframe(all_results, use_container_width=True)
-    # Success/failure breakdown
-    if successes:
-        st.subheader("✅ Successful Samples")
-        st.dataframe(successes, use_container_width=True)
-    failures = [r for r in all_results if not r["success"]]
-    if failures:
-        st.subheader("❌ Failed Samples")
-        st.dataframe(failures, use_container_width=True)
-    # Export functionality
-    if st.button("💾 Export Results"):
-        results_json = json.dumps(all_results, indent=2)
-        st.download_button(
-            label="Download results.json",
-            data=results_json,
-            file_name=f"geo_results_{dataset_choice}_{model_choice}_{num_samples}samples.json",
-            mime="application/json",
-        )
 def handle_tab_completion():

 import json
 import os
 import time
+import pandas as pd
+import altair as alt
 import re
 from pathlib import Path
     st.header("Configuration")
     # Mode selection
+    mode = st.radio("Mode", ["Dataset Mode", "Online Mode", "Test Mode"], index=0)
     if mode == "Dataset Mode":
         # Get available datasets and ensure we have a valid default
         num_samples = st.slider(
             "Samples to Test", 1, len(golden_labels), min(3, len(golden_labels))
         )
+    elif mode == "Test Mode":
+        st.info("🔬 Multi-Model Benchmark Testing")
+        available_datasets = get_available_datasets()
+        dataset_choice = st.selectbox("Dataset", available_datasets, index=0)
+        selected_models = st.multiselect(
+            "Select Models to Compare",
+            list(MODELS_CONFIG.keys()),
+            default=[DEFAULT_MODEL],
+        )
+        if not selected_models:
+            st.warning("Please select at least one model to run the test.")
+            st.stop()
+        steps_per_sample = st.slider("Max Steps", 1, 50, 10)
+        temperature = st.slider(
+            "Temperature",
+            0.0,
+            2.0,
+            DEFAULT_TEMPERATURE,
+            0.1,
+            help="Controls randomness in AI responses. 0.0 = deterministic, higher = more creative",
+        )
+        # load dataset
+        data_paths = get_data_paths(dataset_choice)
+        try:
+            with open(data_paths["golden_labels"], "r") as f:
+                golden_labels = json.load(f).get("samples", [])
+            st.success(f"Dataset '{dataset_choice}' loaded with {len(golden_labels)} samples")
+        except Exception as e:
+            st.error(f"Error loading dataset '{dataset_choice}': {str(e)}")
+            st.stop()
+        num_samples = st.slider("Samples per Run", 1, len(golden_labels), min(10, len(golden_labels)))
+        runs_per_model = st.slider("Runs per Model", 1, 10, 5)
     else:  # Online Mode
         st.info("Enter a URL to analyze a specific location")
             help="Controls randomness in AI responses. 0.0 = deterministic, higher = more creative",
         )
+    # common start button
     start_button = st.button("🚀 Start", type="primary")
 # Main Logic
 if start_button:
+    if mode == "Test Mode":
+        benchmark_helper = MapGuesserBenchmark(dataset_name=dataset_choice)
+        summary_by_step = {}
+        progress_bar = st.progress(0)
+        for mi, model_name in enumerate(selected_models):
+            st.header(f"Model: {model_name}")
+            config = MODELS_CONFIG[model_name]
+            model_class = get_model_class(config["class"])
+            successes_per_step = [0]*steps_per_sample
+            total_iterations = runs_per_model * num_samples
+            model_bar = st.progress(0, text=f"Running {model_name}")
+            iteration_counter = 0
+            for run_idx in range(runs_per_model):
+                with GeoBot(model=model_class, model_name=config["model_name"], headless=True, temperature=temperature) as bot:
+                    for si, sample in enumerate(golden_labels[:num_samples]):
+                        if not bot.controller.load_location_from_data(sample):
+                            iteration_counter += 1
+                            model_bar.progress(iteration_counter/total_iterations)
+                            continue
+                        predictions = bot.test_run_agent_loop(max_steps=steps_per_sample)
+                        true_coords = {"lat": sample["lat"], "lng": sample["lng"]}
+                        for step_idx, pred in enumerate(predictions):
+                            if isinstance(pred, dict) and "lat" in pred:
+                                dist = benchmark_helper.calculate_distance(true_coords, (pred["lat"], pred["lon"]))
+                                if dist is not None and dist <= SUCCESS_THRESHOLD_KM:
+                                    successes_per_step[step_idx] += 1
+                        iteration_counter += 1
+                        model_bar.progress(iteration_counter/total_iterations)
+            # calculate accuracy per step
+            acc_per_step = [s/(num_samples*runs_per_model) for s in successes_per_step]
+            summary_by_step[model_name] = acc_per_step
+            progress_bar.progress((mi+1)/len(selected_models))
+        # plot
+        st.subheader("Accuracy vs Steps")
+        # summary_by_step {model: [acc_step1, acc_step2, ...]}
+        df_wide = pd.DataFrame(summary_by_step)
+        df_long = (
+            df_wide
+            .reset_index(names="Step")
+            .melt(id_vars="Step", var_name="Model", value_name="Accuracy")
+        )
+        chart = (
+            alt.Chart(df_long)
+            .mark_line(point=True)
+            .encode(
+                x=alt.X("Step:O", title="Step #"),
+                y=alt.Y("Accuracy:Q", title="Accuracy", scale=alt.Scale(domain=[0, 1])),
+                color=alt.Color("Model:N", title="Model"),
+                tooltip=["Model:N", "Step:O", alt.Tooltip("Accuracy:Q", format=".2%")],
+            )
+            .properties(width=700, height=400)
+        )
+        st.altair_chart(chart, use_container_width=True)
+        st.stop()
+    else:
+        test_samples = golden_labels[:num_samples]
+        config = MODELS_CONFIG[model_choice]
+        model_class = get_model_class(config["class"])
+        benchmark_helper = MapGuesserBenchmark(
+            dataset_name=dataset_choice if mode == "Dataset Mode" else "online"
+        )
+        all_results = []
+        progress_bar = st.progress(0)
+        with GeoBot(
+            model=model_class,
+            model_name=config["model_name"],
+            headless=True,
+            temperature=temperature,
+        ) as bot:
+            for i, sample in enumerate(test_samples):
+                st.divider()
+                st.header(f"Sample {i + 1}/{num_samples}")
+                if mode == "Online Mode":
+                    # Load the MapCrunch URL directly
+                    bot.controller.load_url(sample["url"])
+                else:
+                    # Load from dataset as before
+                    bot.controller.load_location_from_data(sample)
+                bot.controller.setup_clean_environment()
+                # Create containers for UI updates
+                sample_container = st.container()
+                # Initialize UI state for this sample
+                step_containers = {}
+                sample_steps_data = []
+                def ui_step_callback(step_info):
+                    """
+                    Render one agent step into the current sample's container.
+
+                    Appends step_info to sample_steps_data, then draws the
+                    screenshot, the available actions, the history text, the
+                    reasoning, an optional debug message and the chosen action.
+                    A GUESS whose coordinates are missing or non-numeric is
+                    reported with a warning instead of raising while formatting.
+                    """
+                    step_num = step_info["step_num"]
+                    # Store step data
+                    sample_steps_data.append(step_info)
+                    with sample_container:
+                        # Create step container if it doesn't exist
+                        if step_num not in step_containers:
+                            step_containers[step_num] = st.container()
+                        with step_containers[step_num]:
+                            st.subheader(f"Step {step_num}/{step_info['max_steps']}")
+                            col1, col2 = st.columns([1, 2])
+                            with col1:
+                                # Display screenshot
+                                st.image(
+                                    step_info["screenshot_bytes"],
+                                    caption=f"What AI sees - Step {step_num}",
+                                    use_column_width=True,
+                                )
+                            with col2:
+                                # Show available actions
+                                st.write("**Available Actions:**")
+                                st.code(
+                                    json.dumps(step_info["available_actions"], indent=2)
+                                )
+                                # Show history context - use the history from step_info
+                                current_history = step_info.get("history", [])
+                                history_text = bot.generate_history_text(current_history)
+                                st.write("**AI Context:**")
+                                st.text_area(
+                                    "History",
+                                    history_text,
+                                    height=100,
+                                    disabled=True,
+                                    # Widget keys must be unique per sample and step
+                                    key=f"history_{i}_{step_num}",
+                                )
+                                # Show AI reasoning and action
+                                action_details = step_info.get("action_details", {})
+                                action = action_details.get("action", "N/A")
+                                if step_info.get("is_final_step") and action != "GUESS":
+                                    st.warning("Max steps reached. Forcing GUESS.")
+                                st.write("**AI Reasoning:**")
+                                st.info(step_info.get("reasoning", "N/A"))
+                                if step_info.get("debug_message") != "N/A":
+                                    st.write("**AI Debug Message:**")
+                                    st.code(step_info.get("debug_message"), language="json")
+                                st.write("**AI Action:**")
+                                if action == "GUESS":
+                                    lat = action_details.get("lat")
+                                    lon = action_details.get("lon")
+                                    has_coords = isinstance(
+                                        lat, (int, float)
+                                    ) and isinstance(lon, (int, float))
+                                    if has_coords:
+                                        st.success(f"`{action}` - {lat:.4f}, {lon:.4f}")
+                                    else:
+                                        # The ':.4f' format would raise TypeError
+                                        # here and abort the whole sample.
+                                        st.warning(
+                                            f"`{action}` - missing or invalid coordinates: {lat}, {lon}"
+                                        )
+                                else:
+                                    st.success(f"`{action}`")
+                                # Show decision details for debugging
+                                with st.expander("Decision Details"):
+                                    decision_data = {
+                                        "reasoning": step_info.get("reasoning"),
+                                        "action_details": step_info.get("action_details"),
+                                        "remaining_steps": step_info.get("remaining_steps"),
+                                    }
+                                    st.json(decision_data)
+                    # Force UI refresh
+                    time.sleep(0.5)  # Small delay to ensure UI updates are visible
+                # Run the agent loop with UI callback
+                try:
+                    final_guess = bot.run_agent_loop(
+                        max_steps=steps_per_sample, step_callback=ui_step_callback
                     )
+                except Exception as e:
+                    st.error(f"Error during agent execution: {e}")
+                    final_guess = None
+                # Sample Results
+                with sample_container:
+                    st.subheader("Sample Result")
+                    true_coords = {"lat": sample.get("lat"), "lng": sample.get("lng")}
+                    distance_km = None
+                    is_success = False
+                    if final_guess:
+                        distance_km = benchmark_helper.calculate_distance(
+                            true_coords, final_guess
+                        )
+                        if distance_km is not None:
+                            is_success = distance_km <= SUCCESS_THRESHOLD_KM
+                        col1, col2, col3 = st.columns(3)
+                        col1.metric(
+                            "Final Guess", f"{final_guess[0]:.3f}, {final_guess[1]:.3f}"
+                        )
+                        col2.metric(
+                            "Ground Truth",
+                            f"{true_coords['lat']:.3f}, {true_coords['lng']:.3f}",
+                        )
+                        col3.metric(
+                            "Distance",
+                            f"{distance_km:.1f} km",
+                            delta="Success" if is_success else "Failed",
+                        )
+                    else:
+                        st.error("No final guess made")
+                    all_results.append(
+                        {
+                            "sample_id": sample.get("id"),
+                            "model": model_choice,
+                            "steps_taken": len(sample_steps_data),
+                            "max_steps": steps_per_sample,
+                            "temperature": temperature,
+                            "true_coordinates": true_coords,
+                            "predicted_coordinates": final_guess,
+                            "distance_km": distance_km,
+                            "success": is_success,
+                        }
                     )
+                progress_bar.progress((i + 1) / num_samples)
+        # Final Summary
+        st.divider()
+        st.header("🏁 Final Results")
+        # Calculate summary stats
+        successes = [r for r in all_results if r["success"]]
+        success_rate = len(successes) / len(all_results) if all_results else 0
+        valid_distances = [
+            r["distance_km"] for r in all_results if r["distance_km"] is not None
+        ]
+        avg_distance = sum(valid_distances) / len(valid_distances) if valid_distances else 0
+        # Overall metrics
+        col1, col2, col3 = st.columns(3)
+        col1.metric("Success Rate", f"{success_rate * 100:.1f}%")
+        col2.metric("Average Distance", f"{avg_distance:.1f} km")
+        col3.metric("Total Samples", len(all_results))
+        # Detailed results table
+        st.subheader("Detailed Results")
+        st.dataframe(all_results, use_container_width=True)
+        # Success/failure breakdown
+        if successes:
+            st.subheader("✅ Successful Samples")
+            st.dataframe(successes, use_container_width=True)
+        failures = [r for r in all_results if not r["success"]]
+        if failures:
+            st.subheader("❌ Failed Samples")
+            st.dataframe(failures, use_container_width=True)
+        # Export functionality
+        if st.button("💾 Export Results"):
+            results_json = json.dumps(all_results, indent=2)
+            st.download_button(
+                label="Download results.json",
+                data=results_json,
+                file_name=f"geo_results_{dataset_choice}_{model_choice}_{num_samples}samples.json",
+                mime="application/json",
+            )
 def handle_tab_completion():

config.py CHANGED Viewed

@@ -38,12 +38,12 @@ DEFAULT_TEMPERATURE = 1.0
 # Model configurations
 MODELS_CONFIG = {
     "gpt-4o": {
-        "class": "ChatOpenAI",
         "model_name": "gpt-4o",
         "description": "OpenAI GPT-4o",
     },
     "gpt-4o-mini": {
-        "class": "ChatOpenAI",
         "model_name": "gpt-4o-mini",
         "description": "OpenAI GPT-4o Mini",
     },

 # Model configurations
 MODELS_CONFIG = {
     "gpt-4o": {
+        "class": "OpenRouter",
         "model_name": "gpt-4o",
         "description": "OpenAI GPT-4o",
     },
     "gpt-4o-mini": {
+        "class": "OpenRouter",
         "model_name": "gpt-4o-mini",
         "description": "OpenAI GPT-4o Mini",
     },

experiment_runner.py ADDED Viewed

File without changes

geo_bot.py CHANGED Viewed

@@ -69,6 +69,72 @@ Your response MUST be a valid JSON object wrapped in ```json ... ```.
 ```
 """
 BENCHMARK_PROMPT = """
 Analyze the image and determine its geographic coordinates.
 1.  Describe visual clues.
@@ -255,6 +321,49 @@ class GeoBot:
         return decision
     def execute_action(self, action: str) -> bool:
         """
         Execute the given action using the controller.
@@ -272,6 +381,62 @@ class GeoBot:
             self.controller.pan_view("right")
         return True
     def run_agent_loop(
         self, max_steps: int = 10, step_callback=None
     ) -> Optional[Tuple[float, float]]:

 ```
 """
+TEST_AGENT_PROMPT_TEMPLATE = """
+**Mission:** You are an expert geo-location agent. Your goal is to pinpoint our position based on the surroundings and your observation history.
+**Current Status**
+• Actions You Can Take *this* turn: {available_actions}
+────────────────────────────────
+**Core Principles**
+1.  **Observe → Orient → Act**
+    Start each turn with a structured three-part reasoning block:
+    **(1) Visual Clues —** plainly describe what you see (signs, text language, road lines, vegetation, building styles, vehicles, terrain, weather, etc.).
+    **(2) Potential Regions —** list the most plausible regions/countries those clues suggest.
+    **(3) Most Probable + Plan —** pick the single likeliest region and explain the next action (move/pan or guess).
+2.  **Navigate with Labels:**
+    - `MOVE_FORWARD` follows the green **UP** arrow.
+    - `MOVE_BACKWARD` follows the red **DOWN** arrow.
+    - No arrow ⇒ you cannot move that way.
+3.  **Efficient Exploration:**
+    - **Pan Before You Move:** At fresh spots/intersections, use `PAN_LEFT` / `PAN_RIGHT` first.
+    - After ~2 or 3 fruitless moves in repetitive scenery, turn around.
+4.  **Be Decisive:** A unique, definitive clue (full address, rare town name, etc.) ⇒ `GUESS` immediately.
+5.  **Final-Step Rule:** If **Remaining Steps = 1**, you **MUST** `GUESS` and you should carefully check the image and the surroundings.
+6.  **Always Predict:** On EVERY step, provide your current best estimate of the location, even if you're not ready to make a final guess.
+────────────────────────────────
+**Context & Task:**
+Analyze your full journey history and current view, apply the Core Principles, and decide your next action in the required JSON format.
+**Action History**
+{history_text}
+────────────────────────────────
+**JSON Output Format:**
+Your response MUST be a valid JSON object wrapped in ```json ... ```.
+{{
+  "reasoning": "…",
+  "current_prediction": {{
+    "lat": <float>,
+    "lon": <float>,
+    "location_description": "Brief description of predicted location"
+  }},
+  "action_details": {{"action": action chosen from the available actions}}
+}}
+**Example **
+```json
+{{
+  "reasoning": "(1) Visual Clues — I see left-side driving, eucalyptus trees, and a yellow speed-warning sign; the road markings are solid white. (2) Potential Regions — Southeastern Australia, Tasmania, or the North Island of New Zealand. (3) Most Probable + Plan — The scene most likely sits in a suburb of Hobart, Tasmania. I will PAN_LEFT to look for additional road signs that confirm this.",
+  "current_prediction": {{
+    "lat": -42.8806,
+    "lon": 147.3250,
+    "location_description": "Hobart suburb, Tasmania, Australia"
+  }},
+  "action_details": {{
+    "action": "PAN_LEFT"
+  }}
+}}
+```
+"""
 BENCHMARK_PROMPT = """
 Analyze the image and determine its geographic coordinates.
 1.  Describe visual clues.
         return decision
+    def execute_test_agent_step(
+        self,
+        history: List[Dict[str, Any]],
+        current_screenshot_b64: str,
+        available_actions: List[str],
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Run one test-mode agent step and return the model's decision.
+
+        Formats TEST_AGENT_PROMPT_TEMPLATE with the history text and the
+        available actions, sends it to the model together with the current
+        screenshot, and parses the reply.
+
+        Args:
+            history: Previous steps, as consumed by generate_history_text().
+            current_screenshot_b64: Base64 string of the current screenshot.
+            available_actions: Action names the model may choose from this turn.
+
+        Returns:
+            The parsed decision dict. If the model call or the parsing fails,
+            a recovery decision is returned instead: action PAN_RIGHT,
+            "current_prediction" set to "N/A", and a "debug_message" holding
+            the raw model reply (or the error text when no reply was received).
+        """
+        history_text = self.generate_history_text(history)
+        image_b64_for_prompt = self.get_history_images(history) + [
+            current_screenshot_b64
+        ]
+        prompt = TEST_AGENT_PROMPT_TEMPLATE.format(
+            history_text=history_text,
+            available_actions=available_actions,
+        )
+        # Bound before the try block so the recovery path can always build a
+        # debug message, even when the failure happened before a reply existed.
+        response = None
+        error = None
+        try:
+            # NOTE(review): only the last image (the current screenshot) is
+            # sent; the history images collected above are not passed on.
+            message = self._create_message_with_history(
+                prompt, image_b64_for_prompt[-1:]
+            )
+            response = self.model.invoke(message)
+            decision = self._parse_agent_response(response)
+        except Exception as e:
+            print(f"Error during model invocation: {e}")
+            decision = None
+            error = e
+        if not decision:
+            print(
+                "Response parsing failed or model error. Using default recovery action: PAN_RIGHT."
+            )
+            if response is not None:
+                debug_message = str(getattr(response, "content", "")).strip()
+            else:
+                debug_message = f"Model invocation error: {error}"
+            decision = {
+                "reasoning": "Recovery due to parsing failure or model error.",
+                "action_details": {"action": "PAN_RIGHT"},
+                "current_prediction": "N/A",
+                "debug_message": debug_message,
+            }
+        return decision
     def execute_action(self, action: str) -> bool:
         """
         Execute the given action using the controller.
             self.controller.pan_view("right")
         return True
+    def test_run_agent_loop(
+        self, max_steps: int = 10, step_callback=None
+    ) -> Optional[List[Any]]:
+        """
+        Run the test-mode agent loop and collect one prediction per step.
+
+        Each step takes a screenshot, asks the model for a decision via
+        execute_test_agent_step(), records the decision's "current_prediction"
+        and executes the chosen action.
+
+        Args:
+            max_steps: Number of steps to run.
+            step_callback: Optional callable invoked with the step_info dict of
+                each step, before that step is appended to the history.
+
+        Returns:
+            The "current_prediction" values in step order ("N/A" where the
+            decision had none). If a screenshot cannot be taken, the loop ends
+            early and the predictions gathered so far (possibly an empty list)
+            are returned, so callers can iterate the result unconditionally.
+        """
+        history = self.init_history()
+        predictions = []
+        # 'step' counts down and therefore equals the remaining steps.
+        for step in range(max_steps, 0, -1):
+            # Setup and screenshot
+            self.controller.setup_clean_environment()
+            self.controller.label_arrows_on_screen()
+            screenshot_bytes = self.controller.take_street_view_screenshot()
+            if not screenshot_bytes:
+                print("Failed to take screenshot. Ending agent loop.")
+                # Keep the partial results instead of discarding them with None.
+                return predictions
+            current_screenshot_b64 = self.pil_to_base64(
+                image=Image.open(BytesIO(screenshot_bytes))
+            )
+            available_actions = self.controller.get_test_available_actions()
+            print(f"Available actions: {available_actions}")
+            # Normal step execution
+            decision = self.execute_test_agent_step(
+                history, current_screenshot_b64, available_actions
+            )
+            # Create step_info with current history BEFORE adding current step
+            # This shows the history up to (but not including) the current step
+            step_info = {
+                "step_num": max_steps - step + 1,  # 1-based index of this step
+                "max_steps": max_steps,
+                "remaining_steps": step,
+                "screenshot_bytes": screenshot_bytes,
+                "screenshot_b64": current_screenshot_b64,
+                "available_actions": available_actions,
+                "is_final_step": step == 1,
+                "reasoning": decision.get("reasoning", "N/A"),
+                "action_details": decision.get("action_details", {"action": "N/A"}),
+                "history": history.copy(),  # History up to current step (excluding current)
+                "debug_message": decision.get("debug_message", "N/A"),
+                "current_prediction": decision.get("current_prediction", "N/A"),
+            }
+            action_details = decision.get("action_details", {})
+            action = action_details.get("action")
+            print(f"AI Reasoning: {decision.get('reasoning', 'N/A')}")
+            print(f"AI Current Prediction: {decision.get('current_prediction', 'N/A')}")
+            print(f"AI Action: {action}")
+            # The callback was accepted but never called before.
+            if step_callback:
+                step_callback(step_info)
+            # Add step to history AFTER callback (so next iteration has this step in history)
+            self.add_step_to_history(history, current_screenshot_b64, decision)
+            predictions.append(decision.get("current_prediction", "N/A"))
+            self.execute_action(action)
+        return predictions
     def run_agent_loop(
         self, max_steps: int = 10, step_callback=None
     ) -> Optional[Tuple[float, float]]:

mapcrunch_controller.py CHANGED Viewed

@@ -214,6 +214,16 @@ class MapCrunchController:
             base_actions.extend(["MOVE_FORWARD", "MOVE_BACKWARD"])
         return base_actions
     def get_current_address(self) -> Optional[str]:
         try:
             address_element = self.wait.until(

             base_actions.extend(["MOVE_FORWARD", "MOVE_BACKWARD"])
         return base_actions
+    def get_test_available_actions(self) -> List[str]:
+        """
+        Return the actions offered to the agent in test mode.
+
+        Panning is always offered. Both move actions are added when the
+        JavaScript call window.panorama.getLinks() returns a non-empty result.
+        """
+        actions = ["PAN_LEFT", "PAN_RIGHT"]
+        panorama_links = self.driver.execute_script(
+            "return window.panorama.getLinks();"
+        )
+        if not panorama_links or len(panorama_links) == 0:
+            return actions
+        return actions + ["MOVE_FORWARD", "MOVE_BACKWARD"]
     def get_current_address(self) -> Optional[str]:
         try:
             address_element = self.wait.until(