Andy Lee committed on
Commit
1c04950
ยท
1 Parent(s): 78ec24e

revert: better ui

Browse files
Files changed (1) hide show
  1. app.py +185 -121
app.py CHANGED
@@ -57,7 +57,7 @@ st.title("๐Ÿ—บ๏ธ MapCrunch AI Agent")
57
 
58
  # Sidebar
59
  with st.sidebar:
60
- st.header("โš™๏ธ Configuration")
61
 
62
  dataset_choice = st.selectbox("Dataset", get_available_datasets())
63
  model_choice = st.selectbox("Model", list(MODELS_CONFIG.keys()))
@@ -91,137 +91,201 @@ if start_button:
91
  ) as bot:
92
  for i, sample in enumerate(test_samples):
93
  st.divider()
94
- st.header(f"Sample {i + 1}/{num_samples}")
95
 
96
  bot.controller.load_location_from_data(sample)
97
  bot.controller.setup_clean_environment()
98
 
99
- col1, col2 = st.columns([2, 3])
100
-
101
- with col1:
102
- image_placeholder = st.empty()
103
- with col2:
104
- reasoning_placeholder = st.empty()
105
- action_placeholder = st.empty()
106
-
107
- history = []
108
- final_guess = None
109
-
110
- for step in range(steps_per_sample):
111
- step_num = step + 1
112
- reasoning_placeholder.info(f"๐Ÿค” Step {step_num}/{steps_per_sample}")
113
-
114
- bot.controller.label_arrows_on_screen()
115
- screenshot_bytes = bot.controller.take_street_view_screenshot()
116
- image_placeholder.image(screenshot_bytes, caption=f"Step {step_num}")
117
-
118
- current_step = {
119
- "image_b64": bot.pil_to_base64(
120
- Image.open(BytesIO(screenshot_bytes))
121
- ),
122
- "action": "N/A",
123
- }
124
- history.append(current_step)
125
-
126
- available_actions = bot.controller.get_available_actions()
127
- history_text = "\n".join(
128
- [f"Step {j + 1}: {h['action']}" for j, h in enumerate(history[:-1])]
129
- )
130
- if not history_text:
131
- history_text = "First step."
132
-
133
- prompt = AGENT_PROMPT_TEMPLATE.format(
134
- remaining_steps=steps_per_sample - step,
135
- history_text=history_text,
136
- available_actions=json.dumps(available_actions),
137
- )
138
-
139
- message = bot._create_message_with_history(
140
- prompt, [h["image_b64"] for h in history]
141
- )
142
- response = bot.model.invoke(message)
143
- decision = bot._parse_agent_response(response)
144
-
145
- if not decision:
146
- decision = {
147
- "action_details": {"action": "PAN_RIGHT"},
148
- "reasoning": "Fallback",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  }
150
-
151
- action = decision.get("action_details", {}).get("action")
152
- history[-1]["action"] = action
153
-
154
- reasoning_placeholder.success("โœ… Decision Made")
155
- action_placeholder.success(f"๐ŸŽฏ Action: `{action}`")
156
-
157
- with action_placeholder:
158
- with st.expander("Reasoning"):
159
- st.write(decision.get("reasoning", "N/A"))
160
-
161
- if step_num == steps_per_sample and action != "GUESS":
162
- action = "GUESS"
163
-
164
- if action == "GUESS":
165
- lat = decision.get("action_details", {}).get("lat")
166
- lon = decision.get("action_details", {}).get("lon")
167
- if lat is not None and lon is not None:
168
- final_guess = (lat, lon)
169
- break
170
- elif action == "MOVE_FORWARD":
171
- bot.controller.move("forward")
172
- elif action == "MOVE_BACKWARD":
173
- bot.controller.move("backward")
174
- elif action == "PAN_LEFT":
175
- bot.controller.pan_view("left")
176
- elif action == "PAN_RIGHT":
177
- bot.controller.pan_view("right")
178
-
179
- time.sleep(1)
180
-
181
- # Results
182
- true_coords = {"lat": sample.get("lat"), "lng": sample.get("lng")}
183
- distance_km = None
184
- is_success = False
185
-
186
- if final_guess:
187
- distance_km = benchmark_helper.calculate_distance(
188
- true_coords, final_guess
189
- )
190
- if distance_km is not None:
191
- is_success = distance_km <= SUCCESS_THRESHOLD_KM
192
-
193
- st.subheader("๐ŸŽฏ Result")
194
- col1, col2, col3 = st.columns(3)
195
- col1.metric("Guess", f"{final_guess[0]:.3f}, {final_guess[1]:.3f}")
196
- col2.metric(
197
- "Truth", f"{true_coords['lat']:.3f}, {true_coords['lng']:.3f}"
198
- )
199
- col3.metric(
200
- "Distance",
201
- f"{distance_km:.1f} km",
202
- delta="Success" if is_success else "Failed",
203
  )
204
 
205
- all_results.append(
206
- {
207
- "sample_id": sample.get("id"),
208
- "model": model_choice,
209
- "true_coordinates": true_coords,
210
- "predicted_coordinates": final_guess,
211
- "distance_km": distance_km,
212
- "success": is_success,
213
- }
214
- )
215
-
216
  progress_bar.progress((i + 1) / num_samples)
217
 
218
- # Summary
219
  st.divider()
220
- st.header("๐Ÿ Summary")
 
221
  summary = benchmark_helper.generate_summary(all_results)
222
  if summary and model_choice in summary:
223
  stats = summary[model_choice]
224
- col1, col2 = st.columns(2)
 
 
225
  col1.metric("Success Rate", f"{stats.get('success_rate', 0) * 100:.1f}%")
226
- col2.metric("Avg Distance", f"{stats.get('average_distance_km', 0):.1f} km")
227
- st.dataframe(all_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  # Sidebar
59
  with st.sidebar:
60
+ st.header("Configuration")
61
 
62
  dataset_choice = st.selectbox("Dataset", get_available_datasets())
63
  model_choice = st.selectbox("Model", list(MODELS_CONFIG.keys()))
 
91
  ) as bot:
92
  for i, sample in enumerate(test_samples):
93
  st.divider()
94
+ st.header(f"Sample {i + 1}/{num_samples} - ID: {sample.get('id', 'N/A')}")
95
 
96
  bot.controller.load_location_from_data(sample)
97
  bot.controller.setup_clean_environment()
98
 
99
+ # Create scrollable container for this sample
100
+ sample_container = st.container()
101
+
102
+ with sample_container:
103
+ # Initialize step tracking
104
+ history = []
105
+ final_guess = None
106
+
107
+ for step in range(steps_per_sample):
108
+ step_num = step + 1
109
+
110
+ # Create step container
111
+ with st.container():
112
+ st.subheader(f"Step {step_num}/{steps_per_sample}")
113
+
114
+ # Take screenshot and show
115
+ bot.controller.label_arrows_on_screen()
116
+ screenshot_bytes = bot.controller.take_street_view_screenshot()
117
+
118
+ col1, col2 = st.columns([1, 2])
119
+
120
+ with col1:
121
+ st.image(
122
+ screenshot_bytes,
123
+ caption=f"What AI sees",
124
+ use_column_width=True,
125
+ )
126
+
127
+ with col2:
128
+ # Build history for AI
129
+ current_step = {
130
+ "image_b64": bot.pil_to_base64(
131
+ Image.open(BytesIO(screenshot_bytes))
132
+ ),
133
+ "action": "N/A",
134
+ }
135
+ history.append(current_step)
136
+
137
+ available_actions = bot.controller.get_available_actions()
138
+ history_text = "\n".join(
139
+ [
140
+ f"Step {j + 1}: {h['action']}"
141
+ for j, h in enumerate(history[:-1])
142
+ ]
143
+ )
144
+ if not history_text:
145
+ history_text = "First step."
146
+
147
+ prompt = AGENT_PROMPT_TEMPLATE.format(
148
+ remaining_steps=steps_per_sample - step,
149
+ history_text=history_text,
150
+ available_actions=json.dumps(available_actions),
151
+ )
152
+
153
+ # Show AI context
154
+ st.write("**Available Actions:**")
155
+ st.code(json.dumps(available_actions, indent=2))
156
+
157
+ st.write("**AI Context:**")
158
+ st.text_area(
159
+ "History",
160
+ history_text,
161
+ height=100,
162
+ disabled=True,
163
+ key=f"history_{i}_{step}",
164
+ )
165
+
166
+ # Get AI response
167
+ with st.spinner("AI thinking..."):
168
+ message = bot._create_message_with_history(
169
+ prompt, [h["image_b64"] for h in history]
170
+ )
171
+ response = bot.model.invoke(message)
172
+ decision = bot._parse_agent_response(response)
173
+
174
+ if not decision:
175
+ decision = {
176
+ "action_details": {"action": "PAN_RIGHT"},
177
+ "reasoning": "Fallback",
178
+ }
179
+
180
+ action = decision.get("action_details", {}).get("action")
181
+ history[-1]["action"] = action
182
+
183
+ # Show AI decision
184
+ st.write("**AI Reasoning:**")
185
+ st.info(decision.get("reasoning", "N/A"))
186
+
187
+ st.write("**AI Action:**")
188
+ st.success(f"`{action}`")
189
+
190
+ # Show raw response
191
+ with st.expander("Raw AI Response"):
192
+ st.text(response.content)
193
+
194
+ # Force guess on last step
195
+ if step_num == steps_per_sample and action != "GUESS":
196
+ st.warning("Max steps reached. Forcing GUESS.")
197
+ action = "GUESS"
198
+
199
+ # Execute action
200
+ if action == "GUESS":
201
+ lat = decision.get("action_details", {}).get("lat")
202
+ lon = decision.get("action_details", {}).get("lon")
203
+ if lat is not None and lon is not None:
204
+ final_guess = (lat, lon)
205
+ st.success(f"Final Guess: {lat:.4f}, {lon:.4f}")
206
+ break
207
+ elif action == "MOVE_FORWARD":
208
+ bot.controller.move("forward")
209
+ elif action == "MOVE_BACKWARD":
210
+ bot.controller.move("backward")
211
+ elif action == "PAN_LEFT":
212
+ bot.controller.pan_view("left")
213
+ elif action == "PAN_RIGHT":
214
+ bot.controller.pan_view("right")
215
+
216
+ time.sleep(1)
217
+
218
+ # Sample Results
219
+ st.subheader("Sample Result")
220
+ true_coords = {"lat": sample.get("lat"), "lng": sample.get("lng")}
221
+ distance_km = None
222
+ is_success = False
223
+
224
+ if final_guess:
225
+ distance_km = benchmark_helper.calculate_distance(
226
+ true_coords, final_guess
227
+ )
228
+ if distance_km is not None:
229
+ is_success = distance_km <= SUCCESS_THRESHOLD_KM
230
+
231
+ col1, col2, col3 = st.columns(3)
232
+ col1.metric(
233
+ "Final Guess", f"{final_guess[0]:.3f}, {final_guess[1]:.3f}"
234
+ )
235
+ col2.metric(
236
+ "Ground Truth",
237
+ f"{true_coords['lat']:.3f}, {true_coords['lng']:.3f}",
238
+ )
239
+ col3.metric(
240
+ "Distance",
241
+ f"{distance_km:.1f} km",
242
+ delta="Success" if is_success else "Failed",
243
+ )
244
+ else:
245
+ st.error("No final guess made")
246
+
247
+ all_results.append(
248
+ {
249
+ "sample_id": sample.get("id"),
250
+ "model": model_choice,
251
+ "true_coordinates": true_coords,
252
+ "predicted_coordinates": final_guess,
253
+ "distance_km": distance_km,
254
+ "success": is_success,
255
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  )
257
 
 
 
 
 
 
 
 
 
 
 
 
258
  progress_bar.progress((i + 1) / num_samples)
259
 
260
+ # Final Summary
261
  st.divider()
262
+ st.header("๐Ÿ Final Results")
263
+
264
  summary = benchmark_helper.generate_summary(all_results)
265
  if summary and model_choice in summary:
266
  stats = summary[model_choice]
267
+
268
+ # Overall metrics
269
+ col1, col2, col3 = st.columns(3)
270
  col1.metric("Success Rate", f"{stats.get('success_rate', 0) * 100:.1f}%")
271
+ col2.metric("Average Distance", f"{stats.get('average_distance_km', 0):.1f} km")
272
+ col3.metric("Total Samples", len(all_results))
273
+
274
+ # Detailed results table
275
+ st.subheader("Detailed Results")
276
+ st.dataframe(all_results, use_container_width=True)
277
+
278
+ # Success breakdown
279
+ successes = [r for r in all_results if r["success"]]
280
+ failures = [r for r in all_results if not r["success"]]
281
+
282
+ if successes:
283
+ st.subheader("Successful Samples")
284
+ st.dataframe(successes, use_container_width=True)
285
+
286
+ if failures:
287
+ st.subheader("Failed Samples")
288
+ st.dataframe(failures, use_container_width=True)
289
+ else:
290
+ st.error("Could not generate summary")
291
+ st.dataframe(all_results, use_container_width=True)