Spaces:
Sleeping
Sleeping
Andy Lee
committed on
Commit
·
1c04950
1
Parent(s):
78ec24e
revert: better ui
Browse files
app.py
CHANGED
@@ -57,7 +57,7 @@ st.title("🗺️ MapCrunch AI Agent")
|
|
57 |
|
58 |
# Sidebar
|
59 |
with st.sidebar:
|
60 |
-
st.header("
|
61 |
|
62 |
dataset_choice = st.selectbox("Dataset", get_available_datasets())
|
63 |
model_choice = st.selectbox("Model", list(MODELS_CONFIG.keys()))
|
@@ -91,137 +91,201 @@ if start_button:
|
|
91 |
) as bot:
|
92 |
for i, sample in enumerate(test_samples):
|
93 |
st.divider()
|
94 |
-
st.header(f"Sample {i + 1}/{num_samples}")
|
95 |
|
96 |
bot.controller.load_location_from_data(sample)
|
97 |
bot.controller.setup_clean_environment()
|
98 |
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
}
|
150 |
-
|
151 |
-
action = decision.get("action_details", {}).get("action")
|
152 |
-
history[-1]["action"] = action
|
153 |
-
|
154 |
-
reasoning_placeholder.success("✅ Decision Made")
|
155 |
-
action_placeholder.success(f"🎯 Action: `{action}`")
|
156 |
-
|
157 |
-
with action_placeholder:
|
158 |
-
with st.expander("Reasoning"):
|
159 |
-
st.write(decision.get("reasoning", "N/A"))
|
160 |
-
|
161 |
-
if step_num == steps_per_sample and action != "GUESS":
|
162 |
-
action = "GUESS"
|
163 |
-
|
164 |
-
if action == "GUESS":
|
165 |
-
lat = decision.get("action_details", {}).get("lat")
|
166 |
-
lon = decision.get("action_details", {}).get("lon")
|
167 |
-
if lat is not None and lon is not None:
|
168 |
-
final_guess = (lat, lon)
|
169 |
-
break
|
170 |
-
elif action == "MOVE_FORWARD":
|
171 |
-
bot.controller.move("forward")
|
172 |
-
elif action == "MOVE_BACKWARD":
|
173 |
-
bot.controller.move("backward")
|
174 |
-
elif action == "PAN_LEFT":
|
175 |
-
bot.controller.pan_view("left")
|
176 |
-
elif action == "PAN_RIGHT":
|
177 |
-
bot.controller.pan_view("right")
|
178 |
-
|
179 |
-
time.sleep(1)
|
180 |
-
|
181 |
-
# Results
|
182 |
-
true_coords = {"lat": sample.get("lat"), "lng": sample.get("lng")}
|
183 |
-
distance_km = None
|
184 |
-
is_success = False
|
185 |
-
|
186 |
-
if final_guess:
|
187 |
-
distance_km = benchmark_helper.calculate_distance(
|
188 |
-
true_coords, final_guess
|
189 |
-
)
|
190 |
-
if distance_km is not None:
|
191 |
-
is_success = distance_km <= SUCCESS_THRESHOLD_KM
|
192 |
-
|
193 |
-
st.subheader("🎯 Result")
|
194 |
-
col1, col2, col3 = st.columns(3)
|
195 |
-
col1.metric("Guess", f"{final_guess[0]:.3f}, {final_guess[1]:.3f}")
|
196 |
-
col2.metric(
|
197 |
-
"Truth", f"{true_coords['lat']:.3f}, {true_coords['lng']:.3f}"
|
198 |
-
)
|
199 |
-
col3.metric(
|
200 |
-
"Distance",
|
201 |
-
f"{distance_km:.1f} km",
|
202 |
-
delta="Success" if is_success else "Failed",
|
203 |
)
|
204 |
|
205 |
-
all_results.append(
|
206 |
-
{
|
207 |
-
"sample_id": sample.get("id"),
|
208 |
-
"model": model_choice,
|
209 |
-
"true_coordinates": true_coords,
|
210 |
-
"predicted_coordinates": final_guess,
|
211 |
-
"distance_km": distance_km,
|
212 |
-
"success": is_success,
|
213 |
-
}
|
214 |
-
)
|
215 |
-
|
216 |
progress_bar.progress((i + 1) / num_samples)
|
217 |
|
218 |
-
# Summary
|
219 |
st.divider()
|
220 |
-
st.header("๐
|
|
|
221 |
summary = benchmark_helper.generate_summary(all_results)
|
222 |
if summary and model_choice in summary:
|
223 |
stats = summary[model_choice]
|
224 |
-
|
|
|
|
|
225 |
col1.metric("Success Rate", f"{stats.get('success_rate', 0) * 100:.1f}%")
|
226 |
-
col2.metric("
|
227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
# Sidebar
|
59 |
with st.sidebar:
|
60 |
+
st.header("Configuration")
|
61 |
|
62 |
dataset_choice = st.selectbox("Dataset", get_available_datasets())
|
63 |
model_choice = st.selectbox("Model", list(MODELS_CONFIG.keys()))
|
|
|
91 |
) as bot:
|
92 |
for i, sample in enumerate(test_samples):
|
93 |
st.divider()
|
94 |
+
st.header(f"Sample {i + 1}/{num_samples} - ID: {sample.get('id', 'N/A')}")
|
95 |
|
96 |
bot.controller.load_location_from_data(sample)
|
97 |
bot.controller.setup_clean_environment()
|
98 |
|
99 |
+
# Create scrollable container for this sample
|
100 |
+
sample_container = st.container()
|
101 |
+
|
102 |
+
with sample_container:
|
103 |
+
# Initialize step tracking
|
104 |
+
history = []
|
105 |
+
final_guess = None
|
106 |
+
|
107 |
+
for step in range(steps_per_sample):
|
108 |
+
step_num = step + 1
|
109 |
+
|
110 |
+
# Create step container
|
111 |
+
with st.container():
|
112 |
+
st.subheader(f"Step {step_num}/{steps_per_sample}")
|
113 |
+
|
114 |
+
# Take screenshot and show
|
115 |
+
bot.controller.label_arrows_on_screen()
|
116 |
+
screenshot_bytes = bot.controller.take_street_view_screenshot()
|
117 |
+
|
118 |
+
col1, col2 = st.columns([1, 2])
|
119 |
+
|
120 |
+
with col1:
|
121 |
+
st.image(
|
122 |
+
screenshot_bytes,
|
123 |
+
caption=f"What AI sees",
|
124 |
+
use_column_width=True,
|
125 |
+
)
|
126 |
+
|
127 |
+
with col2:
|
128 |
+
# Build history for AI
|
129 |
+
current_step = {
|
130 |
+
"image_b64": bot.pil_to_base64(
|
131 |
+
Image.open(BytesIO(screenshot_bytes))
|
132 |
+
),
|
133 |
+
"action": "N/A",
|
134 |
+
}
|
135 |
+
history.append(current_step)
|
136 |
+
|
137 |
+
available_actions = bot.controller.get_available_actions()
|
138 |
+
history_text = "\n".join(
|
139 |
+
[
|
140 |
+
f"Step {j + 1}: {h['action']}"
|
141 |
+
for j, h in enumerate(history[:-1])
|
142 |
+
]
|
143 |
+
)
|
144 |
+
if not history_text:
|
145 |
+
history_text = "First step."
|
146 |
+
|
147 |
+
prompt = AGENT_PROMPT_TEMPLATE.format(
|
148 |
+
remaining_steps=steps_per_sample - step,
|
149 |
+
history_text=history_text,
|
150 |
+
available_actions=json.dumps(available_actions),
|
151 |
+
)
|
152 |
+
|
153 |
+
# Show AI context
|
154 |
+
st.write("**Available Actions:**")
|
155 |
+
st.code(json.dumps(available_actions, indent=2))
|
156 |
+
|
157 |
+
st.write("**AI Context:**")
|
158 |
+
st.text_area(
|
159 |
+
"History",
|
160 |
+
history_text,
|
161 |
+
height=100,
|
162 |
+
disabled=True,
|
163 |
+
key=f"history_{i}_{step}",
|
164 |
+
)
|
165 |
+
|
166 |
+
# Get AI response
|
167 |
+
with st.spinner("AI thinking..."):
|
168 |
+
message = bot._create_message_with_history(
|
169 |
+
prompt, [h["image_b64"] for h in history]
|
170 |
+
)
|
171 |
+
response = bot.model.invoke(message)
|
172 |
+
decision = bot._parse_agent_response(response)
|
173 |
+
|
174 |
+
if not decision:
|
175 |
+
decision = {
|
176 |
+
"action_details": {"action": "PAN_RIGHT"},
|
177 |
+
"reasoning": "Fallback",
|
178 |
+
}
|
179 |
+
|
180 |
+
action = decision.get("action_details", {}).get("action")
|
181 |
+
history[-1]["action"] = action
|
182 |
+
|
183 |
+
# Show AI decision
|
184 |
+
st.write("**AI Reasoning:**")
|
185 |
+
st.info(decision.get("reasoning", "N/A"))
|
186 |
+
|
187 |
+
st.write("**AI Action:**")
|
188 |
+
st.success(f"`{action}`")
|
189 |
+
|
190 |
+
# Show raw response
|
191 |
+
with st.expander("Raw AI Response"):
|
192 |
+
st.text(response.content)
|
193 |
+
|
194 |
+
# Force guess on last step
|
195 |
+
if step_num == steps_per_sample and action != "GUESS":
|
196 |
+
st.warning("Max steps reached. Forcing GUESS.")
|
197 |
+
action = "GUESS"
|
198 |
+
|
199 |
+
# Execute action
|
200 |
+
if action == "GUESS":
|
201 |
+
lat = decision.get("action_details", {}).get("lat")
|
202 |
+
lon = decision.get("action_details", {}).get("lon")
|
203 |
+
if lat is not None and lon is not None:
|
204 |
+
final_guess = (lat, lon)
|
205 |
+
st.success(f"Final Guess: {lat:.4f}, {lon:.4f}")
|
206 |
+
break
|
207 |
+
elif action == "MOVE_FORWARD":
|
208 |
+
bot.controller.move("forward")
|
209 |
+
elif action == "MOVE_BACKWARD":
|
210 |
+
bot.controller.move("backward")
|
211 |
+
elif action == "PAN_LEFT":
|
212 |
+
bot.controller.pan_view("left")
|
213 |
+
elif action == "PAN_RIGHT":
|
214 |
+
bot.controller.pan_view("right")
|
215 |
+
|
216 |
+
time.sleep(1)
|
217 |
+
|
218 |
+
# Sample Results
|
219 |
+
st.subheader("Sample Result")
|
220 |
+
true_coords = {"lat": sample.get("lat"), "lng": sample.get("lng")}
|
221 |
+
distance_km = None
|
222 |
+
is_success = False
|
223 |
+
|
224 |
+
if final_guess:
|
225 |
+
distance_km = benchmark_helper.calculate_distance(
|
226 |
+
true_coords, final_guess
|
227 |
+
)
|
228 |
+
if distance_km is not None:
|
229 |
+
is_success = distance_km <= SUCCESS_THRESHOLD_KM
|
230 |
+
|
231 |
+
col1, col2, col3 = st.columns(3)
|
232 |
+
col1.metric(
|
233 |
+
"Final Guess", f"{final_guess[0]:.3f}, {final_guess[1]:.3f}"
|
234 |
+
)
|
235 |
+
col2.metric(
|
236 |
+
"Ground Truth",
|
237 |
+
f"{true_coords['lat']:.3f}, {true_coords['lng']:.3f}",
|
238 |
+
)
|
239 |
+
col3.metric(
|
240 |
+
"Distance",
|
241 |
+
f"{distance_km:.1f} km",
|
242 |
+
delta="Success" if is_success else "Failed",
|
243 |
+
)
|
244 |
+
else:
|
245 |
+
st.error("No final guess made")
|
246 |
+
|
247 |
+
all_results.append(
|
248 |
+
{
|
249 |
+
"sample_id": sample.get("id"),
|
250 |
+
"model": model_choice,
|
251 |
+
"true_coordinates": true_coords,
|
252 |
+
"predicted_coordinates": final_guess,
|
253 |
+
"distance_km": distance_km,
|
254 |
+
"success": is_success,
|
255 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
256 |
)
|
257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
258 |
progress_bar.progress((i + 1) / num_samples)
|
259 |
|
260 |
+
# Final Summary
|
261 |
st.divider()
|
262 |
+
st.header("๐ Final Results")
|
263 |
+
|
264 |
summary = benchmark_helper.generate_summary(all_results)
|
265 |
if summary and model_choice in summary:
|
266 |
stats = summary[model_choice]
|
267 |
+
|
268 |
+
# Overall metrics
|
269 |
+
col1, col2, col3 = st.columns(3)
|
270 |
col1.metric("Success Rate", f"{stats.get('success_rate', 0) * 100:.1f}%")
|
271 |
+
col2.metric("Average Distance", f"{stats.get('average_distance_km', 0):.1f} km")
|
272 |
+
col3.metric("Total Samples", len(all_results))
|
273 |
+
|
274 |
+
# Detailed results table
|
275 |
+
st.subheader("Detailed Results")
|
276 |
+
st.dataframe(all_results, use_container_width=True)
|
277 |
+
|
278 |
+
# Success breakdown
|
279 |
+
successes = [r for r in all_results if r["success"]]
|
280 |
+
failures = [r for r in all_results if not r["success"]]
|
281 |
+
|
282 |
+
if successes:
|
283 |
+
st.subheader("Successful Samples")
|
284 |
+
st.dataframe(successes, use_container_width=True)
|
285 |
+
|
286 |
+
if failures:
|
287 |
+
st.subheader("Failed Samples")
|
288 |
+
st.dataframe(failures, use_container_width=True)
|
289 |
+
else:
|
290 |
+
st.error("Could not generate summary")
|
291 |
+
st.dataframe(all_results, use_container_width=True)
|