Spaces:
Running
Running
Add debug message to benchmark mode in the HF UI
Browse files
- When the model's output cannot be parsed successfully, the original model response is displayed.
- Benchmark mode: add a failure signal.
- app.py +3 -1
- benchmark.py +6 -1
- geo_bot.py +2 -0
app.py
CHANGED
@@ -306,7 +306,9 @@ if start_button:
|
|
306 |
|
307 |
st.write("**AI Reasoning:**")
|
308 |
st.info(step_info.get("reasoning", "N/A"))
|
309 |
-
|
|
|
|
|
310 |
st.write("**AI Action:**")
|
311 |
if action == "GUESS":
|
312 |
lat = step_info.get("action_details", {}).get("lat")
|
|
|
306 |
|
307 |
st.write("**AI Reasoning:**")
|
308 |
st.info(step_info.get("reasoning", "N/A"))
|
309 |
+
if step_info.get("debug_message") != "N/A":
|
310 |
+
st.write("**AI Debug Message:**")
|
311 |
+
st.code(step_info.get("debug_message"), language="json")
|
312 |
st.write("**AI Action:**")
|
313 |
if action == "GUESS":
|
314 |
lat = step_info.get("action_details", {}).get("lat")
|
benchmark.py
CHANGED
@@ -99,6 +99,9 @@ class MapGuesserBenchmark:
|
|
99 |
print(f"π Sample {i + 1}/{len(test_samples)}")
|
100 |
try:
|
101 |
result = self.run_single_test_with_bot(bot, sample)
|
|
|
|
|
|
|
102 |
all_results.append(result)
|
103 |
|
104 |
status = (
|
@@ -154,6 +157,8 @@ class MapGuesserBenchmark:
|
|
154 |
}
|
155 |
|
156 |
predicted_lat_lon = bot.analyze_image(screenshot)
|
|
|
|
|
157 |
inference_time = time.time() - start_time
|
158 |
|
159 |
true_coords = {"lat": location_data.get("lat"), "lng": location_data.get("lng")}
|
@@ -163,7 +168,7 @@ class MapGuesserBenchmark:
|
|
163 |
print(f"π True coords: {true_coords}")
|
164 |
print(f"π Predicted coords: {predicted_lat_lon}")
|
165 |
distance_km = self.calculate_distance(true_coords, predicted_lat_lon)
|
166 |
-
|
167 |
is_success = distance_km is not None and distance_km <= SUCCESS_THRESHOLD_KM
|
168 |
|
169 |
return {
|
|
|
99 |
print(f"π Sample {i + 1}/{len(test_samples)}")
|
100 |
try:
|
101 |
result = self.run_single_test_with_bot(bot, sample)
|
102 |
+
if result is None:
|
103 |
+
print(f"β Sample_{i+1} test failed: No predicted coords")
|
104 |
+
continue
|
105 |
all_results.append(result)
|
106 |
|
107 |
status = (
|
|
|
157 |
}
|
158 |
|
159 |
predicted_lat_lon = bot.analyze_image(screenshot)
|
160 |
+
if predicted_lat_lon is None:
|
161 |
+
return None
|
162 |
inference_time = time.time() - start_time
|
163 |
|
164 |
true_coords = {"lat": location_data.get("lat"), "lng": location_data.get("lng")}
|
|
|
168 |
print(f"π True coords: {true_coords}")
|
169 |
print(f"π Predicted coords: {predicted_lat_lon}")
|
170 |
distance_km = self.calculate_distance(true_coords, predicted_lat_lon)
|
171 |
+
|
172 |
is_success = distance_km is not None and distance_km <= SUCCESS_THRESHOLD_KM
|
173 |
|
174 |
return {
|
geo_bot.py
CHANGED
@@ -250,6 +250,7 @@ class GeoBot:
|
|
250 |
decision = {
|
251 |
"reasoning": "Recovery due to parsing failure or model error.",
|
252 |
"action_details": {"action": "PAN_RIGHT"},
|
|
|
253 |
}
|
254 |
|
255 |
return decision
|
@@ -347,6 +348,7 @@ class GeoBot:
|
|
347 |
"reasoning": decision.get("reasoning", "N/A"),
|
348 |
"action_details": decision.get("action_details", {"action": "N/A"}),
|
349 |
"history": history.copy(), # History up to current step (excluding current)
|
|
|
350 |
}
|
351 |
|
352 |
action_details = decision.get("action_details", {})
|
|
|
250 |
decision = {
|
251 |
"reasoning": "Recovery due to parsing failure or model error.",
|
252 |
"action_details": {"action": "PAN_RIGHT"},
|
253 |
+
"debug_message": f"{response.content.strip()}",
|
254 |
}
|
255 |
|
256 |
return decision
|
|
|
348 |
"reasoning": decision.get("reasoning", "N/A"),
|
349 |
"action_details": decision.get("action_details", {"action": "N/A"}),
|
350 |
"history": history.copy(), # History up to current step (excluding current)
|
351 |
+
"debug_message": decision.get("debug_message", "N/A"),
|
352 |
}
|
353 |
|
354 |
action_details = decision.get("action_details", {})
|