Spaces:
Running
Running
benediktstroebl
committed on
Commit
·
5a7e21a
1
Parent(s):
caec940
added failure report and two new swebench variants
Browse files
app.py
CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
|
|
7 |
import os
|
8 |
import json
|
9 |
from utils.data import parse_json_files
|
10 |
-
from utils.viz import create_scatter_plot, create_flow_chart
|
11 |
from utils.processing import check_and_process_uploads
|
12 |
from huggingface_hub import snapshot_download
|
13 |
from apscheduler.schedulers.background import BackgroundScheduler
|
@@ -21,7 +21,6 @@ import weave
|
|
21 |
|
22 |
|
23 |
from datetime import datetime
|
24 |
-
weave.init(f'leaderboard_testing_{datetime.now().strftime("%Y%m%d%H%M%S")}')
|
25 |
|
26 |
abs_path = Path(__file__).parent
|
27 |
|
@@ -43,29 +42,46 @@ def download_latest_results():
|
|
43 |
|
44 |
# Global variable to store preprocessed data
|
45 |
preprocessed_traces = {}
|
|
|
46 |
def preprocess_traces():
|
47 |
global preprocessed_traces
|
|
|
48 |
processed_dir = Path("evals_live")
|
49 |
for file in processed_dir.glob('*.json'):
|
50 |
-
|
51 |
-
with open(file, 'r') as f:
|
52 |
data = json.load(f)
|
53 |
agent_name = data['config']['agent_name']
|
54 |
benchmark_name = data['config']['benchmark_name']
|
55 |
if benchmark_name not in preprocessed_traces:
|
56 |
preprocessed_traces[benchmark_name] = {}
|
|
|
|
|
57 |
|
58 |
-
|
59 |
-
|
|
|
60 |
except AssertionError as e:
|
61 |
preprocessed_traces[benchmark_name][agent_name] = None
|
62 |
except Exception as e:
|
63 |
print(f"Error preprocessing {file}: {e}")
|
64 |
preprocessed_traces[benchmark_name][agent_name] = None
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
def get_analyzed_traces(agent_name, benchmark_name):
|
67 |
return preprocessed_traces.get(benchmark_name, {}).get(agent_name)
|
68 |
|
|
|
|
|
|
|
69 |
def update_agent_dropdown(benchmark_name, metric):
|
70 |
df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
|
71 |
agents = df['Agent Name'].tolist()
|
@@ -200,6 +216,33 @@ def format_call_info(step, step_index):
|
|
200 |
return formatted_info
|
201 |
|
202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
with gr.Blocks() as demo:
|
204 |
gr.Markdown("""
|
205 |
# 🥇 Agent Leaderboard
|
@@ -223,7 +266,7 @@ with gr.Blocks() as demo:
|
|
223 |
)
|
224 |
with gr.Row():
|
225 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
226 |
-
gr.Markdown("
|
227 |
with gr.Row():
|
228 |
with gr.Column(scale=1):
|
229 |
agent_dropdown = gr.Dropdown(label="Select Agent")
|
@@ -245,7 +288,7 @@ with gr.Blocks() as demo:
|
|
245 |
inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown, task_dropdown],
|
246 |
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
247 |
|
248 |
-
gr.Markdown("
|
249 |
with gr.Row():
|
250 |
with gr.Column(scale=1):
|
251 |
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
@@ -303,23 +346,200 @@ with gr.Blocks() as demo:
|
|
303 |
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
304 |
outputs=[raw_call_details])
|
305 |
|
306 |
-
with gr.Tab("SWE-Bench"):
|
307 |
with gr.Row():
|
308 |
with gr.Column(scale=2):
|
309 |
Leaderboard(
|
310 |
value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
|
311 |
select_columns=SelectColumns(
|
312 |
-
default_selection=config.
|
313 |
cant_deselect=["Agent Name"],
|
314 |
label="Select Columns to Display:",
|
315 |
),
|
316 |
-
search_columns=config.
|
317 |
column_widths={"Agent Name": 40,
|
318 |
"Accuracy": 20,
|
319 |
"Total Cost": 20},
|
320 |
)
|
321 |
with gr.Row():
|
322 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
|
324 |
with gr.Tab("About"):
|
325 |
gr.Markdown((Path(__file__).parent / "about.md").read_text())
|
@@ -332,8 +552,8 @@ async def main():
|
|
332 |
# Preprocess traces
|
333 |
preprocess_traces()
|
334 |
|
335 |
-
# Download the results from the Hugging Face Hub
|
336 |
-
await asyncio.to_thread(download_latest_results)
|
337 |
|
338 |
# Check for new uploads and process them
|
339 |
await check_and_process_uploads()
|
@@ -347,4 +567,5 @@ async def main():
|
|
347 |
await demo.launch()
|
348 |
|
349 |
if __name__ == "__main__":
|
|
|
350 |
asyncio.run(main())
|
|
|
7 |
import os
|
8 |
import json
|
9 |
from utils.data import parse_json_files
|
10 |
+
from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart
|
11 |
from utils.processing import check_and_process_uploads
|
12 |
from huggingface_hub import snapshot_download
|
13 |
from apscheduler.schedulers.background import BackgroundScheduler
|
|
|
21 |
|
22 |
|
23 |
from datetime import datetime
|
|
|
24 |
|
25 |
abs_path = Path(__file__).parent
|
26 |
|
|
|
42 |
|
43 |
# Global caches filled by preprocess_traces(), keyed as
# [benchmark_name][agent_name].
preprocessed_traces = {}
failure_reports = {}


def _extract_dict_section(data, key, file):
    """Return ``data[key]`` when it is a dict, otherwise None.

    A missing key is reported on stdout (naming ``file``) and yields None;
    a value of the wrong type yields None without a message.
    """
    try:
        section = data[key]
    except KeyError as e:
        print(f"Error preprocessing {file}: {e}")
        return None
    # Explicit isinstance check instead of `assert`: asserts are removed when
    # Python runs with -O, which would let malformed sections through.
    return section if isinstance(section, dict) else None


def preprocess_traces():
    """Load every ``evals_live/*.json`` file into the module-level caches.

    For each file, the agent and benchmark names are read from
    ``data['config']``. The ``raw_logging_results`` section is stored in
    ``preprocessed_traces`` and the ``failure_report`` section in
    ``failure_reports``; a section that is missing or is not a dict is
    stored as None so lookups for that agent still succeed.

    The directory is resolved relative to the current working directory.
    """
    global preprocessed_traces
    global failure_reports
    processed_dir = Path("evals_live")
    for file in processed_dir.glob('*.json'):
        with open(file, 'r') as f:
            data = json.load(f)
        agent_name = data['config']['agent_name']
        benchmark_name = data['config']['benchmark_name']

        traces_for_benchmark = preprocessed_traces.setdefault(benchmark_name, {})
        reports_for_benchmark = failure_reports.setdefault(benchmark_name, {})

        traces_for_benchmark[agent_name] = _extract_dict_section(
            data, 'raw_logging_results', file)
        reports_for_benchmark[agent_name] = _extract_dict_section(
            data, 'failure_report', file)
|
77 |
+
|
78 |
+
|
79 |
def get_analyzed_traces(agent_name, benchmark_name):
    """Look up the cached traces for an agent on a benchmark.

    Returns None when either the benchmark or the agent has no entry in
    ``preprocessed_traces``.
    """
    traces_by_agent = preprocessed_traces.get(benchmark_name, {})
    return traces_by_agent.get(agent_name)
|
81 |
|
82 |
+
def get_failure_report(agent_name, benchmark_name):
    """Look up the cached failure report for an agent on a benchmark.

    Returns None when either the benchmark or the agent has no entry in
    ``failure_reports``.
    """
    reports_by_agent = failure_reports.get(benchmark_name, {})
    return reports_by_agent.get(agent_name)
|
84 |
+
|
85 |
def update_agent_dropdown(benchmark_name, metric):
|
86 |
df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
|
87 |
agents = df['Agent Name'].tolist()
|
|
|
216 |
return formatted_info
|
217 |
|
218 |
|
219 |
+
def update_failure_report(agent_name, benchmark_name="swebench_lite"):
    """Build the failure-report overview text and bar chart for one agent.

    Args:
        agent_name: Agent whose failure report should be displayed.
        benchmark_name: Benchmark whose cached reports are searched.
            Defaults to "swebench_lite", the value this function used to
            hard-code, so single-argument callers behave as before.

    Returns:
        A ``(markdown_text, chart)`` tuple. ``chart`` is None when no report
        is available or the report lists no failure categories.
    """
    failure_report = get_failure_report(agent_name, benchmark_name)
    if not failure_report:
        return "No failure report available for this agent.", None

    failure_categories = failure_report['failure_categories']

    # Create overview of failure categories
    overview_parts = ["## Failure Categories Overview\n\n"]
    for category in failure_categories:
        overview_parts.append(f"### {category['category_name']}\n")
        overview_parts.append(f"{category['description']}\n\n")
    categories_overview = "".join(overview_parts)

    # Nothing to plot; the chart helper cannot handle an empty category list.
    if not failure_categories:
        return categories_overview, None

    # Count tasks affected by each category. Ids are normalised to strings so
    # the positional lookup below matches whether they were stored as 1 or "1".
    category_counts = {}
    for classification in failure_report['task_classifications'].values():
        category_id = str(classification['category_id'])
        category_counts[category_id] = category_counts.get(category_id, 0) + 1

    # Prepare data for bar chart.
    # NOTE(review): assumes category ids are the 1-based position of each
    # entry in 'failure_categories' — confirm against the report producer.
    categories = [cat['category_name'] for cat in failure_categories]
    counts = [category_counts.get(str(position), 0)
              for position in range(1, len(categories) + 1)]

    # Create bar chart
    chart = create_bar_chart(categories, counts, "Failure Categories", "Number of Affected Tasks", "Failure Categories Distribution")

    return categories_overview, chart
|
244 |
+
|
245 |
+
|
246 |
with gr.Blocks() as demo:
|
247 |
gr.Markdown("""
|
248 |
# 🥇 Agent Leaderboard
|
|
|
266 |
)
|
267 |
with gr.Row():
|
268 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
269 |
+
gr.Markdown("# Agent Monitor")
|
270 |
with gr.Row():
|
271 |
with gr.Column(scale=1):
|
272 |
agent_dropdown = gr.Dropdown(label="Select Agent")
|
|
|
288 |
inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown, task_dropdown],
|
289 |
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
290 |
|
291 |
+
gr.Markdown("# Raw Predictions")
|
292 |
with gr.Row():
|
293 |
with gr.Column(scale=1):
|
294 |
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
|
|
346 |
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
347 |
outputs=[raw_call_details])
|
348 |
|
349 |
+
with gr.Tab("SWE-Bench Lite"):
|
350 |
with gr.Row():
|
351 |
with gr.Column(scale=2):
|
352 |
Leaderboard(
|
353 |
value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
|
354 |
select_columns=SelectColumns(
|
355 |
+
default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
|
356 |
cant_deselect=["Agent Name"],
|
357 |
label="Select Columns to Display:",
|
358 |
),
|
359 |
+
search_columns=config.SWEBENCH_SEARCH_COLUMNS,
|
360 |
column_widths={"Agent Name": 40,
|
361 |
"Accuracy": 20,
|
362 |
"Total Cost": 20},
|
363 |
)
|
364 |
with gr.Row():
|
365 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
366 |
+
|
367 |
+
gr.Markdown("# Failure Report")
|
368 |
+
with gr.Row():
|
369 |
+
with gr.Column(scale=1):
|
370 |
+
failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
371 |
+
with gr.Row():
|
372 |
+
with gr.Column(scale=1):
|
373 |
+
failure_categories_overview = gr.Markdown()
|
374 |
+
|
375 |
+
with gr.Column(scale=1):
|
376 |
+
failure_categories_chart = gr.Plot()
|
377 |
+
|
378 |
+
# Initialize the failure report agent dropdown with all agents
|
379 |
+
demo.load(update_agent_dropdown,
|
380 |
+
inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
|
381 |
+
outputs=[failure_report_agent_dropdown])
|
382 |
+
|
383 |
+
# Update failure report when agent is selected
|
384 |
+
failure_report_agent_dropdown.change(update_failure_report,
|
385 |
+
inputs=[failure_report_agent_dropdown],
|
386 |
+
outputs=[failure_categories_overview, failure_categories_chart])
|
387 |
+
|
388 |
+
gr.Markdown("# Raw Predictions")
|
389 |
+
with gr.Row():
|
390 |
+
with gr.Column(scale=1):
|
391 |
+
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
392 |
+
with gr.Column(scale=1):
|
393 |
+
raw_task_dropdown = gr.Dropdown(label="Select Task")
|
394 |
+
with gr.Column(scale=1):
|
395 |
+
raw_step_dropdown = gr.Dropdown(label="Select Step")
|
396 |
+
|
397 |
+
with gr.Row():
|
398 |
+
raw_call_details = gr.HTML()
|
399 |
+
|
400 |
+
def update_raw_task_dropdown(agent_name):
    """Refresh the task dropdown, step dropdown and call details for an agent.

    Uses the cached "swebench_lite" traces. When the agent has no traces, both
    dropdowns are emptied and a message is returned in place of the details.
    """
    traces = get_analyzed_traces(agent_name, "swebench_lite")
    if not traces:
        empty_tasks = gr.Dropdown(choices=[], label="Select Task")
        empty_steps = gr.Dropdown(choices=[], label="Select Step")
        return empty_tasks, empty_steps, f"No raw predictions data available for agent: {agent_name}."

    task_ids = list(traces.keys())
    first_task = task_ids[0]
    step_count = len(traces[first_task]['steps'])

    # Pre-select the first task and its first step.
    task_dropdown = gr.Dropdown(choices=task_ids, label="Select Task", value=first_task)
    step_choices = [(f"Step {i+1}", i) for i in range(step_count)]
    step_dropdown = gr.Dropdown(choices=step_choices, label="Select Step", value=0)
    details = update_raw_call_details(agent_name, first_task, 0)
    return task_dropdown, step_dropdown, details
|
407 |
+
|
408 |
+
def update_raw_step_dropdown(agent_name, task_id):
    """Refresh the step dropdown and call details for the selected task.

    Uses the cached "swebench_lite" traces. Always returns a
    ``(step_dropdown, call_details)`` pair: the change event this handler is
    attached to lists two output components, so every branch must supply both.
    """
    analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
    if not analyzed_traces or task_id not in analyzed_traces:
        # The message goes to the details output, not into the dropdown value
        # (where it would not match any choice).
        return gr.Dropdown(choices=[], label="Select Step", value=None), "No data available."

    steps = analyzed_traces[task_id]['steps']
    if not steps:
        # A task with no recorded steps has nothing to format.
        return gr.Dropdown(choices=[], label="Select Step", value=None), "No data available."

    step_choices = [(f"Step {i+1}", i) for i in range(len(steps))]
    return gr.Dropdown(choices=step_choices, label="Select Step", value=0), format_call_info(steps[0], 0)
|
414 |
+
|
415 |
+
def update_raw_call_details(agent_name, task_id, step_index):
    """Return the formatted details of one step of one task.

    Uses the cached "swebench_lite" traces. Returns a plain message instead of
    raising when the agent/task has no data or the step index is unusable.
    """
    analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
    if not analyzed_traces or task_id not in analyzed_traces:
        return "No data available for this selection."
    steps = analyzed_traces[task_id]['steps']
    # Reject a missing index and one left over from a task with more steps;
    # indexing with it would raise IndexError.
    if step_index is None or not 0 <= step_index < len(steps):
        return "Invalid step selection."
    return format_call_info(steps[step_index], step_index)
|
424 |
+
|
425 |
+
# Initialize the raw agent dropdown with all agents
|
426 |
+
demo.load(update_agent_dropdown,
|
427 |
+
inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
|
428 |
+
outputs=[raw_agent_dropdown])
|
429 |
+
demo.load(update_raw_task_dropdown,
|
430 |
+
inputs=[raw_agent_dropdown],
|
431 |
+
outputs=[raw_task_dropdown, raw_step_dropdown])
|
432 |
+
demo.load(update_raw_call_details,
|
433 |
+
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
434 |
+
outputs=[raw_call_details])
|
435 |
+
|
436 |
+
raw_agent_dropdown.change(update_raw_task_dropdown,
|
437 |
+
inputs=[raw_agent_dropdown],
|
438 |
+
outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
|
439 |
+
raw_task_dropdown.change(update_raw_step_dropdown,
|
440 |
+
inputs=[raw_agent_dropdown, raw_task_dropdown],
|
441 |
+
outputs=[raw_step_dropdown, raw_call_details])
|
442 |
+
raw_step_dropdown.change(update_raw_call_details,
|
443 |
+
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
444 |
+
outputs=[raw_call_details])
|
445 |
+
|
446 |
+
|
447 |
+
with gr.Tab("SWE-Bench Verified"):
|
448 |
+
with gr.Row():
|
449 |
+
with gr.Column(scale=2):
|
450 |
+
Leaderboard(
|
451 |
+
value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'),
|
452 |
+
select_columns=SelectColumns(
|
453 |
+
default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
|
454 |
+
cant_deselect=["Agent Name"],
|
455 |
+
label="Select Columns to Display:",
|
456 |
+
),
|
457 |
+
search_columns=config.SWEBENCH_SEARCH_COLUMNS,
|
458 |
+
column_widths={"Agent Name": 40,
|
459 |
+
"Accuracy": 20,
|
460 |
+
"Total Cost": 20},
|
461 |
+
)
|
462 |
+
with gr.Row():
|
463 |
+
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
464 |
+
|
465 |
+
gr.Markdown("# Failure Report")
|
466 |
+
with gr.Row():
|
467 |
+
with gr.Column(scale=1):
|
468 |
+
failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
469 |
+
with gr.Row():
|
470 |
+
with gr.Column(scale=1):
|
471 |
+
failure_categories_overview = gr.Markdown()
|
472 |
+
|
473 |
+
with gr.Column(scale=1):
|
474 |
+
failure_categories_chart = gr.Plot()
|
475 |
+
|
476 |
+
# Initialize the failure report agent dropdown with all agents
|
477 |
+
demo.load(update_agent_dropdown,
|
478 |
+
inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
|
479 |
+
outputs=[failure_report_agent_dropdown])
|
480 |
+
|
481 |
+
# Update failure report when agent is selected
|
482 |
+
failure_report_agent_dropdown.change(update_failure_report,
|
483 |
+
inputs=[failure_report_agent_dropdown],
|
484 |
+
outputs=[failure_categories_overview, failure_categories_chart])
|
485 |
+
|
486 |
+
gr.Markdown("# Raw Predictions")
|
487 |
+
with gr.Row():
|
488 |
+
with gr.Column(scale=1):
|
489 |
+
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
490 |
+
with gr.Column(scale=1):
|
491 |
+
raw_task_dropdown = gr.Dropdown(label="Select Task")
|
492 |
+
with gr.Column(scale=1):
|
493 |
+
raw_step_dropdown = gr.Dropdown(label="Select Step")
|
494 |
+
|
495 |
+
with gr.Row():
|
496 |
+
raw_call_details = gr.HTML()
|
497 |
+
|
498 |
+
def update_raw_task_dropdown(agent_name):
    """Refresh the task dropdown, step dropdown and call details for an agent.

    Uses the cached "swebench_verified" traces. When the agent has no traces,
    both dropdowns are emptied and a message is returned in place of the
    details.
    """
    traces = get_analyzed_traces(agent_name, "swebench_verified")
    if not traces:
        empty_tasks = gr.Dropdown(choices=[], label="Select Task")
        empty_steps = gr.Dropdown(choices=[], label="Select Step")
        return empty_tasks, empty_steps, f"No raw predictions data available for agent: {agent_name}."

    task_ids = list(traces.keys())
    first_task = task_ids[0]
    step_count = len(traces[first_task]['steps'])

    # Pre-select the first task and its first step.
    task_dropdown = gr.Dropdown(choices=task_ids, label="Select Task", value=first_task)
    step_choices = [(f"Step {i+1}", i) for i in range(step_count)]
    step_dropdown = gr.Dropdown(choices=step_choices, label="Select Step", value=0)
    details = update_raw_call_details(agent_name, first_task, 0)
    return task_dropdown, step_dropdown, details
|
505 |
+
|
506 |
+
def update_raw_step_dropdown(agent_name, task_id):
    """Refresh the step dropdown and call details for the selected task.

    Uses the cached "swebench_verified" traces. Always returns a
    ``(step_dropdown, call_details)`` pair: the change event this handler is
    attached to lists two output components, so every branch must supply both.
    """
    analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
    if not analyzed_traces or task_id not in analyzed_traces:
        # The message goes to the details output, not into the dropdown value
        # (where it would not match any choice).
        return gr.Dropdown(choices=[], label="Select Step", value=None), "No data available."

    steps = analyzed_traces[task_id]['steps']
    if not steps:
        # A task with no recorded steps has nothing to format.
        return gr.Dropdown(choices=[], label="Select Step", value=None), "No data available."

    step_choices = [(f"Step {i+1}", i) for i in range(len(steps))]
    return gr.Dropdown(choices=step_choices, label="Select Step", value=0), format_call_info(steps[0], 0)
|
512 |
+
|
513 |
+
def update_raw_call_details(agent_name, task_id, step_index):
    """Return the formatted details of one step of one task.

    Uses the cached "swebench_verified" traces. Returns a plain message
    instead of raising when the agent/task has no data or the step index is
    unusable.
    """
    analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
    if not analyzed_traces or task_id not in analyzed_traces:
        return "No data available for this selection."
    steps = analyzed_traces[task_id]['steps']
    # Reject a missing index and one left over from a task with more steps;
    # indexing with it would raise IndexError.
    if step_index is None or not 0 <= step_index < len(steps):
        return "Invalid step selection."
    return format_call_info(steps[step_index], step_index)
|
522 |
+
|
523 |
+
# Initialize the raw agent dropdown with all agents
|
524 |
+
demo.load(update_agent_dropdown,
|
525 |
+
inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
|
526 |
+
outputs=[raw_agent_dropdown])
|
527 |
+
demo.load(update_raw_task_dropdown,
|
528 |
+
inputs=[raw_agent_dropdown],
|
529 |
+
outputs=[raw_task_dropdown, raw_step_dropdown])
|
530 |
+
demo.load(update_raw_call_details,
|
531 |
+
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
532 |
+
outputs=[raw_call_details])
|
533 |
+
|
534 |
+
raw_agent_dropdown.change(update_raw_task_dropdown,
|
535 |
+
inputs=[raw_agent_dropdown],
|
536 |
+
outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
|
537 |
+
raw_task_dropdown.change(update_raw_step_dropdown,
|
538 |
+
inputs=[raw_agent_dropdown, raw_task_dropdown],
|
539 |
+
outputs=[raw_step_dropdown, raw_call_details])
|
540 |
+
raw_step_dropdown.change(update_raw_call_details,
|
541 |
+
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
542 |
+
outputs=[raw_call_details])
|
543 |
|
544 |
with gr.Tab("About"):
|
545 |
gr.Markdown((Path(__file__).parent / "about.md").read_text())
|
|
|
552 |
# Preprocess traces
|
553 |
preprocess_traces()
|
554 |
|
555 |
+
# # Download the results from the Hugging Face Hub
|
556 |
+
# await asyncio.to_thread(download_latest_results)
|
557 |
|
558 |
# Check for new uploads and process them
|
559 |
await check_and_process_uploads()
|
|
|
567 |
await demo.launch()
|
568 |
|
569 |
if __name__ == "__main__":
|
570 |
+
weave.init(f'leaderboard_testing_{datetime.now().strftime("%Y%m%d%H%M%S")}')
|
571 |
asyncio.run(main())
|
evals_live/swebench_lite_example_agent_1722587866.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:a6751248329f37cf663f523759211383585062cc698d613b648e948293783f3c
|
3 |
-
size 8444
|
|
|
|
|
|
|
|
evals_live/swebench_lite_example_agent_17227906123.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2ccf570b28e70b4ce6beff1fccf70eaf7ea0ff52730c338ff9a721f95bfa2131
|
3 |
+
size 21960
|
evals_live/swebench_lite_example_agent_1722790656.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:571a7158c57485e184e195214addc25e8c58a0b8191a1d280f323a24fd81eb54
|
3 |
-
size 10381
|
|
|
|
|
|
|
|
utils/processing.py
CHANGED
@@ -7,7 +7,9 @@ from email.mime.multipart import MIMEMultipart
|
|
7 |
import asyncio
|
8 |
import aiofiles
|
9 |
import aiosmtplib
|
10 |
-
from agent_monitor.monitor import analyze_agent_steps
|
|
|
|
|
11 |
|
12 |
async def check_and_process_uploads():
|
13 |
upload_dir = "evals_upload"
|
@@ -31,11 +33,11 @@ async def check_and_process_uploads():
|
|
31 |
if not os.path.exists(live_path) and not os.path.exists(processed_path):
|
32 |
unprocessed_uploads.append(upload)
|
33 |
elif os.path.exists(processed_path):
|
34 |
-
with open(upload_path, 'r') as f:
|
35 |
-
|
36 |
|
37 |
-
with open(processed_path, 'r') as f:
|
38 |
-
|
39 |
|
40 |
# TODO we can use a better comparison method with exact comparison
|
41 |
# if new_data != processed_data:
|
@@ -70,10 +72,10 @@ async def process_single_upload(upload_path, processed_path):
|
|
70 |
|
71 |
if check_result['is_valid']:
|
72 |
# Process the file
|
73 |
-
|
74 |
|
75 |
# Move the file to processed directory
|
76 |
-
await asyncio.to_thread(shutil.move, upload_path, processed_path)
|
77 |
|
78 |
# Send email notification
|
79 |
# await send_email_notification(upload_path.name, check_result, "Processing successful")
|
@@ -121,14 +123,15 @@ async def process_upload(input_path, output_path):
|
|
121 |
openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
|
122 |
|
123 |
try:
|
124 |
-
processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client)
|
|
|
|
|
|
|
125 |
except Exception as e:
|
|
|
126 |
print(f"Error in processing: {str(e)}")
|
127 |
return
|
128 |
|
129 |
-
# Save the processed data
|
130 |
-
data['raw_logging_results'] = processed_calls
|
131 |
-
|
132 |
with open(output_path, 'w') as f:
|
133 |
json.dump(data, f, indent=4)
|
134 |
|
|
|
7 |
import asyncio
|
8 |
import aiofiles
|
9 |
import aiosmtplib
|
10 |
+
from agent_monitor.monitor import analyze_agent_steps
|
11 |
+
from agent_monitor.failure_report import analyze_agent_performance, AsyncOpenAIClient
|
12 |
+
import traceback
|
13 |
|
14 |
async def check_and_process_uploads():
|
15 |
upload_dir = "evals_upload"
|
|
|
33 |
if not os.path.exists(live_path) and not os.path.exists(processed_path):
|
34 |
unprocessed_uploads.append(upload)
|
35 |
elif os.path.exists(processed_path):
|
36 |
+
# with open(upload_path, 'r') as f:
|
37 |
+
# new_data = json.load(f)
|
38 |
|
39 |
+
# with open(processed_path, 'r') as f:
|
40 |
+
# processed_data = json.load(f)
|
41 |
|
42 |
# TODO we can use a better comparison method with exact comparison
|
43 |
# if new_data != processed_data:
|
|
|
72 |
|
73 |
if check_result['is_valid']:
|
74 |
# Process the file
|
75 |
+
await process_upload(upload_path, processed_path)
|
76 |
|
77 |
# Move the file to processed directory
|
78 |
+
# await asyncio.to_thread(shutil.move, upload_path, processed_path)
|
79 |
|
80 |
# Send email notification
|
81 |
# await send_email_notification(upload_path.name, check_result, "Processing successful")
|
|
|
123 |
openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
|
124 |
|
125 |
try:
|
126 |
+
processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=False)
|
127 |
+
failure_report = await analyze_agent_performance(data['raw_logging_results'], data['results']['failed_tasks'], openai_client)
|
128 |
+
data['raw_logging_results'] = processed_calls
|
129 |
+
data['failure_report'] = failure_report
|
130 |
except Exception as e:
|
131 |
+
traceback.print_exc()
|
132 |
print(f"Error in processing: {str(e)}")
|
133 |
return
|
134 |
|
|
|
|
|
|
|
135 |
with open(output_path, 'w') as f:
|
136 |
json.dump(data, f, indent=4)
|
137 |
|
utils/viz.py
CHANGED
@@ -4,6 +4,58 @@ from utils.pareto import Agent, compute_pareto_frontier
|
|
4 |
import plotly.graph_objects as go
|
5 |
import textwrap
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
|
8 |
agents = [Agent(row['Total Cost'], row['Accuracy']) for i, row in df.iterrows()]
|
9 |
pareto_frontier = compute_pareto_frontier(agents)
|
|
|
4 |
import plotly.graph_objects as go
|
5 |
import textwrap
|
6 |
|
7 |
+
def create_bar_chart(categories, values, x_label, y_label, title):
    """Build a horizontal bar chart with the largest value at the top.

    Args:
        categories: Bar labels, one per bar.
        values: Bar lengths, paired positionally with ``categories``.
        x_label: Accepted for interface compatibility; not applied (see note).
        y_label: Accepted for interface compatibility; not applied (see note).
        title: Accepted for interface compatibility; not applied (see note).

    Returns:
        A plotly ``Figure``. Empty input produces a figure with no bars
        instead of raising.
    """
    # NOTE(review): x_label, y_label and title are never used in the layout
    # below. Left as-is to keep the rendered chart unchanged — confirm whether
    # they were meant to be shown.

    # Sort categories and values based on values in descending order
    sorted_data = sorted(zip(categories, values), key=lambda x: x[1], reverse=True)
    if sorted_data:
        categories, values = zip(*sorted_data)
    else:
        # zip(*[]) yields nothing to unpack; fall back to empty sequences.
        categories, values = (), ()

    fig = go.Figure(data=[go.Bar(
        y=categories,
        x=values,
        orientation='h',
        marker_color='#1b9e77',  # Same color as the scatter plot
        text=values,
        textposition='auto',
        # NOTE(review): `weight` presumably needs a plotly release whose font
        # settings support it — confirm against the pinned version.
        textfont=dict(color='black', size=14, family='Arial', weight=2),
        hovertemplate='<b>%{y}</b><br>' +
                      'Affected Tasks: %{x}<br>'
    )])

    fig.update_layout(
        height=600,
        xaxis=dict(
            showline=True,
            linecolor='black',
            showgrid=False
        ),
        yaxis=dict(
            showline=True,
            linecolor='black',
            showgrid=False,
            autorange="reversed"  # This will put the category with the highest value at the top
        ),
        plot_bgcolor='white',
        paper_bgcolor='white',
        bargap=0.2,
        bargroupgap=0.1,
        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
        modebar=dict(
            activecolor='#1f77b4',
            orientation='h',
            bgcolor='rgba(255,255,255,0.8)',
            color='#777',
            add=['pan2d'],
            remove=[
                'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
                'hoverClosestCartesian', 'hoverCompareCartesian',
                'toggleSpikelines', 'lasso2d', 'lasso', 'select2d', 'select'
            ]
        ),
        dragmode='pan'
    )

    return fig
|
58 |
+
|
59 |
def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
|
60 |
agents = [Agent(row['Total Cost'], row['Accuracy']) for i, row in df.iterrows()]
|
61 |
pareto_frontier = compute_pareto_frontier(agents)
|