Spaces:
Running
Running
benediktstroebl
committed on
Commit
·
22fef14
1
Parent(s):
9f9bed8
Added MLAgentBench
Browse files- app.py +140 -19
- config.py +10 -2
- utils/db.py +31 -4
app.py
CHANGED
@@ -343,6 +343,126 @@ with gr.Blocks() as demo:
|
|
343 |
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
344 |
outputs=[raw_call_details])
|
345 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
346 |
with gr.Tab("SWE-Bench Lite"):
|
347 |
with gr.Row():
|
348 |
with gr.Column(scale=2):
|
@@ -462,25 +582,25 @@ with gr.Blocks() as demo:
|
|
462 |
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
463 |
outputs=[raw_call_details])
|
464 |
|
465 |
-
|
466 |
-
with gr.Tab("SWE-Bench Verified"):
|
467 |
with gr.Row():
|
468 |
with gr.Column(scale=2):
|
469 |
Leaderboard(
|
470 |
-
value=parse_json_files(os.path.join(abs_path, "evals_live"), '
|
471 |
select_columns=SelectColumns(
|
472 |
-
default_selection=config.
|
473 |
cant_deselect=["Agent Name"],
|
474 |
label="Select Columns to Display:",
|
475 |
),
|
476 |
-
search_columns=config.
|
|
|
477 |
column_widths={"Agent Name": 40,
|
478 |
-
"
|
479 |
"Total Cost": 20},
|
480 |
)
|
481 |
with gr.Row():
|
482 |
-
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), '
|
483 |
-
|
484 |
gr.Markdown("# Failure Report")
|
485 |
with gr.Row():
|
486 |
with gr.Column(scale=1):
|
@@ -494,12 +614,12 @@ with gr.Blocks() as demo:
|
|
494 |
|
495 |
# Initialize the failure report agent dropdown with all agents
|
496 |
demo.load(update_agent_dropdown,
|
497 |
-
inputs=[gr.Textbox(value="
|
498 |
outputs=[failure_report_agent_dropdown])
|
499 |
|
500 |
# Update failure report when agent is selected
|
501 |
failure_report_agent_dropdown.change(update_failure_report,
|
502 |
-
inputs=[failure_report_agent_dropdown, gr.Textbox(value="
|
503 |
outputs=[failure_categories_overview, failure_categories_chart])
|
504 |
|
505 |
gr.Markdown("# Agent Monitor")
|
@@ -514,16 +634,16 @@ with gr.Blocks() as demo:
|
|
514 |
flow_chart = gr.Plot(label="Task Flow")
|
515 |
|
516 |
# Initialize the agent dropdown with the best agent
|
517 |
-
demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="
|
518 |
-
demo.load(update_task_analysis, inputs=[gr.Textbox(value="
|
519 |
|
520 |
agent_dropdown.change(update_task_analysis,
|
521 |
-
inputs=[gr.Textbox(value="
|
522 |
outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
|
523 |
task_dropdown.change(update_task_details,
|
524 |
-
inputs=[gr.Textbox(value="
|
525 |
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
526 |
-
|
527 |
gr.Markdown("# Raw Predictions")
|
528 |
with gr.Row():
|
529 |
with gr.Column(scale=1):
|
@@ -537,7 +657,7 @@ with gr.Blocks() as demo:
|
|
537 |
raw_call_details = gr.HTML()
|
538 |
|
539 |
def update_raw_task_dropdown(agent_name):
|
540 |
-
analyzed_traces = get_analyzed_traces(agent_name, "
|
541 |
if not analyzed_traces:
|
542 |
return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
|
543 |
task_ids = list(analyzed_traces.keys())
|
@@ -545,14 +665,14 @@ with gr.Blocks() as demo:
|
|
545 |
return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
|
546 |
|
547 |
def update_raw_step_dropdown(agent_name, task_id):
|
548 |
-
analyzed_traces = get_analyzed_traces(agent_name, "
|
549 |
if not analyzed_traces or task_id not in analyzed_traces:
|
550 |
return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
|
551 |
steps = analyzed_traces[task_id]['steps']
|
552 |
return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
|
553 |
|
554 |
def update_raw_call_details(agent_name, task_id, step_index):
|
555 |
-
analyzed_traces = get_analyzed_traces(agent_name, "
|
556 |
if not analyzed_traces or task_id not in analyzed_traces:
|
557 |
return "No data available for this selection."
|
558 |
steps = analyzed_traces[task_id]['steps']
|
@@ -563,7 +683,7 @@ with gr.Blocks() as demo:
|
|
563 |
|
564 |
# Initialize the raw agent dropdown with all agents
|
565 |
demo.load(update_agent_dropdown,
|
566 |
-
inputs=[gr.Textbox(value="
|
567 |
outputs=[raw_agent_dropdown])
|
568 |
demo.load(update_raw_task_dropdown,
|
569 |
inputs=[raw_agent_dropdown],
|
@@ -581,6 +701,7 @@ with gr.Blocks() as demo:
|
|
581 |
raw_step_dropdown.change(update_raw_call_details,
|
582 |
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
583 |
outputs=[raw_call_details])
|
|
|
584 |
|
585 |
with gr.Tab("About"):
|
586 |
gr.Markdown((Path(__file__).parent / "about.md").read_text())
|
|
|
343 |
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
344 |
outputs=[raw_call_details])
|
345 |
|
346 |
+
|
347 |
+
with gr.Tab("SWE-Bench Verified"):
|
348 |
+
with gr.Row():
|
349 |
+
with gr.Column(scale=2):
|
350 |
+
Leaderboard(
|
351 |
+
value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'),
|
352 |
+
select_columns=SelectColumns(
|
353 |
+
default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
|
354 |
+
cant_deselect=["Agent Name"],
|
355 |
+
label="Select Columns to Display:",
|
356 |
+
),
|
357 |
+
search_columns=config.SWEBENCH_SEARCH_COLUMNS,
|
358 |
+
column_widths={"Agent Name": 40,
|
359 |
+
"Accuracy": 20,
|
360 |
+
"Total Cost": 20},
|
361 |
+
)
|
362 |
+
with gr.Row():
|
363 |
+
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
364 |
+
|
365 |
+
gr.Markdown("# Failure Report")
|
366 |
+
with gr.Row():
|
367 |
+
with gr.Column(scale=1):
|
368 |
+
failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
369 |
+
with gr.Row():
|
370 |
+
with gr.Column(scale=1):
|
371 |
+
failure_categories_overview = gr.Markdown()
|
372 |
+
|
373 |
+
with gr.Column(scale=1):
|
374 |
+
failure_categories_chart = gr.Plot()
|
375 |
+
|
376 |
+
# Initialize the failure report agent dropdown with all agents
|
377 |
+
demo.load(update_agent_dropdown,
|
378 |
+
inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
|
379 |
+
outputs=[failure_report_agent_dropdown])
|
380 |
+
|
381 |
+
# Update failure report when agent is selected
|
382 |
+
failure_report_agent_dropdown.change(update_failure_report,
|
383 |
+
inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
|
384 |
+
outputs=[failure_categories_overview, failure_categories_chart])
|
385 |
+
|
386 |
+
gr.Markdown("# Agent Monitor")
|
387 |
+
with gr.Row():
|
388 |
+
with gr.Column(scale=1):
|
389 |
+
agent_dropdown = gr.Dropdown(label="Select Agent")
|
390 |
+
with gr.Column(scale=1):
|
391 |
+
task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
|
392 |
+
with gr.Row():
|
393 |
+
task_overview = gr.Markdown()
|
394 |
+
with gr.Row():
|
395 |
+
flow_chart = gr.Plot(label="Task Flow")
|
396 |
+
|
397 |
+
# Initialize the agent dropdown with the best agent
|
398 |
+
demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
|
399 |
+
demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
|
400 |
+
|
401 |
+
agent_dropdown.change(update_task_analysis,
|
402 |
+
inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
|
403 |
+
outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
|
404 |
+
task_dropdown.change(update_task_details,
|
405 |
+
inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
|
406 |
+
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
407 |
+
|
408 |
+
gr.Markdown("# Raw Predictions")
|
409 |
+
with gr.Row():
|
410 |
+
with gr.Column(scale=1):
|
411 |
+
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
412 |
+
with gr.Column(scale=1):
|
413 |
+
raw_task_dropdown = gr.Dropdown(label="Select Task")
|
414 |
+
with gr.Column(scale=1):
|
415 |
+
raw_step_dropdown = gr.Dropdown(label="Select Step")
|
416 |
+
|
417 |
+
with gr.Row():
|
418 |
+
raw_call_details = gr.HTML()
|
419 |
+
|
420 |
+
def update_raw_task_dropdown(agent_name):
    """Refresh the raw-prediction task and step dropdowns for ``agent_name``.

    Loads the agent's analyzed traces for the ``swebench_verified``
    benchmark and preselects the first task and its first step.

    Returns:
        A 3-tuple ``(task dropdown, step dropdown, call details)``. When no
        traces exist for the agent, both dropdowns are empty and the details
        slot carries an explanatory message.
    """
    analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
    if not analyzed_traces:
        return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."

    task_ids = list(analyzed_traces.keys())
    first_task = task_ids[0]
    task_update = gr.Dropdown(choices=task_ids, label="Select Task", value=first_task)

    steps = analyzed_traces[first_task]['steps']
    if not steps:
        # A task without recorded steps has nothing to preselect: offering
        # value=0 with no choices (and indexing steps[0] downstream) would fail.
        return task_update, gr.Dropdown(choices=[], label="Select Step"), "No data available for this selection."

    step_choices = [(f"Step {i+1}", i) for i in range(len(steps))]
    return task_update, gr.Dropdown(choices=step_choices, label="Select Step", value=0), update_raw_call_details(agent_name, first_task, 0)
|
427 |
+
|
428 |
+
def update_raw_step_dropdown(agent_name, task_id):
    """Refresh the step dropdown and call details after a task is selected.

    Looks up ``task_id`` in the agent's analyzed ``swebench_verified`` traces
    and preselects the first step.

    Returns:
        A 2-tuple ``(step dropdown, call details)``. Every branch returns both
        values because the handler is bound to two output components.
    """
    analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
    if not analyzed_traces or task_id not in analyzed_traces:
        # Put the message in the details slot instead of using it as a
        # dropdown value that is not among the (empty) choices.
        return gr.Dropdown(choices=[], label="Select Step"), "No data available."

    steps = analyzed_traces[task_id]['steps']
    if not steps:
        # Nothing to preselect; steps[0] below would raise IndexError.
        return gr.Dropdown(choices=[], label="Select Step"), "No data available."

    step_choices = [(f"Step {i+1}", i) for i in range(len(steps))]
    return gr.Dropdown(choices=step_choices, label="Select Step", value=0), format_call_info(steps[0], 0)
|
434 |
+
|
435 |
+
def update_raw_call_details(agent_name, task_id, step_index):
    """Render the raw call details for one step of one task.

    Args:
        agent_name: Agent whose ``swebench_verified`` traces are loaded.
        task_id: Key of the task inside the analyzed traces.
        step_index: Index into the task's ``steps`` list, or ``None`` when no
            step is selected.

    Returns:
        The output of ``format_call_info`` for the selected step, or a short
        message when the selection cannot be resolved.
    """
    analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
    if not analyzed_traces or task_id not in analyzed_traces:
        return "No data available for this selection."
    steps = analyzed_traces[task_id]['steps']
    if step_index is None:
        return "Invalid step selection."
    try:
        step = steps[step_index]
    except IndexError:
        # The step dropdown can briefly hold an index from the previously
        # selected task; report it instead of raising inside the callback.
        return "Invalid step selection."
    return format_call_info(step, step_index)
|
444 |
+
|
445 |
+
# Initialize the raw agent dropdown with all agents
|
446 |
+
demo.load(update_agent_dropdown,
|
447 |
+
inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
|
448 |
+
outputs=[raw_agent_dropdown])
|
449 |
+
demo.load(update_raw_task_dropdown,
|
450 |
+
inputs=[raw_agent_dropdown],
|
451 |
+
outputs=[raw_task_dropdown, raw_step_dropdown])
|
452 |
+
demo.load(update_raw_call_details,
|
453 |
+
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
454 |
+
outputs=[raw_call_details])
|
455 |
+
|
456 |
+
raw_agent_dropdown.change(update_raw_task_dropdown,
|
457 |
+
inputs=[raw_agent_dropdown],
|
458 |
+
outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
|
459 |
+
raw_task_dropdown.change(update_raw_step_dropdown,
|
460 |
+
inputs=[raw_agent_dropdown, raw_task_dropdown],
|
461 |
+
outputs=[raw_step_dropdown, raw_call_details])
|
462 |
+
raw_step_dropdown.change(update_raw_call_details,
|
463 |
+
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
464 |
+
outputs=[raw_call_details])
|
465 |
+
|
466 |
with gr.Tab("SWE-Bench Lite"):
|
467 |
with gr.Row():
|
468 |
with gr.Column(scale=2):
|
|
|
582 |
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
583 |
outputs=[raw_call_details])
|
584 |
|
585 |
+
with gr.Tab("MLAgentBench"):
|
|
|
586 |
with gr.Row():
|
587 |
with gr.Column(scale=2):
|
588 |
Leaderboard(
|
589 |
+
value=parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'),
|
590 |
select_columns=SelectColumns(
|
591 |
+
default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS,
|
592 |
cant_deselect=["Agent Name"],
|
593 |
label="Select Columns to Display:",
|
594 |
),
|
595 |
+
search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
|
596 |
+
hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
|
597 |
column_widths={"Agent Name": 40,
|
598 |
+
"Overall Score": 20,
|
599 |
"Total Cost": 20},
|
600 |
)
|
601 |
with gr.Row():
|
602 |
+
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
|
603 |
+
|
604 |
gr.Markdown("# Failure Report")
|
605 |
with gr.Row():
|
606 |
with gr.Column(scale=1):
|
|
|
614 |
|
615 |
# Initialize the failure report agent dropdown with all agents
|
616 |
demo.load(update_agent_dropdown,
|
617 |
+
inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
|
618 |
outputs=[failure_report_agent_dropdown])
|
619 |
|
620 |
# Update failure report when agent is selected
|
621 |
failure_report_agent_dropdown.change(update_failure_report,
|
622 |
+
inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
|
623 |
outputs=[failure_categories_overview, failure_categories_chart])
|
624 |
|
625 |
gr.Markdown("# Agent Monitor")
|
|
|
634 |
flow_chart = gr.Plot(label="Task Flow")
|
635 |
|
636 |
# Initialize the agent dropdown with the best agent
|
637 |
+
demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], outputs=[agent_dropdown])
|
638 |
+
demo.load(update_task_analysis, inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
|
639 |
|
640 |
agent_dropdown.change(update_task_analysis,
|
641 |
+
inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown],
|
642 |
outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
|
643 |
task_dropdown.change(update_task_details,
|
644 |
+
inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
|
645 |
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
646 |
+
|
647 |
gr.Markdown("# Raw Predictions")
|
648 |
with gr.Row():
|
649 |
with gr.Column(scale=1):
|
|
|
657 |
raw_call_details = gr.HTML()
|
658 |
|
659 |
def update_raw_task_dropdown(agent_name):
|
660 |
+
analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
|
661 |
if not analyzed_traces:
|
662 |
return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
|
663 |
task_ids = list(analyzed_traces.keys())
|
|
|
665 |
return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
|
666 |
|
667 |
def update_raw_step_dropdown(agent_name, task_id):
    """Refresh the step dropdown and call details after a task is selected.

    Looks up ``task_id`` in the agent's analyzed ``mlagentbench`` traces and
    preselects the first step.

    Returns:
        A 2-tuple ``(step dropdown, call details)``. Every branch returns both
        values because the handler is bound to two output components.
    """
    analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
    if not analyzed_traces or task_id not in analyzed_traces:
        # Put the message in the details slot instead of using it as a
        # dropdown value that is not among the (empty) choices.
        return gr.Dropdown(choices=[], label="Select Step"), "No data available."

    steps = analyzed_traces[task_id]['steps']
    if not steps:
        # Nothing to preselect; steps[0] below would raise IndexError.
        return gr.Dropdown(choices=[], label="Select Step"), "No data available."

    step_choices = [(f"Step {i+1}", i) for i in range(len(steps))]
    return gr.Dropdown(choices=step_choices, label="Select Step", value=0), format_call_info(steps[0], 0)
|
673 |
|
674 |
def update_raw_call_details(agent_name, task_id, step_index):
|
675 |
+
analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
|
676 |
if not analyzed_traces or task_id not in analyzed_traces:
|
677 |
return "No data available for this selection."
|
678 |
steps = analyzed_traces[task_id]['steps']
|
|
|
683 |
|
684 |
# Initialize the raw agent dropdown with all agents
|
685 |
demo.load(update_agent_dropdown,
|
686 |
+
inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
|
687 |
outputs=[raw_agent_dropdown])
|
688 |
demo.load(update_raw_task_dropdown,
|
689 |
inputs=[raw_agent_dropdown],
|
|
|
701 |
raw_step_dropdown.change(update_raw_call_details,
|
702 |
inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
703 |
outputs=[raw_call_details])
|
704 |
+
|
705 |
|
706 |
with gr.Tab("About"):
|
707 |
gr.Markdown((Path(__file__).parent / "about.md").read_text())
|
config.py
CHANGED
@@ -11,7 +11,7 @@ SWEBENCH_ON_LOAD_COLUMNS = [
|
|
11 |
"Accuracy",
|
12 |
"Total Cost",
|
13 |
]
|
14 |
-
SWEBENCH_SEARCH_COLUMNS = ['Total Cost']
|
15 |
SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name"]
|
16 |
|
17 |
USACO_ON_LOAD_COLUMNS = [
|
@@ -19,9 +19,17 @@ USACO_ON_LOAD_COLUMNS = [
|
|
19 |
"Accuracy",
|
20 |
"Total Cost",
|
21 |
]
|
22 |
-
USACO_SEARCH_COLUMNS = ['Total Cost']
|
23 |
USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name"]
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
NUMERIC_INTERVALS = {
|
27 |
"?": pd.Interval(-1, 0, closed="right"),
|
|
|
11 |
"Accuracy",
|
12 |
"Total Cost",
|
13 |
]
|
14 |
+
SWEBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
|
15 |
SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name"]
|
16 |
|
17 |
USACO_ON_LOAD_COLUMNS = [
|
|
|
19 |
"Accuracy",
|
20 |
"Total Cost",
|
21 |
]
|
22 |
+
USACO_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
|
23 |
USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name"]
|
24 |
|
25 |
+
MLAGENTBENCH_ON_LOAD_COLUMNS = [
|
26 |
+
"Agent Name",
|
27 |
+
"Overall Score",
|
28 |
+
"Total Cost",
|
29 |
+
]
|
30 |
+
MLAGENTBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
|
31 |
+
MLAGENTBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Accuracy']
|
32 |
+
|
33 |
|
34 |
NUMERIC_INTERVALS = {
|
35 |
"?": pd.Interval(-1, 0, closed="right"),
|
utils/db.py
CHANGED
@@ -46,6 +46,15 @@ class TracePreprocessor:
|
|
46 |
recall REAL,
|
47 |
f1_score REAL,
|
48 |
auc REAL,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
PRIMARY KEY (benchmark_name, agent_name)
|
50 |
)
|
51 |
''')
|
@@ -86,8 +95,8 @@ class TracePreprocessor:
|
|
86 |
with self.get_conn() as conn:
|
87 |
conn.execute('''
|
88 |
INSERT OR REPLACE INTO parsed_results
|
89 |
-
(benchmark_name, agent_name, date, total_cost, accuracy, precision, recall, f1_score, auc)
|
90 |
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
91 |
''', (
|
92 |
benchmark_name,
|
93 |
agent_name,
|
@@ -97,7 +106,16 @@ class TracePreprocessor:
|
|
97 |
results.get('precision'),
|
98 |
results.get('recall'),
|
99 |
results.get('f1_score'),
|
100 |
-
results.get('auc')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
))
|
102 |
except Exception as e:
|
103 |
print(f"Error preprocessing parsed results in {file}: {e}")
|
@@ -138,7 +156,7 @@ class TracePreprocessor:
|
|
138 |
df = pd.read_sql_query(query, conn, params=(benchmark_name,))
|
139 |
|
140 |
# Round float columns to 3 decimal places
|
141 |
-
float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc']
|
142 |
for column in float_columns:
|
143 |
if column in df.columns:
|
144 |
df[column] = df[column].round(3)
|
@@ -153,6 +171,15 @@ class TracePreprocessor:
|
|
153 |
'recall': 'Recall',
|
154 |
'f1_score': 'F1 Score',
|
155 |
'auc': 'AUC',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
})
|
157 |
|
158 |
return df
|
|
|
46 |
recall REAL,
|
47 |
f1_score REAL,
|
48 |
auc REAL,
|
49 |
+
overall_score REAL,
|
50 |
+
vectorization_score REAL,
|
51 |
+
fathomnet_score REAL,
|
52 |
+
feedback_score REAL,
|
53 |
+
house_price_score REAL,
|
54 |
+
spaceship_titanic_score REAL,
|
55 |
+
amp_parkinsons_disease_progression_prediction_score REAL,
|
56 |
+
cifar10_score REAL,
|
57 |
+
imdb_score REAL,
|
58 |
PRIMARY KEY (benchmark_name, agent_name)
|
59 |
)
|
60 |
''')
|
|
|
95 |
with self.get_conn() as conn:
|
96 |
conn.execute('''
|
97 |
INSERT OR REPLACE INTO parsed_results
|
98 |
+
(benchmark_name, agent_name, date, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score)
|
99 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
100 |
''', (
|
101 |
benchmark_name,
|
102 |
agent_name,
|
|
|
106 |
results.get('precision'),
|
107 |
results.get('recall'),
|
108 |
results.get('f1_score'),
|
109 |
+
results.get('auc'),
|
110 |
+
results.get('overall_score'),
|
111 |
+
results.get('vectorization_score'),
|
112 |
+
results.get('fathomnet_score'),
|
113 |
+
results.get('feedback_score'),
|
114 |
+
results.get('house-price_score'),
|
115 |
+
results.get('spaceship-titanic_score'),
|
116 |
+
results.get('amp-parkinsons-disease-progression-prediction_score'),
|
117 |
+
results.get('cifar10_score'),
|
118 |
+
results.get('imdb_score')
|
119 |
))
|
120 |
except Exception as e:
|
121 |
print(f"Error preprocessing parsed results in {file}: {e}")
|
|
|
156 |
df = pd.read_sql_query(query, conn, params=(benchmark_name,))
|
157 |
|
158 |
# Round float columns to 3 decimal places
|
159 |
+
# Names must match the underscore-separated SQL column names of
# parsed_results; hyphenated spellings never match df.columns, which leaves
# those scores unrounded.
float_columns = [
    'total_cost',
    'accuracy',
    'precision',
    'recall',
    'f1_score',
    'auc',
    'overall_score',
    'vectorization_score',
    'fathomnet_score',
    'feedback_score',
    'house_price_score',
    'spaceship_titanic_score',
    'amp_parkinsons_disease_progression_prediction_score',
    'cifar10_score',
    'imdb_score',
]
|
160 |
for column in float_columns:
|
161 |
if column in df.columns:
|
162 |
df[column] = df[column].round(3)
|
|
|
171 |
'recall': 'Recall',
|
172 |
'f1_score': 'F1 Score',
|
173 |
'auc': 'AUC',
|
174 |
+
'overall_score': 'Overall Score',
|
175 |
+
'vectorization_score': 'Vectorization Score',
|
176 |
+
'fathomnet_score': 'Fathomnet Score',
|
177 |
+
'feedback_score': 'Feedback Score',
|
178 |
+
'house_price_score': 'House Price Score',
|
179 |
+
'spaceship_titanic_score': 'Spaceship Titanic Score',
|
180 |
+
'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
|
181 |
+
'cifar10_score': 'CIFAR10 Score',
|
182 |
+
'imdb_score': 'IMDB Score'
|
183 |
})
|
184 |
|
185 |
return df
|