benediktstroebl committed
Commit 22fef14 · 1 Parent(s): 9f9bed8

Added MLAgentBench

Files changed (3):
  1. app.py +140 -19
  2. config.py +10 -2
  3. utils/db.py +31 -4
app.py CHANGED
@@ -343,6 +343,126 @@ with gr.Blocks() as demo:
                                  inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                                  outputs=[raw_call_details])
 
+
+    with gr.Tab("SWE-Bench Verified"):
+        with gr.Row():
+            with gr.Column(scale=2):
+                Leaderboard(
+                    value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'),
+                    select_columns=SelectColumns(
+                        default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
+                        cant_deselect=["Agent Name"],
+                        label="Select Columns to Display:",
+                    ),
+                    search_columns=config.SWEBENCH_SEARCH_COLUMNS,
+                    column_widths={"Agent Name": 40,
+                                   "Accuracy": 20,
+                                   "Total Cost": 20},
+                )
+        with gr.Row():
+            scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
+
+        gr.Markdown("# Failure Report")
+        with gr.Row():
+            with gr.Column(scale=1):
+                failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+        with gr.Row():
+            with gr.Column(scale=1):
+                failure_categories_overview = gr.Markdown()
+
+            with gr.Column(scale=1):
+                failure_categories_chart = gr.Plot()
+
+        # Initialize the failure report agent dropdown with all agents
+        demo.load(update_agent_dropdown,
+                  inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                  outputs=[failure_report_agent_dropdown])
+
+        # Update failure report when agent is selected
+        failure_report_agent_dropdown.change(update_failure_report,
+                                             inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
+                                             outputs=[failure_categories_overview, failure_categories_chart])
+
+        gr.Markdown("# Agent Monitor")
+        with gr.Row():
+            with gr.Column(scale=1):
+                agent_dropdown = gr.Dropdown(label="Select Agent")
+            with gr.Column(scale=1):
+                task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
+        with gr.Row():
+            task_overview = gr.Markdown()
+        with gr.Row():
+            flow_chart = gr.Plot(label="Task Flow")
+
+        # Initialize the agent dropdown with the best agent
+        demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+        demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+        agent_dropdown.change(update_task_analysis,
+                              inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
+                              outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+        task_dropdown.change(update_task_details,
+                             inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
+                             outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+
+        gr.Markdown("# Raw Predictions")
+        with gr.Row():
+            with gr.Column(scale=1):
+                raw_agent_dropdown = gr.Dropdown(label="Select Agent")
+            with gr.Column(scale=1):
+                raw_task_dropdown = gr.Dropdown(label="Select Task")
+            with gr.Column(scale=1):
+                raw_step_dropdown = gr.Dropdown(label="Select Step")
+
+        with gr.Row():
+            raw_call_details = gr.HTML()
+
+        def update_raw_task_dropdown(agent_name):
+            analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
+            if not analyzed_traces:
+                return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
+            task_ids = list(analyzed_traces.keys())
+            steps = analyzed_traces[task_ids[0]]['steps']
+            return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
+
+        def update_raw_step_dropdown(agent_name, task_id):
+            analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
+            if not analyzed_traces or task_id not in analyzed_traces:
+                return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
+            steps = analyzed_traces[task_id]['steps']
+            return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
+
+        def update_raw_call_details(agent_name, task_id, step_index):
+            analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
+            if not analyzed_traces or task_id not in analyzed_traces:
+                return "No data available for this selection."
+            steps = analyzed_traces[task_id]['steps']
+            if step_index is None:
+                return "Invalid step selection."
+            step = steps[step_index]
+            return format_call_info(step, step_index)
+
+        # Initialize the raw agent dropdown with all agents
+        demo.load(update_agent_dropdown,
+                  inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                  outputs=[raw_agent_dropdown])
+        demo.load(update_raw_task_dropdown,
+                  inputs=[raw_agent_dropdown],
+                  outputs=[raw_task_dropdown, raw_step_dropdown])
+        demo.load(update_raw_call_details,
+                  inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                  outputs=[raw_call_details])
+
+        raw_agent_dropdown.change(update_raw_task_dropdown,
+                                  inputs=[raw_agent_dropdown],
+                                  outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
+        raw_task_dropdown.change(update_raw_step_dropdown,
+                                 inputs=[raw_agent_dropdown, raw_task_dropdown],
+                                 outputs=[raw_step_dropdown, raw_call_details])
+        raw_step_dropdown.change(update_raw_call_details,
+                                 inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                                 outputs=[raw_call_details])
+
     with gr.Tab("SWE-Bench Lite"):
         with gr.Row():
             with gr.Column(scale=2):
@@ -462,25 +582,25 @@ with gr.Blocks() as demo:
                                  inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                                  outputs=[raw_call_details])
 
-
-    with gr.Tab("SWE-Bench Verified"):
+    with gr.Tab("MLAgentBench"):
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
-                    value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'),
+                    value=parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'),
                     select_columns=SelectColumns(
-                        default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
+                        default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS,
                        cant_deselect=["Agent Name"],
                         label="Select Columns to Display:",
                     ),
-                    search_columns=config.SWEBENCH_SEARCH_COLUMNS,
+                    search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
+                    hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
                     column_widths={"Agent Name": 40,
-                                   "Accuracy": 20,
+                                   "Overall Score": 20,
                                    "Total Cost": 20},
                 )
         with gr.Row():
-            scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
-
+            scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
+
         gr.Markdown("# Failure Report")
         with gr.Row():
             with gr.Column(scale=1):
@@ -494,12 +614,12 @@ with gr.Blocks() as demo:
 
         # Initialize the failure report agent dropdown with all agents
         demo.load(update_agent_dropdown,
-                  inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                  inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
                   outputs=[failure_report_agent_dropdown])
 
         # Update failure report when agent is selected
         failure_report_agent_dropdown.change(update_failure_report,
-                                             inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
+                                             inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
                                              outputs=[failure_categories_overview, failure_categories_chart])
 
         gr.Markdown("# Agent Monitor")
@@ -514,16 +634,16 @@ with gr.Blocks() as demo:
             flow_chart = gr.Plot(label="Task Flow")
 
         # Initialize the agent dropdown with the best agent
-        demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
-        demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+        demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], outputs=[agent_dropdown])
+        demo.load(update_task_analysis, inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
 
         agent_dropdown.change(update_task_analysis,
-                              inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
+                              inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown],
                               outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
         task_dropdown.change(update_task_details,
-                             inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
+                             inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
                              outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
-
+
         gr.Markdown("# Raw Predictions")
         with gr.Row():
             with gr.Column(scale=1):
@@ -537,7 +657,7 @@ with gr.Blocks() as demo:
             raw_call_details = gr.HTML()
 
         def update_raw_task_dropdown(agent_name):
-            analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
+            analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
             if not analyzed_traces:
                 return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
             task_ids = list(analyzed_traces.keys())
@@ -545,14 +665,14 @@ with gr.Blocks() as demo:
             return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
 
         def update_raw_step_dropdown(agent_name, task_id):
-            analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
+            analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
             if not analyzed_traces or task_id not in analyzed_traces:
                 return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
             steps = analyzed_traces[task_id]['steps']
             return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
 
         def update_raw_call_details(agent_name, task_id, step_index):
-            analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
+            analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
             if not analyzed_traces or task_id not in analyzed_traces:
                 return "No data available for this selection."
             steps = analyzed_traces[task_id]['steps']
@@ -563,7 +683,7 @@ with gr.Blocks() as demo:
 
         # Initialize the raw agent dropdown with all agents
         demo.load(update_agent_dropdown,
-                  inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                  inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
                   outputs=[raw_agent_dropdown])
         demo.load(update_raw_task_dropdown,
                   inputs=[raw_agent_dropdown],
@@ -581,6 +701,7 @@ with gr.Blocks() as demo:
         raw_step_dropdown.change(update_raw_call_details,
                                  inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                                  outputs=[raw_call_details])
+
 
     with gr.Tab("About"):
         gr.Markdown((Path(__file__).parent / "about.md").read_text())
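Every benchmark tab in app.py follows the same scaffold, and the callbacks receive the benchmark name through an invisible gr.Textbox rather than a closure or functools.partial. A minimal, self-contained sketch of that pattern, with the handler body stubbed out (the real update_agent_dropdown queries the preprocessed results):

```python
# Minimal sketch of the hidden-Textbox pattern used throughout app.py:
# a constant benchmark name and metric ride along as invisible inputs,
# so one handler can serve every benchmark tab. The handler body here
# is a stub; names mirror app.py but the lookup logic is illustrative.
import gradio as gr

def update_agent_dropdown(benchmark_name, metric):
    # app.py looks up the best agent for (benchmark_name, metric) here.
    agents = [f"best-agent-for-{benchmark_name}"]
    return gr.Dropdown(choices=agents, value=agents[0], label="Select Agent")

with gr.Blocks() as demo:
    with gr.Tab("MLAgentBench"):
        agent_dropdown = gr.Dropdown(label="Select Agent")
        # The hidden Textboxes pin this tab to the 'mlagentbench' results.
        demo.load(update_agent_dropdown,
                  inputs=[gr.Textbox(value="mlagentbench", visible=False),
                          gr.Textbox(value="Overall Score", visible=False)],
                  outputs=[agent_dropdown])

if __name__ == "__main__":
    demo.launch()
```

One consequence of this design, visible in the diff above, is that the benchmark string must be repeated in every demo.load and .change call, which is why converting the old SWE-Bench Verified block into the MLAgentBench tab touches so many lines.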
config.py CHANGED
@@ -11,7 +11,7 @@ SWEBENCH_ON_LOAD_COLUMNS = [
     "Accuracy",
     "Total Cost",
 ]
-SWEBENCH_SEARCH_COLUMNS = ['Total Cost']
+SWEBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
 SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name"]
 
 USACO_ON_LOAD_COLUMNS = [
@@ -19,9 +19,17 @@ USACO_ON_LOAD_COLUMNS = [
     "Accuracy",
     "Total Cost",
 ]
-USACO_SEARCH_COLUMNS = ['Total Cost']
+USACO_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
 USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name"]
 
+MLAGENTBENCH_ON_LOAD_COLUMNS = [
+    "Agent Name",
+    "Overall Score",
+    "Total Cost",
+]
+MLAGENTBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
+MLAGENTBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Accuracy']
+
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
utils/db.py CHANGED
@@ -46,6 +46,15 @@ class TracePreprocessor:
                 recall REAL,
                 f1_score REAL,
                 auc REAL,
+                overall_score REAL,
+                vectorization_score REAL,
+                fathomnet_score REAL,
+                feedback_score REAL,
+                house_price_score REAL,
+                spaceship_titanic_score REAL,
+                amp_parkinsons_disease_progression_prediction_score REAL,
+                cifar10_score REAL,
+                imdb_score REAL,
                 PRIMARY KEY (benchmark_name, agent_name)
             )
         ''')
@@ -86,8 +95,8 @@ class TracePreprocessor:
             with self.get_conn() as conn:
                 conn.execute('''
                     INSERT OR REPLACE INTO parsed_results
-                    (benchmark_name, agent_name, date, total_cost, accuracy, precision, recall, f1_score, auc)
-                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    (benchmark_name, agent_name, date, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                 ''', (
                     benchmark_name,
                     agent_name,
@@ -97,7 +106,16 @@ class TracePreprocessor:
                     results.get('precision'),
                     results.get('recall'),
                     results.get('f1_score'),
-                    results.get('auc')
+                    results.get('auc'),
+                    results.get('overall_score'),
+                    results.get('vectorization_score'),
+                    results.get('fathomnet_score'),
+                    results.get('feedback_score'),
+                    results.get('house-price_score'),
+                    results.get('spaceship-titanic_score'),
+                    results.get('amp-parkinsons-disease-progression-prediction_score'),
+                    results.get('cifar10_score'),
+                    results.get('imdb_score')
                 ))
         except Exception as e:
             print(f"Error preprocessing parsed results in {file}: {e}")
@@ -138,7 +156,7 @@ class TracePreprocessor:
             df = pd.read_sql_query(query, conn, params=(benchmark_name,))
 
         # Round float columns to 3 decimal places
-        float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc']
+        float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score', 'house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score', 'cifar10_score', 'imdb_score']
         for column in float_columns:
             if column in df.columns:
                 df[column] = df[column].round(3)
@@ -153,6 +171,15 @@ class TracePreprocessor:
             'recall': 'Recall',
             'f1_score': 'F1 Score',
             'auc': 'AUC',
+            'overall_score': 'Overall Score',
+            'vectorization_score': 'Vectorization Score',
+            'fathomnet_score': 'Fathomnet Score',
+            'feedback_score': 'Feedback Score',
+            'house_price_score': 'House Price Score',
+            'spaceship_titanic_score': 'Spaceship Titanic Score',
+            'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
+            'cifar10_score': 'CIFAR10 Score',
+            'imdb_score': 'IMDB Score'
         })
 
         return df
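Note the naming split the new columns introduce: the JSON result keys for three MLAgentBench tasks are hyphenated ('house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score'), while the SQLite columns use underscores. pd.read_sql_query returns the underscored column names, so the hyphenated entries appended to float_columns never match a DataFrame column and those scores pass through unrounded; only the underscored names are picked up by the rename map. A minimal sketch of the round trip, using a throwaway in-memory table:

```python
# Sketch of the column-name round trip described above, with a
# deliberately small stand-in for the parsed_results schema.
import sqlite3
import pandas as pd

conn = sqlite3.connect(":memory:")
conn.execute("""CREATE TABLE parsed_results
                (benchmark_name TEXT, agent_name TEXT,
                 overall_score REAL, house_price_score REAL)""")
conn.execute("INSERT INTO parsed_results VALUES ('mlagentbench', 'baseline', 0.41237, 0.73791)")

df = pd.read_sql_query("SELECT * FROM parsed_results WHERE benchmark_name = ?",
                       conn, params=("mlagentbench",))
print(list(df.columns))  # [..., 'overall_score', 'house_price_score'] -- underscores

for column in ['overall_score', 'house-price_score']:
    if column in df.columns:  # 'house-price_score' never matches, so no rounding happens
        df[column] = df[column].round(3)

df = df.rename(columns={'overall_score': 'Overall Score',
                        'house_price_score': 'House Price Score'})
```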