benediktstroebl committed
Commit 5a7e21a · 1 Parent(s): caec940

added failure report and two new swebench variants

app.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
 import os
 import json
 from utils.data import parse_json_files
-from utils.viz import create_scatter_plot, create_flow_chart
+from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart
 from utils.processing import check_and_process_uploads
 from huggingface_hub import snapshot_download
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -21,7 +21,6 @@ import weave
 
 
 from datetime import datetime
-weave.init(f'leaderboard_testing_{datetime.now().strftime("%Y%m%d%H%M%S")}')
 
 abs_path = Path(__file__).parent
 
@@ -43,29 +42,46 @@ def download_latest_results():
 
 # Global variable to store preprocessed data
 preprocessed_traces = {}
+failure_reports = {}
 def preprocess_traces():
     global preprocessed_traces
+    global failure_reports
     processed_dir = Path("evals_live")
     for file in processed_dir.glob('*.json'):
-        try:
-            with open(file, 'r') as f:
+        with open(file, 'r') as f:
             data = json.load(f)
             agent_name = data['config']['agent_name']
             benchmark_name = data['config']['benchmark_name']
             if benchmark_name not in preprocessed_traces:
                 preprocessed_traces[benchmark_name] = {}
+            if benchmark_name not in failure_reports:
+                failure_reports[benchmark_name] = {}
 
-            assert type(data['raw_logging_results']) == dict, f"Invalid format for raw_logging_results: {type(data['raw_logging_results'])}"
-            preprocessed_traces[benchmark_name][agent_name] = data['raw_logging_results']
+            try:
+                assert type(data['raw_logging_results']) == dict, f"Invalid format for raw_logging_results: {type(data['raw_logging_results'])}"
+                preprocessed_traces[benchmark_name][agent_name] = data['raw_logging_results']
             except AssertionError as e:
                 preprocessed_traces[benchmark_name][agent_name] = None
             except Exception as e:
                 print(f"Error preprocessing {file}: {e}")
                 preprocessed_traces[benchmark_name][agent_name] = None
 
+            try:
+                assert type(data['failure_report']) == dict, f"Invalid format for failure_report: {type(data['failure_report'])}"
+                failure_reports[benchmark_name][agent_name] = data['failure_report']
+            except AssertionError as e:
+                failure_reports[benchmark_name][agent_name] = None
+            except Exception as e:
+                print(f"Error preprocessing {file}: {e}")
+                failure_reports[benchmark_name][agent_name] = None
+
+
 def get_analyzed_traces(agent_name, benchmark_name):
     return preprocessed_traces.get(benchmark_name, {}).get(agent_name)
 
+def get_failure_report(agent_name, benchmark_name):
+    return failure_reports.get(benchmark_name, {}).get(agent_name)
+
 def update_agent_dropdown(benchmark_name, metric):
     df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
     agents = df['Agent Name'].tolist()
@@ -200,6 +216,33 @@ def format_call_info(step, step_index):
     return formatted_info
 
 
+def update_failure_report(agent_name):
+    failure_report = get_failure_report(agent_name, "swebench_lite")
+    if not failure_report:
+        return "No failure report available for this agent.", None
+
+    # Create overview of failure categories
+    categories_overview = "## Failure Categories Overview\n\n"
+    for category in failure_report['failure_categories']:
+        categories_overview += f"### {category['category_name']}\n"
+        categories_overview += f"{category['description']}\n\n"
+
+    # Count tasks affected by each category
+    category_counts = {}
+    for task, classification in failure_report['task_classifications'].items():
+        category_id = classification['category_id']
+        category_counts[category_id] = category_counts.get(category_id, 0) + 1
+
+    # Prepare data for bar chart
+    categories = [cat['category_name'] for cat in failure_report['failure_categories']]
+    counts = [category_counts.get(str(i+1), 0) for i in range(len(categories))]
+
+    # Create bar chart
+    chart = create_bar_chart(categories, counts, "Failure Categories", "Number of Affected Tasks", "Failure Categories Distribution")
+
+    return categories_overview, chart
+
+
 with gr.Blocks() as demo:
     gr.Markdown("""
     # 🥇 Agent Leaderboard
@@ -223,7 +266,7 @@ with gr.Blocks() as demo:
                )
        with gr.Row():
            scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
-        gr.Markdown("## Agent Monitor")
+        gr.Markdown("# Agent Monitor")
        with gr.Row():
            with gr.Column(scale=1):
                agent_dropdown = gr.Dropdown(label="Select Agent")
@@ -245,7 +288,7 @@ with gr.Blocks() as demo:
                             inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown, task_dropdown],
                             outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
 
-        gr.Markdown("## Raw Predictions")
+        gr.Markdown("# Raw Predictions")
        with gr.Row():
            with gr.Column(scale=1):
                raw_agent_dropdown = gr.Dropdown(label="Select Agent")
@@ -303,23 +346,200 @@ with gr.Blocks() as demo:
                                 inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                                 outputs=[raw_call_details])
 
-    with gr.Tab("SWE-Bench"):
+    with gr.Tab("SWE-Bench Lite"):
        with gr.Row():
            with gr.Column(scale=2):
                Leaderboard(
                    value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
                    select_columns=SelectColumns(
-                        default_selection=config.USACO_ON_LOAD_COLUMNS,
+                        default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
                        cant_deselect=["Agent Name"],
                        label="Select Columns to Display:",
                    ),
-                    search_columns=config.USACO_SEARCH_COLUMNS,
+                    search_columns=config.SWEBENCH_SEARCH_COLUMNS,
                    column_widths={"Agent Name": 40,
                                   "Accuracy": 20,
                                   "Total Cost": 20},
                )
        with gr.Row():
            scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
+
+        gr.Markdown("# Failure Report")
+        with gr.Row():
+            with gr.Column(scale=1):
+                failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+        with gr.Row():
+            with gr.Column(scale=1):
+                failure_categories_overview = gr.Markdown()
+
+            with gr.Column(scale=1):
+                failure_categories_chart = gr.Plot()
+
+        # Initialize the failure report agent dropdown with all agents
+        demo.load(update_agent_dropdown,
+                  inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                  outputs=[failure_report_agent_dropdown])
+
+        # Update failure report when agent is selected
+        failure_report_agent_dropdown.change(update_failure_report,
+                                             inputs=[failure_report_agent_dropdown],
+                                             outputs=[failure_categories_overview, failure_categories_chart])
+
+        gr.Markdown("# Raw Predictions")
+        with gr.Row():
+            with gr.Column(scale=1):
+                raw_agent_dropdown = gr.Dropdown(label="Select Agent")
+            with gr.Column(scale=1):
+                raw_task_dropdown = gr.Dropdown(label="Select Task")
+            with gr.Column(scale=1):
+                raw_step_dropdown = gr.Dropdown(label="Select Step")
+
+        with gr.Row():
+            raw_call_details = gr.HTML()
+
+        def update_raw_task_dropdown(agent_name):
+            analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
+            if not analyzed_traces:
+                return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
+            task_ids = list(analyzed_traces.keys())
+            steps = analyzed_traces[task_ids[0]]['steps']
+            return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
+
+        def update_raw_step_dropdown(agent_name, task_id):
+            analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
+            if not analyzed_traces or task_id not in analyzed_traces:
+                return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
+            steps = analyzed_traces[task_id]['steps']
+            return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
+
+        def update_raw_call_details(agent_name, task_id, step_index):
+            analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
+            if not analyzed_traces or task_id not in analyzed_traces:
+                return "No data available for this selection."
+            steps = analyzed_traces[task_id]['steps']
+            if step_index is None:
+                return "Invalid step selection."
+            step = steps[step_index]
+            return format_call_info(step, step_index)
+
+        # Initialize the raw agent dropdown with all agents
+        demo.load(update_agent_dropdown,
+                  inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                  outputs=[raw_agent_dropdown])
+        demo.load(update_raw_task_dropdown,
+                  inputs=[raw_agent_dropdown],
+                  outputs=[raw_task_dropdown, raw_step_dropdown])
+        demo.load(update_raw_call_details,
+                  inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                  outputs=[raw_call_details])
+
+        raw_agent_dropdown.change(update_raw_task_dropdown,
+                                  inputs=[raw_agent_dropdown],
+                                  outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
+        raw_task_dropdown.change(update_raw_step_dropdown,
+                                 inputs=[raw_agent_dropdown, raw_task_dropdown],
+                                 outputs=[raw_step_dropdown, raw_call_details])
+        raw_step_dropdown.change(update_raw_call_details,
+                                 inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                                 outputs=[raw_call_details])
+
+
+    with gr.Tab("SWE-Bench Verified"):
+        with gr.Row():
+            with gr.Column(scale=2):
+                Leaderboard(
+                    value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'),
+                    select_columns=SelectColumns(
+                        default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
+                        cant_deselect=["Agent Name"],
+                        label="Select Columns to Display:",
+                    ),
+                    search_columns=config.SWEBENCH_SEARCH_COLUMNS,
+                    column_widths={"Agent Name": 40,
+                                   "Accuracy": 20,
+                                   "Total Cost": 20},
+                )
+        with gr.Row():
+            scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
+
+        gr.Markdown("# Failure Report")
+        with gr.Row():
+            with gr.Column(scale=1):
+                failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+        with gr.Row():
+            with gr.Column(scale=1):
+                failure_categories_overview = gr.Markdown()
+
+            with gr.Column(scale=1):
+                failure_categories_chart = gr.Plot()
+
+        # Initialize the failure report agent dropdown with all agents
+        demo.load(update_agent_dropdown,
+                  inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                  outputs=[failure_report_agent_dropdown])
+
+        # Update failure report when agent is selected
+        failure_report_agent_dropdown.change(update_failure_report,
+                                             inputs=[failure_report_agent_dropdown],
+                                             outputs=[failure_categories_overview, failure_categories_chart])
+
+        gr.Markdown("# Raw Predictions")
+        with gr.Row():
+            with gr.Column(scale=1):
+                raw_agent_dropdown = gr.Dropdown(label="Select Agent")
+            with gr.Column(scale=1):
+                raw_task_dropdown = gr.Dropdown(label="Select Task")
+            with gr.Column(scale=1):
+                raw_step_dropdown = gr.Dropdown(label="Select Step")
+
+        with gr.Row():
+            raw_call_details = gr.HTML()
+
+        def update_raw_task_dropdown(agent_name):
+            analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
+            if not analyzed_traces:
+                return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
+            task_ids = list(analyzed_traces.keys())
+            steps = analyzed_traces[task_ids[0]]['steps']
+            return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
+
+        def update_raw_step_dropdown(agent_name, task_id):
+            analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
+            if not analyzed_traces or task_id not in analyzed_traces:
+                return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
+            steps = analyzed_traces[task_id]['steps']
+            return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
+
+        def update_raw_call_details(agent_name, task_id, step_index):
+            analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
+            if not analyzed_traces or task_id not in analyzed_traces:
+                return "No data available for this selection."
+            steps = analyzed_traces[task_id]['steps']
+            if step_index is None:
+                return "Invalid step selection."
+            step = steps[step_index]
+            return format_call_info(step, step_index)
+
+        # Initialize the raw agent dropdown with all agents
+        demo.load(update_agent_dropdown,
+                  inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                  outputs=[raw_agent_dropdown])
+        demo.load(update_raw_task_dropdown,
+                  inputs=[raw_agent_dropdown],
+                  outputs=[raw_task_dropdown, raw_step_dropdown])
+        demo.load(update_raw_call_details,
+                  inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                  outputs=[raw_call_details])
+
+        raw_agent_dropdown.change(update_raw_task_dropdown,
+                                  inputs=[raw_agent_dropdown],
+                                  outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
+        raw_task_dropdown.change(update_raw_step_dropdown,
+                                 inputs=[raw_agent_dropdown, raw_task_dropdown],
+                                 outputs=[raw_step_dropdown, raw_call_details])
+        raw_step_dropdown.change(update_raw_call_details,
+                                 inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                                 outputs=[raw_call_details])
 
     with gr.Tab("About"):
        gr.Markdown((Path(__file__).parent / "about.md").read_text())
@@ -332,8 +552,8 @@ async def main():
     # Preprocess traces
     preprocess_traces()
 
-    # Download the results from the Hugging Face Hub
-    await asyncio.to_thread(download_latest_results)
+    # # Download the results from the Hugging Face Hub
+    # await asyncio.to_thread(download_latest_results)
 
     # Check for new uploads and process them
     await check_and_process_uploads()
@@ -347,4 +567,5 @@ async def main():
     await demo.launch()
 
 if __name__ == "__main__":
+    weave.init(f'leaderboard_testing_{datetime.now().strftime("%Y%m%d%H%M%S")}')
     asyncio.run(main())
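
Note on the data contract: update_failure_report (added above) assumes a particular shape for the stored failure report. A minimal sketch of that payload, inferred purely from the accessors in this diff; the real schema is produced by agent_monitor.failure_report.analyze_agent_performance and may carry more fields, and the category names and task IDs below are illustrative only:

# Hypothetical failure_report payload; key names come from the code above,
# concrete values are made up for illustration.
failure_report = {
    "failure_categories": [
        {"category_name": "Incorrect patch location",
         "description": "The agent edits the wrong file or function."},
        {"category_name": "Test setup errors",
         "description": "The agent breaks the repository's test harness."},
    ],
    "task_classifications": {
        # One entry per failed task; category_id is a 1-based index stored as
        # a string, matching category_counts.get(str(i+1), 0) above.
        "astropy__astropy-12907": {"category_id": "1"},
        "django__django-11001": {"category_id": "2"},
    },
}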
evals_live/swebench_lite_example_agent_1722587866.json DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a6751248329f37cf663f523759211383585062cc698d613b648e948293783f3c
-size 8444

evals_live/swebench_lite_example_agent_17227906123.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2c176cd5a6d6fcb0fdf83b42a919391797665c2cc9226d14ffded5586033ef9
-size 10381
+oid sha256:2ccf570b28e70b4ce6beff1fccf70eaf7ea0ff52730c338ff9a721f95bfa2131
+size 21960

evals_live/swebench_lite_example_agent_1722790656.json DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:571a7158c57485e184e195214addc25e8c58a0b8191a1d280f323a24fd81eb54
-size 10381
utils/processing.py CHANGED
@@ -7,7 +7,9 @@ from email.mime.multipart import MIMEMultipart
 import asyncio
 import aiofiles
 import aiosmtplib
-from agent_monitor.monitor import analyze_agent_steps, AsyncOpenAIClient
+from agent_monitor.monitor import analyze_agent_steps
+from agent_monitor.failure_report import analyze_agent_performance, AsyncOpenAIClient
+import traceback
 
 async def check_and_process_uploads():
     upload_dir = "evals_upload"
@@ -31,11 +33,11 @@ async def check_and_process_uploads():
         if not os.path.exists(live_path) and not os.path.exists(processed_path):
             unprocessed_uploads.append(upload)
         elif os.path.exists(processed_path):
-            with open(upload_path, 'r') as f:
-                new_data = json.load(f)
+            # with open(upload_path, 'r') as f:
+            #     new_data = json.load(f)
 
-            with open(processed_path, 'r') as f:
-                processed_data = json.load(f)
+            # with open(processed_path, 'r') as f:
+            #     processed_data = json.load(f)
 
             # TODO we can use a better comparison method with exact comparison
             # if new_data != processed_data:
@@ -70,10 +72,10 @@ async def process_single_upload(upload_path, processed_path):
 
     if check_result['is_valid']:
         # Process the file
-        # await process_upload(upload_path, processed_path)
+        await process_upload(upload_path, processed_path)
 
         # Move the file to processed directory
-        await asyncio.to_thread(shutil.move, upload_path, processed_path)
+        # await asyncio.to_thread(shutil.move, upload_path, processed_path)
 
         # Send email notification
         # await send_email_notification(upload_path.name, check_result, "Processing successful")
@@ -121,14 +123,15 @@ async def process_upload(input_path, output_path):
     openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
 
     try:
-        processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client)
+        processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=False)
+        failure_report = await analyze_agent_performance(data['raw_logging_results'], data['results']['failed_tasks'], openai_client)
+        data['raw_logging_results'] = processed_calls
+        data['failure_report'] = failure_report
     except Exception as e:
+        traceback.print_exc()
        print(f"Error in processing: {str(e)}")
        return
 
-    # Save the processed data
-    data['raw_logging_results'] = processed_calls
-
     with open(output_path, 'w') as f:
        json.dump(data, f, indent=4)
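For context, process_upload now reads two things from each upload: the raw traces and the list of failed tasks. A minimal sketch of an upload file that would flow through the new code path, written as a Python dict; only the fields referenced in this commit are shown, the concrete values are illustrative, and real uploads may need additional fields to pass validation:

# Hypothetical minimal upload; fields mirror those accessed by process_upload
# (utils/processing.py) and preprocess_traces (app.py).
upload = {
    "config": {
        "agent_name": "example_agent",      # keys preprocessed_traces/failure_reports
        "benchmark_name": "swebench_lite",  # selects the leaderboard tab
    },
    "raw_logging_results": {},              # per-task logged calls; app.py asserts this is a dict
    "results": {
        "failed_tasks": ["task_id_1"],      # handed to analyze_agent_performance
    },
}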
utils/viz.py CHANGED
@@ -4,6 +4,58 @@ from utils.pareto import Agent, compute_pareto_frontier
 import plotly.graph_objects as go
 import textwrap
 
+def create_bar_chart(categories, values, x_label, y_label, title):
+    # Sort categories and values based on values in descending order
+    sorted_data = sorted(zip(categories, values), key=lambda x: x[1], reverse=True)
+    categories, values = zip(*sorted_data)
+
+    fig = go.Figure(data=[go.Bar(
+        y=categories,
+        x=values,
+        orientation='h',
+        marker_color='#1b9e77',  # Same color as the scatter plot
+        text=values,
+        textposition='auto',
+        textfont=dict(color='black', size=14, family='Arial', weight=2),
+        hovertemplate='<b>%{y}</b><br>' +
+                      'Affected Tasks: %{x}<br>'
+    )])
+
+    fig.update_layout(
+        height=600,
+        xaxis=dict(
+            showline=True,
+            linecolor='black',
+            showgrid=False
+        ),
+        yaxis=dict(
+            showline=True,
+            linecolor='black',
+            showgrid=False,
+            autorange="reversed"  # This will put the category with the highest value at the top
+        ),
+        plot_bgcolor='white',
+        paper_bgcolor='white',
+        bargap=0.2,
+        bargroupgap=0.1,
+        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
+        modebar=dict(
+            activecolor='#1f77b4',
+            orientation='h',
+            bgcolor='rgba(255,255,255,0.8)',
+            color='#777',
+            add=['pan2d'],
+            remove=[
+                'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
+                'hoverClosestCartesian', 'hoverCompareCartesian',
+                'toggleSpikelines', 'lasso2d', 'lasso', 'select2d', 'select'
+            ]
+        ),
+        dragmode='pan'
+    )
+
+    return fig
+
 def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
     agents = [Agent(row['Total Cost'], row['Accuracy']) for i, row in df.iterrows()]
     pareto_frontier = compute_pareto_frontier(agents)
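
A quick usage sketch for the new helper, mirroring the call site in app.py's update_failure_report; the category names and counts here are made up:

# Illustrative data only; in the app these come from the failure report.
categories = ["Incorrect patch location", "Test setup errors", "Timeouts"]
counts = [12, 7, 3]
fig = create_bar_chart(categories, counts,
                       "Failure Categories", "Number of Affected Tasks",
                       "Failure Categories Distribution")
fig.show()  # in the app, the figure is returned to a gr.Plot instead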