benediktstroebl committed
Commit 221fb8a · 1 Parent(s): 766750f

refactoring and USACO as default front page

Files changed (2):
  1. app.py +19 -55
  2. utils/data.py +20 -0
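
A note on the commit title: Gradio shows the first gr.Tab declared inside gr.Tabs as the selected tab on page load, so "USACO as default front page" is achieved purely by moving the SWE-Bench block below the USACO block in app.py (see the last hunk of the diff below). A minimal, self-contained sketch of that behavior, with hypothetical tab bodies rather than this app's code:

import gradio as gr

with gr.Blocks() as demo:
    with gr.Tabs():
        # Declaration order decides the landing tab: the first gr.Tab
        # is the one selected by default when the page loads.
        with gr.Tab("USACO"):
            gr.Markdown("Rendered first: this is the front page.")
        with gr.Tab("SWE-Bench"):
            gr.Markdown("Still available, one click away.")

if __name__ == "__main__":
    demo.launch()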
app.py CHANGED

@@ -6,7 +6,7 @@ from pathlib import Path
 import pandas as pd
 import os
 import json
-from utils.data import parse_json_files
+from utils.data import parse_json_files, preprocess_traces
 from utils.viz import create_scatter_plot, create_flow_chart
 from utils.processing import check_and_process_uploads
 from huggingface_hub import snapshot_download
@@ -40,26 +40,6 @@ abs_path = Path(__file__).parent
 # Global variable to store preprocessed data
 preprocessed_traces = {}
 
-def preprocess_traces():
-    global preprocessed_traces
-    processed_dir = abs_path / "evals_live"
-    for file in processed_dir.glob('*.json'):
-        try:
-            with open(file, 'r') as f:
-                data = json.load(f)
-            agent_name = data['config']['agent_name']
-            benchmark_name = data['config']['benchmark_name']
-            if benchmark_name not in preprocessed_traces:
-                preprocessed_traces[benchmark_name] = {}
-
-            assert type(data['raw_logging_results']) == dict, f"Invalid format for raw_logging_results: {type(data['raw_logging_results'])}"
-            preprocessed_traces[benchmark_name][agent_name] = data['raw_logging_results']
-        except AssertionError as e:
-            preprocessed_traces[benchmark_name][agent_name] = None
-        except Exception as e:
-            print(f"Error preprocessing {file}: {e}")
-            preprocessed_traces[benchmark_name][agent_name] = None
-
 def get_analyzed_traces(agent_name, benchmark_name):
     return preprocessed_traces.get(benchmark_name, {}).get(agent_name)
 
@@ -194,45 +174,12 @@ def format_call_info(step, step_index):
     return formatted_info
 
 
-def update_call_details(task_id, call_index):
-    if task_id not in analyzed_traces or call_index is None:
-        return "Please select a task and step to view details."
-
-    calls = analyzed_traces[task_id]['steps']
-    if isinstance(call_index, tuple):
-        call_index = call_index[1]
-
-    if call_index < 0 or call_index >= len(calls):
-        return f"Invalid call index: {call_index}"
-
-    call = calls[call_index]
-    return format_call_info(call, call_index)
-
-
 with gr.Blocks() as demo:
     gr.Markdown("""
     # 🥇 Agent Leaderboard
     """)
 
     with gr.Tabs():
-        with gr.Tab("SWE-Bench"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "results_total_cost", "results_accuracy", "Cost (in USD)", "Accuracy", ["agent_name"]))
-                with gr.Column(scale=1):
-                    Leaderboard(
-                        value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
-                        select_columns=SelectColumns(
-                            default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
-                            cant_deselect=["agent_name"],
-                            label="Select Columns to Display:",
-                        ),
-                        search_columns=config.SWEBENCH_SEARCH_COLUMNS,
-                        column_widths={"agent_name": 40,
-                                       "results_accuracy": 20,
-                                       "results_total_cost": 20},
-                    )
         with gr.Tab("USACO"):
             with gr.Row():
                 with gr.Column(scale=1):
@@ -329,7 +276,24 @@ with gr.Blocks() as demo:
         raw_step_dropdown.change(update_raw_call_details,
                                  inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                                  outputs=[raw_call_details])
-
+
+        with gr.Tab("SWE-Bench"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "results_total_cost", "results_accuracy", "Cost (in USD)", "Accuracy", ["agent_name"]))
+                with gr.Column(scale=1):
+                    Leaderboard(
+                        value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
+                        select_columns=SelectColumns(
+                            default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
+                            cant_deselect=["agent_name"],
+                            label="Select Columns to Display:",
+                        ),
+                        search_columns=config.SWEBENCH_SEARCH_COLUMNS,
+                        column_widths={"agent_name": 40,
+                                       "results_accuracy": 20,
+                                       "results_total_cost": 20},
+                    )
 
     with gr.Tab("About"):
         gr.Markdown((Path(__file__).parent / "about.md").read_text())
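
A note on this file: app.py keeps its module-level `preprocessed_traces = {}`, while the function that fills a dict of that name now lives in utils.data, where its `global` statement binds to that module's namespace. Unless app.py reads the dict through the utils.data module, `get_analyzed_traces` will keep consulting its own, never-populated copy. A sketch of one way to wire it up; this is hypothetical usage, not the commit's code, and it assumes utils/data.py initializes `preprocessed_traces = {}` at module level, which this commit does not yet add (see the note after the utils/data.py diff):

# app.py (sketch): defer to the module that owns the state instead of
# shadowing it with a second, always-empty dict of the same name.
from utils import data

data.preprocess_traces()  # fills utils.data.preprocessed_traces

def get_analyzed_traces(agent_name, benchmark_name):
    # The attribute lookup happens at call time, so this sees whatever
    # preprocess_traces() filled in.
    return data.preprocessed_traces.get(benchmark_name, {}).get(agent_name)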
utils/data.py CHANGED

@@ -6,6 +6,26 @@ from utils.pareto import Agent, compute_pareto_frontier
 import plotly.graph_objects as go
 import textwrap
 
+def preprocess_traces():
+    global preprocessed_traces
+    processed_dir = "evals_live"
+    for file in processed_dir.glob('*.json'):
+        try:
+            with open(file, 'r') as f:
+                data = json.load(f)
+            agent_name = data['config']['agent_name']
+            benchmark_name = data['config']['benchmark_name']
+            if benchmark_name not in preprocessed_traces:
+                preprocessed_traces[benchmark_name] = {}
+
+            assert type(data['raw_logging_results']) == dict, f"Invalid format for raw_logging_results: {type(data['raw_logging_results'])}"
+            preprocessed_traces[benchmark_name][agent_name] = data['raw_logging_results']
+        except AssertionError as e:
+            preprocessed_traces[benchmark_name][agent_name] = None
+        except Exception as e:
+            print(f"Error preprocessing {file}: {e}")
+            preprocessed_traces[benchmark_name][agent_name] = None
+
 def parse_json_files(folder_path, benchmark_name):
     # Convert folder path to Path object
     folder = Path(folder_path)
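
As committed, the moved function has three rough spots: `processed_dir` is now a bare string, which has no `.glob()` method (the `abs_path / "evals_live"` Path from app.py was dropped in the move); `json` and `Path` must be in scope in utils/data.py; and `preprocessed_traces` is never initialized in this module, so the first write raises NameError. A sketch of the same function with those patched up; the `processed_dir` parameter, the ValueError, and the guard in the error handler are my substitutions, not the commit's code:

# utils/data.py (sketch): the moved function with the issues above fixed.
# Assumes the same JSON layout ('config' and 'raw_logging_results' keys).
import json
from pathlib import Path

# The committed code relies on `global preprocessed_traces` but never
# creates the dict in this module; define it so writes have a target.
preprocessed_traces = {}

def preprocess_traces(processed_dir="evals_live"):
    # The commit assigns a bare string; wrap it so .glob() exists.
    processed_dir = Path(processed_dir)
    for file in processed_dir.glob('*.json'):
        agent_name = benchmark_name = None
        try:
            with open(file, 'r') as f:
                data = json.load(f)
            agent_name = data['config']['agent_name']
            benchmark_name = data['config']['benchmark_name']
            # isinstance is the idiomatic form of `type(x) == dict`.
            if not isinstance(data['raw_logging_results'], dict):
                raise ValueError(f"Invalid format for raw_logging_results: {type(data['raw_logging_results'])}")
            preprocessed_traces.setdefault(benchmark_name, {})[agent_name] = data['raw_logging_results']
        except Exception as e:
            print(f"Error preprocessing {file}: {e}")
            # The committed handlers can themselves raise NameError when
            # json.load fails before agent_name/benchmark_name are bound.
            if agent_name is not None and benchmark_name is not None:
                preprocessed_traces.setdefault(benchmark_name, {})[agent_name] = None

Returning a fresh dict instead of mutating module state would also sidestep the cross-module `global` issue noted under the app.py diff.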