Commit 221fb8a
benediktstroebl committed
Parent(s): 766750f

refactoring and USACO as default front page

Files changed:
- app.py +19 -55
- utils/data.py +20 -0
app.py
CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
 import pandas as pd
 import os
 import json
-from utils.data import parse_json_files
+from utils.data import parse_json_files, preprocess_traces
 from utils.viz import create_scatter_plot, create_flow_chart
 from utils.processing import check_and_process_uploads
 from huggingface_hub import snapshot_download
@@ -40,26 +40,6 @@ abs_path = Path(__file__).parent
 # Global variable to store preprocessed data
 preprocessed_traces = {}
 
-def preprocess_traces():
-    global preprocessed_traces
-    processed_dir = abs_path / "evals_live"
-    for file in processed_dir.glob('*.json'):
-        try:
-            with open(file, 'r') as f:
-                data = json.load(f)
-            agent_name = data['config']['agent_name']
-            benchmark_name = data['config']['benchmark_name']
-            if benchmark_name not in preprocessed_traces:
-                preprocessed_traces[benchmark_name] = {}
-
-            assert type(data['raw_logging_results']) == dict, f"Invalid format for raw_logging_results: {type(data['raw_logging_results'])}"
-            preprocessed_traces[benchmark_name][agent_name] = data['raw_logging_results']
-        except AssertionError as e:
-            preprocessed_traces[benchmark_name][agent_name] = None
-        except Exception as e:
-            print(f"Error preprocessing {file}: {e}")
-            preprocessed_traces[benchmark_name][agent_name] = None
-
 def get_analyzed_traces(agent_name, benchmark_name):
     return preprocessed_traces.get(benchmark_name, {}).get(agent_name)
 
@@ -194,45 +174,12 @@ def format_call_info(step, step_index):
     return formatted_info
 
 
-def update_call_details(task_id, call_index):
-    if task_id not in analyzed_traces or call_index is None:
-        return "Please select a task and step to view details."
-
-    calls = analyzed_traces[task_id]['steps']
-    if isinstance(call_index, tuple):
-        call_index = call_index[1]
-
-    if call_index < 0 or call_index >= len(calls):
-        return f"Invalid call index: {call_index}"
-
-    call = calls[call_index]
-    return format_call_info(call, call_index)
-
-
-
 with gr.Blocks() as demo:
     gr.Markdown("""
     # 🥇 Agent Leaderboard
     """)
 
     with gr.Tabs():
-        with gr.Tab("SWE-Bench"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "results_total_cost", "results_accuracy", "Cost (in USD)", "Accuracy", ["agent_name"]))
-                with gr.Column(scale=1):
-                    Leaderboard(
-                        value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
-                        select_columns=SelectColumns(
-                            default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
-                            cant_deselect=["agent_name"],
-                            label="Select Columns to Display:",
-                        ),
-                        search_columns=config.SWEBENCH_SEARCH_COLUMNS,
-                        column_widths={"agent_name": 40,
-                                       "results_accuracy": 20,
-                                       "results_total_cost": 20},
-                    )
         with gr.Tab("USACO"):
             with gr.Row():
                 with gr.Column(scale=1):
@@ -329,7 +276,24 @@ with gr.Blocks() as demo:
             raw_step_dropdown.change(update_raw_call_details,
                                      inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                                      outputs=[raw_call_details])
-
+
+        with gr.Tab("SWE-Bench"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "results_total_cost", "results_accuracy", "Cost (in USD)", "Accuracy", ["agent_name"]))
+                with gr.Column(scale=1):
+                    Leaderboard(
+                        value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
+                        select_columns=SelectColumns(
+                            default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
+                            cant_deselect=["agent_name"],
+                            label="Select Columns to Display:",
+                        ),
+                        search_columns=config.SWEBENCH_SEARCH_COLUMNS,
+                        column_widths={"agent_name": 40,
+                                       "results_accuracy": 20,
+                                       "results_total_cost": 20},
+                    )
 
     with gr.Tab("About"):
        gr.Markdown((Path(__file__).parent / "about.md").read_text())
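Two notes on the app.py changes. First, a gr.Tabs container in Gradio shows its first child tab by default, so moving the SWE-Bench tab after USACO is what makes USACO the default front page named in the commit message. Second, the hunks leave the call site of the newly imported preprocess_traces out of view, and app.py still defines its own module-level preprocessed_traces = {} while the relocated function mutates a global inside utils/data.py; Python globals are per-module, so those are two different dicts. A minimal sketch of wiring that would keep app.py's lookups current, assuming no other glue exists outside the hunks shown (the import alias is hypothetical):

# Sketch (assumption, not part of the commit): app.py needs to read the cache
# that utils/data.py's preprocess_traces() fills; rebinding app.py's name to
# that module's dict keeps get_analyzed_traces() lookups current.
import utils.data as data

data.preprocess_traces()                        # populates utils.data.preprocessed_traces
preprocessed_traces = data.preprocessed_traces  # rebind app.py's name to the same dict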
utils/data.py
CHANGED
@@ -6,6 +6,26 @@ from utils.pareto import Agent, compute_pareto_frontier
 import plotly.graph_objects as go
 import textwrap
 
+def preprocess_traces():
+    global preprocessed_traces
+    processed_dir = "evals_live"
+    for file in processed_dir.glob('*.json'):
+        try:
+            with open(file, 'r') as f:
+                data = json.load(f)
+            agent_name = data['config']['agent_name']
+            benchmark_name = data['config']['benchmark_name']
+            if benchmark_name not in preprocessed_traces:
+                preprocessed_traces[benchmark_name] = {}
+
+            assert type(data['raw_logging_results']) == dict, f"Invalid format for raw_logging_results: {type(data['raw_logging_results'])}"
+            preprocessed_traces[benchmark_name][agent_name] = data['raw_logging_results']
+        except AssertionError as e:
+            preprocessed_traces[benchmark_name][agent_name] = None
+        except Exception as e:
+            print(f"Error preprocessing {file}: {e}")
+            preprocessed_traces[benchmark_name][agent_name] = None
+
 def parse_json_files(folder_path, benchmark_name):
     # Convert folder path to Path object
     folder = Path(folder_path)
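One caveat on the relocated helper, since the move also changed a line: processed_dir = "evals_live" is now a plain str, and str has no .glob(), so the loop would raise AttributeError on first use (the deleted app.py version built a Path via abs_path / "evals_live"). The hunk also does not show a json import or a module-level preprocessed_traces in utils/data.py, and agent_name/benchmark_name would be unbound in the except blocks whenever json.load itself fails. A corrected sketch under those assumptions:

# Sketch of the relocated helper with the str/Path mismatch fixed. Assumes
# json is imported and preprocessed_traces is defined at module level in
# utils/data.py (neither is visible in this hunk).
import json
from pathlib import Path

preprocessed_traces = {}  # assumed module-level cache

def preprocess_traces():
    processed_dir = Path("evals_live")  # Path, not str, so .glob() works
    for file in processed_dir.glob('*.json'):
        agent_name = benchmark_name = None  # bound even if json.load raises
        try:
            with open(file, 'r') as f:
                data = json.load(f)
            agent_name = data['config']['agent_name']
            benchmark_name = data['config']['benchmark_name']
            results = data['raw_logging_results']
            assert isinstance(results, dict), f"Invalid format for raw_logging_results: {type(results)}"
            preprocessed_traces.setdefault(benchmark_name, {})[agent_name] = results
        except AssertionError:
            preprocessed_traces.setdefault(benchmark_name, {})[agent_name] = None
        except Exception as e:
            print(f"Error preprocessing {file}: {e}")
            if benchmark_name is not None:
                preprocessed_traces.setdefault(benchmark_name, {})[agent_name] = None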