Merge commit 'f6d9f43457bdadd36685181efda2fd45e813a02c'
- 0_📊_OpenDevin_Benchmark.py +2 -1
- utils/swe_bench.py +14 -0
0_📊_OpenDevin_Benchmark.py CHANGED

@@ -46,7 +46,8 @@ swe_bench_results = swe_bench_results.drop(
 swe_bench_results = swe_bench_results[[
     'agent_name', 'note',
     'model_name',
-    'success_rate', 'n_solved', 'n_error', 'n_stuck_in_loop',
+    'success_rate', 'n_solved', 'n_error', 'n_stuck_in_loop',
+    'total', 'total_cost',
     'max_iterations', 'git_commit', 'start_time'
 ]]
 swe_bench_results = swe_bench_results.sort_values(by='success_rate', ascending=False)
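For context, the column selection above relies on pandas label-based selection: df[[...]] both filters and reorders columns, and it raises a KeyError if any listed column is missing, which is why the 'total' and 'total_cost' fields added in utils/swe_bench.py below must be present in the results frame. A minimal sketch with hypothetical data:

import pandas as pd

# Hypothetical one-row frame; the values are placeholders, not benchmark results.
df = pd.DataFrame({'success_rate': [0.1], 'total': [10], 'total_cost': [0.5]})

view = df[['success_rate', 'total', 'total_cost']]  # selects and reorders columns
# df[['success_rate', 'missing_col']]               # would raise KeyError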
utils/swe_bench.py CHANGED

@@ -110,6 +110,9 @@ def agg_stats(df):
             obs_lengths.append(len(obs['content']))
         obs_lengths = pd.Series(obs_lengths)
 
+        metrics = entry.get('metrics', {})
+        cost = metrics.get('accumulated_cost', None)
+
         d = {
             'idx': idx,
             'instance_id': entry['instance_id'],
@@ -119,6 +122,7 @@ def agg_stats(df):
             **test_result,
             'agent_stuck_in_loop': agent_stuck_in_loop,
             'contains_error': contains_error,
+            'cost': cost,
             'empty_generation': empty_generation,
             'apply_test_patch_success': apply_test_patch_success,
             'test_cmd_exit_error': test_cmd_exit_error,
@@ -139,6 +143,15 @@ def get_resolved_stats_from_filepath(filepath):
 def get_resolved_stats_from_filepath(filepath):
     df = load_df_from_selected_filepaths(filepath)
     stats = agg_stats(df)
+    if not len(stats):
+        return {
+            'success_rate': None,
+            'n_solved': None,
+            'n_error': None,
+            'total': None,
+            'total_cost': None,
+        }
+    tot_cost = stats['cost'].sum()
     resolved = stats['resolved'].sum() / len(stats)
     num_contains_error = stats['contains_error'].sum()
     num_agent_stuck_in_loop = stats['agent_stuck_in_loop'].sum()
@@ -149,4 +162,5 @@ def get_resolved_stats_from_filepath(filepath):
         'n_error': num_contains_error,
         'n_stuck_in_loop': num_agent_stuck_in_loop,
         'total': tot_instances,
+        'total_cost': tot_cost,
     }
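Taken together, the utils/swe_bench.py changes read a per-instance 'accumulated_cost' out of each entry's metrics (falling back to None when the field is absent), store it in the per-instance stats as 'cost', and sum it into 'total_cost' for the dashboard. A minimal sketch of that flow, using hypothetical entries rather than real benchmark output:

import pandas as pd

# Hypothetical entries; real ones come from load_df_from_selected_filepaths.
entries = [
    {'instance_id': 'a', 'metrics': {'accumulated_cost': 0.12}},
    {'instance_id': 'b', 'metrics': {}},   # metrics present but no cost recorded
    {'instance_id': 'c'},                  # no metrics field at all
]

costs = []
for entry in entries:
    metrics = entry.get('metrics', {})            # same pattern as the patched agg_stats
    costs.append(metrics.get('accumulated_cost', None))

stats = pd.DataFrame({'cost': costs})
total_cost = stats['cost'].sum()                  # pandas sum() skips NaN/None entries
print(total_cost)                                 # 0.12 for these toy entries

The early return added to get_resolved_stats_from_filepath complements this: when a results file yields no instances, the function reports None for every field instead of dividing by zero or summing a missing 'cost' column.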