Merge commit 'f6d9f43457bdadd36685181efda2fd45e813a02c'
- 0_📊_OpenDevin_Benchmark.py +2 -1
- utils/swe_bench.py +14 -0
0_📊_OpenDevin_Benchmark.py CHANGED

@@ -46,7 +46,8 @@ swe_bench_results = swe_bench_results.drop(
 swe_bench_results = swe_bench_results[[
     'agent_name', 'note',
     'model_name',
-    'success_rate', 'n_solved', 'n_error', 'n_stuck_in_loop',
+    'success_rate', 'n_solved', 'n_error', 'n_stuck_in_loop',
+    'total', 'total_cost',
     'max_iterations', 'git_commit', 'start_time'
 ]]
 swe_bench_results = swe_bench_results.sort_values(by='success_rate', ascending=False)
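For context, the column selection above relies on pandas label-based selection: df[[...]] both filters and reorders columns, and it raises a KeyError if any listed column is missing, which is why the 'total' and 'total_cost' fields added in utils/swe_bench.py below must be present in the results frame. A minimal sketch with hypothetical data:

import pandas as pd

# Hypothetical one-row frame; the values are placeholders, not benchmark results.
df = pd.DataFrame({'success_rate': [0.1], 'total': [10], 'total_cost': [0.5]})

view = df[['success_rate', 'total', 'total_cost']]  # selects and reorders columns
# df[['success_rate', 'missing_col']]               # would raise KeyError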
utils/swe_bench.py CHANGED

@@ -110,6 +110,9 @@ def agg_stats(df):
             obs_lengths.append(len(obs['content']))
         obs_lengths = pd.Series(obs_lengths)
 
+        metrics = entry.get('metrics', {})
+        cost = metrics.get('accumulated_cost', None)
+
         d = {
             'idx': idx,
             'instance_id': entry['instance_id'],
@@ -119,6 +122,7 @@ def agg_stats(df):
             **test_result,
             'agent_stuck_in_loop': agent_stuck_in_loop,
             'contains_error': contains_error,
+            'cost': cost,
             'empty_generation': empty_generation,
             'apply_test_patch_success': apply_test_patch_success,
             'test_cmd_exit_error': test_cmd_exit_error,
@@ -139,6 +143,15 @@ def get_resolved_stats_from_filepath(filepath):
 def get_resolved_stats_from_filepath(filepath):
     df = load_df_from_selected_filepaths(filepath)
     stats = agg_stats(df)
+    if not len(stats):
+        return {
+            'success_rate': None,
+            'n_solved': None,
+            'n_error': None,
+            'total': None,
+            'total_cost': None,
+        }
+    tot_cost = stats['cost'].sum()
     resolved = stats['resolved'].sum() / len(stats)
     num_contains_error = stats['contains_error'].sum()
     num_agent_stuck_in_loop = stats['agent_stuck_in_loop'].sum()
@@ -149,4 +162,5 @@ def get_resolved_stats_from_filepath(filepath):
         'n_error': num_contains_error,
         'n_stuck_in_loop': num_agent_stuck_in_loop,
         'total': tot_instances,
+        'total_cost': tot_cost,
     }
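Taken together, the utils/swe_bench.py changes read a per-instance 'accumulated_cost' out of each entry's metrics (falling back to None when the field is absent), store it in the per-instance stats as 'cost', and sum it into 'total_cost' for the dashboard. A minimal sketch of that flow, using hypothetical entries rather than real benchmark output:

import pandas as pd

# Hypothetical entries; real ones come from load_df_from_selected_filepaths.
entries = [
    {'instance_id': 'a', 'metrics': {'accumulated_cost': 0.12}},
    {'instance_id': 'b', 'metrics': {}},   # metrics present but no cost recorded
    {'instance_id': 'c'},                  # no metrics field at all
]

costs = []
for entry in entries:
    metrics = entry.get('metrics', {})            # same pattern as the patched agg_stats
    costs.append(metrics.get('accumulated_cost', None))

stats = pd.DataFrame({'cost': costs})
total_cost = stats['cost'].sum()                  # pandas sum() skips NaN/None entries
print(total_cost)                                 # 0.12 for these toy entries

The early return added to get_resolved_stats_from_filepath complements this: when a results file yields no instances, the function reports None for every field instead of dividing by zero or summing a missing 'cost' column.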