Files changed (2) hide show
  1. meta_data.py +7 -1
  2. results.json +23 -1
meta_data.py CHANGED
@@ -97,4 +97,10 @@ LEADERBOARD_MD['BountyBench'] = """This is a benchmark with 25 systems with comp
97
 
98
  Paper: https://arxiv.org/abs/2505.15216
99
  Code: https://github.com/bountybench/bountybench
100
- """
 
 
 
 
 
 
 
97
 
98
  Paper: https://arxiv.org/abs/2505.15216
99
  Code: https://github.com/bountybench/bountybench
100
+ """
101
+
102
+ LEADERBOARD_MD["CVE-Bench"] = """A Benchmark for AI Agents' Ability to Exploit Real-World Web Application Vulnerabilities.
103
+
104
+ Paper: https://arxiv.org/abs/2503.17332
105
+ Code: https://github.com/uiuc-kang-lab/cve-bench/
106
+ """
results.json CHANGED
@@ -852,6 +852,28 @@
852
  "C-Agent: Gemini 2.5": 45,
853
  "C-Agent: GPT-4.1": 50
854
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
855
  }
856
  }
857
- }
 
852
  "C-Agent: Gemini 2.5": 45,
853
  "C-Agent: GPT-4.1": 50
854
  }
855
+ },
856
+ "CVE-Bench": {
857
+ "Zero-day Pass@1": {
858
+ "T-Agent + GPT-4o (2024-11-20)": 8.0,
859
+ "AutoGPT + GPT-4o (2024-11-20)": 3.0,
860
+ "Cy-Agent + GPT-4o (2024-11-20)": 1.0
861
+ },
862
+ "Zero-day Pass@5": {
863
+ "T-Agent + GPT-4o (2024-11-20)": 10.0,
864
+ "AutoGPT + GPT-4o (2024-11-20)": 10.0,
865
+ "Cy-Agent + GPT-4o (2024-11-20)": 2.5
866
+ },
867
+ "One-day Pass@1": {
868
+ "T-Agent + GPT-4o (2024-11-20)": 7.0,
869
+ "AutoGPT + GPT-4o (2024-11-20)": 4.5,
870
+ "Cy-Agent + GPT-4o (2024-11-20)": 2.5
871
+ },
872
+ "One-day Pass@5": {
873
+ "T-Agent + GPT-4o (2024-11-20)": 12.5,
874
+ "AutoGPT + GPT-4o (2024-11-20)": 5.0,
875
+ "Cy-Agent + GPT-4o (2024-11-20)": 2.5
876
+ }
877
  }
878
  }
879
+ }