CVE-Bench
#2
by
aokellermann
- opened
- meta_data.py +7 -1
- results.json +23 -1
meta_data.py
CHANGED
@@ -97,4 +97,10 @@ LEADERBOARD_MD['BountyBench'] = """This is a benchmark with 25 systems with comp
|
|
97 |
|
98 |
Paper: https://arxiv.org/abs/2505.15216
|
99 |
Code: https://github.com/bountybench/bountybench
|
100 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
Paper: https://arxiv.org/abs/2505.15216
|
99 |
Code: https://github.com/bountybench/bountybench
|
100 |
+
"""
|
101 |
+
|
102 |
+
LEADERBOARD_MD["CVE-Bench"] = """A Benchmark for AI Agents' Ability to Exploit Real-World Web Application Vulnerabilities.
|
103 |
+
|
104 |
+
Paper: https://arxiv.org/abs/2503.17332
|
105 |
+
Code: https://github.com/uiuc-kang-lab/cve-bench/
|
106 |
+
"""
|
results.json
CHANGED
@@ -852,6 +852,28 @@
|
|
852 |
"C-Agent: Gemini 2.5": 45,
|
853 |
"C-Agent: GPT-4.1": 50
|
854 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
855 |
}
|
856 |
}
|
857 |
-
}
|
|
|
852 |
"C-Agent: Gemini 2.5": 45,
|
853 |
"C-Agent: GPT-4.1": 50
|
854 |
}
|
855 |
+
},
|
856 |
+
"CVE-Bench": {
|
857 |
+
"Zero-day Pass@1": {
|
858 |
+
"T-Agent + GPT-4o (2024-11-20)": 8.0,
|
859 |
+
"AutoGPT + GPT-4o (2024-11-20)": 3.0,
|
860 |
+
"Cy-Agent + GPT-4o (2024-11-20)": 1.0
|
861 |
+
},
|
862 |
+
"Zero-day Pass@5": {
|
863 |
+
"T-Agent + GPT-4o (2024-11-20)": 10.0,
|
864 |
+
"AutoGPT + GPT-4o (2024-11-20)": 10.0,
|
865 |
+
"Cy-Agent + GPT-4o (2024-11-20)": 2.5
|
866 |
+
},
|
867 |
+
"One-day Pass@1": {
|
868 |
+
"T-Agent + GPT-4o (2024-11-20)": 7.0,
|
869 |
+
"AutoGPT + GPT-4o (2024-11-20)": 4.5,
|
870 |
+
"Cy-Agent + GPT-4o (2024-11-20)": 2.5
|
871 |
+
},
|
872 |
+
"One-day Pass@5": {
|
873 |
+
"T-Agent + GPT-4o (2024-11-20)": 12.5,
|
874 |
+
"AutoGPT + GPT-4o (2024-11-20)": 5.0,
|
875 |
+
"Cy-Agent + GPT-4o (2024-11-20)": 2.5
|
876 |
+
}
|
877 |
}
|
878 |
}
|
879 |
+
}
|