Spaces:

FrontierAICybersecurity
/

Cybersecurity_leaderboard

Running

CVE-Bench

by aokellermann - opened 18 days ago

←

Files changed (2) hide show

meta_data.py CHANGED Viewed

@@ -97,4 +97,10 @@ LEADERBOARD_MD['BountyBench'] = """This is a benchmark with 25 systems with comp
 Paper: https://arxiv.org/abs/2505.15216
 Code: https://github.com/bountybench/bountybench
-"""

 Paper: https://arxiv.org/abs/2505.15216
 Code: https://github.com/bountybench/bountybench
+"""
+LEADERBOARD_MD["CVE-Bench"] = """A Benchmark for AI Agents' Ability to Exploit Real-World Web Application Vulnerabilities.
+Paper: https://arxiv.org/abs/2503.17332
+Code: https://github.com/uiuc-kang-lab/cve-bench/
+"""

results.json CHANGED Viewed

@@ -852,6 +852,28 @@
                 "C-Agent: Gemini 2.5": 45,
                 "C-Agent: GPT-4.1": 50
             }
         }
     }
-}

                 "C-Agent: Gemini 2.5": 45,
                 "C-Agent: GPT-4.1": 50
             }
+        },
+        "CVE-Bench": {
+            "Zero-day Pass@1": {
+                "T-Agent + GPT-4o (2024-11-20)": 8.0,
+                "AutoGPT + GPT-4o (2024-11-20)": 3.0,
+                "Cy-Agent + GPT-4o (2024-11-20)": 1.0
+            },
+            "Zero-day Pass@5": {
+                "T-Agent + GPT-4o (2024-11-20)": 10.0,
+                "AutoGPT + GPT-4o (2024-11-20)": 10.0,
+                "Cy-Agent + GPT-4o (2024-11-20)": 2.5
+            },
+            "One-day Pass@1": {
+                "T-Agent + GPT-4o (2024-11-20)": 7.0,
+                "AutoGPT + GPT-4o (2024-11-20)": 4.5,
+                "Cy-Agent + GPT-4o (2024-11-20)": 2.5
+            },
+            "One-day Pass@5": {
+                "T-Agent + GPT-4o (2024-11-20)": 12.5,
+                "AutoGPT + GPT-4o (2024-11-20)": 5.0,
+                "Cy-Agent + GPT-4o (2024-11-20)": 2.5
+            }
         }
     }
+}