ash-98 committed
Commit 2638ca2 · 1 Parent(s): 7d7287d
Files changed (5)
  1. .gitignore +2 -0
  2. .streamlit/config.toml +5 -0
  3. Benchmark.csv +24 -0
  4. app.py +110 -0
  5. requirements.txt +37 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ venv
+ Dockerfile
.streamlit/config.toml ADDED
@@ -0,0 +1,5 @@
+ [theme]
+ primaryColor="#01d2fc"
+ backgroundColor="#252040"
+ secondaryBackgroundColor="#262626"
+ textColor="#f4f4f4"
Benchmark.csv ADDED
@@ -0,0 +1,24 @@
+ model name, source, v1 metric, v2 metric
+ OpenAI: GPT-4.5 (Preview),Proprietary Model,100%,97%
+ OpenAI: o3 Mini High,Proprietary Model,100%,96%
+ OpenAI: o3 Mini,Proprietary Model,100%,96%
+ OpenAI: GPT-4o,Proprietary Model,99.09%,95%
+ OpenAI: GPT-4o-mini,Proprietary Model,99.09%,97%
+ Anthropic: Claude 3.5 Sonnet,Proprietary Model,99.09%,97%
+ Anthropic: Claude 3.5 Haiku,Proprietary Model,100%,97%
+ Anthropic: Claude 3.7 Sonnet,Proprietary Model,99.09%,98%
+ Google: Gemma 3 27B ,Open Source,98.18%,95%
+ Google: Gemini Flash 2.0,Proprietary Model,100%,99%
+ Google: Gemini 2.0 Flash Lite,Proprietary Model,100%,97%
+ DeepSeek: R1,Open Source,100%,98%
+ DeepSeek: DeepSeek V3,Open Source,100%,97%
+ Mistral: Mistral Small 3.1 24B,Open Source,100%,97%
+ Mistral: Mistral Small 3,Open Source,99.09%,97%
+ Mistral Large 2411,Open Source,99.09%,96%
+ Meta: Llama 3.3 70B Instruct,Open Source,100%,97%
+ Meta: Llama 3.2 3B Instruct,Open Source,78.18%,75%
+ Qwen: QwQ 32B,Open Source,100.00%,96%
+ Microsoft: Phi 4,Proprietary Model,100%,97%
+ Microsoft: Phi-3.5 Mini 128K Instruct,Open Source,99.09%,97%
+ Microsoft: Phi-3 Mini 128K Instruct,Open Source,98.18%,98%
+
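A quick sanity check for this file (a minimal sketch, not part of the commit, assuming pandas is installed; it mirrors the cleaning that app.py applies below):

import pandas as pd

# Load the benchmark file and normalize headers the same way app.py does.
df = pd.read_csv("Benchmark.csv")
df = df.loc[:, ~df.columns.str.contains("Unnamed", na=False)]  # drop columns created by trailing commas
df.columns = df.columns.str.strip()
assert list(df.columns) == ["model name", "source", "v1 metric", "v2 metric"]

# Percentage strings such as "99.09%" should parse to floats in [0, 1].
v1 = pd.to_numeric(df["v1 metric"].str.replace("%", "").str.strip(), errors="coerce") / 100
assert v1.between(0, 1).all()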
app.py ADDED
@@ -0,0 +1,110 @@
+ import streamlit as st
+ import pandas as pd
+
+ st.set_page_config(page_title="Cyber Benchmark Hub: SECQA Leaderboard", layout="wide")
+
+ st.title("Cyber Benchmark Hub: SECQA Leaderboard")
+ st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
+
+ with st.sidebar:
+     st.image("https://cdn.prod.website-files.com/630f558f2a15ca1e88a2f774/631f1436ad7a0605fecc5e15_Logo.svg", use_container_width=True)
+     st.markdown("[Priam.ai](https://www.priam.ai/)")
+     st.divider()
+
+     dataset_categories = ["Multiple Choice"]
+     selected_category = st.selectbox("Select Dataset Category", dataset_categories, index=0)
+
+     datasets_by_category = {
+         "Multiple Choice": ["secQA"],
+     }
+     dataset_choice = st.selectbox("Select Dataset", datasets_by_category[selected_category], index=0)
+
+     st.divider()
+     st.header("Filters & Options")
+     dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+     # The model-type filter options come from the CSV, so a placeholder is
+     # created here and populated after the data is loaded.
+     source_filter_placeholder = st.empty()
+
+     st.markdown("---")
+     st.header("Test Parameters")
+     test_params = pd.DataFrame({
+         "Value": [0, 1, 0, 1, 0]
+     }, index=["Temperature", "n", "Presence Penalty", "Top_p", "Frequency Penalty"])
+     st.table(test_params)
+
+ # Determine the file path based on the dataset choice.
+ # For now, "secQA" is the only dataset and maps to "Benchmark.csv".
+ if dataset_choice == "secQA":
+     file_path = "Benchmark.csv"  # Ensure this file is uploaded in your Hugging Face Space
+ else:
+     file_path = "Benchmark.csv"  # Placeholder: update with actual file paths for future datasets
+
+ # Load and clean the CSV data; cached so the file is only re-read when it changes.
+ @st.cache_data
+ def load_data(file_path):
+     df = pd.read_csv(file_path)
+
+     # Remove any unnamed columns (caused by trailing commas)
+     df = df.loc[:, ~df.columns.str.contains('Unnamed', na=False)]
+
+     # Standardize column names
+     df.columns = df.columns.str.strip()
+     df.rename(columns={
+         "model name": "Model",
+         "source": "Type",
+         "v1 metric": "V1 Accuracy",
+         "v2 metric": "V2 Accuracy"
+     }, inplace=True)
+
+     # Convert percentage strings to floats (e.g., "100%" -> 1.0)
+     for col in ["V1 Accuracy", "V2 Accuracy"]:
+         df[col] = df[col].astype(str).str.replace("%", "").str.strip()
+         df[col] = pd.to_numeric(df[col], errors='coerce') / 100
+
+     return df
+
+ # Load dataset
+ df = load_data(file_path)
+
+ # Update the source filter with the actual options from the data
+ source_filter = source_filter_placeholder.multiselect(
+     "Select Model Type",
+     options=df["Type"].unique().tolist(),
+     default=df["Type"].unique().tolist()
+ )
+
+ # Apply filtering based on the sidebar selections; copy so the Accuracy
+ # column added below is not written onto a view of the original DataFrame.
+ df_filtered = (df[df["Type"].isin(source_filter)] if source_filter else df).copy()
+
+ # Choose the correct metric version and compute Accuracy
+ df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+ df_filtered = df_filtered[["Model", "Type", "Accuracy"]].dropna()  # Drop rows that failed to parse
+
+ # Sort by Accuracy descending
+ df_filtered = df_filtered.sort_values("Accuracy", ascending=False).reset_index(drop=True)
+
+ # Compute dense ranking so that models with equal accuracy share the same rank
+ df_filtered['Rank'] = df_filtered['Accuracy'].rank(method='dense', ascending=False).astype(int)
+ df_filtered = df_filtered[['Rank', 'Model', 'Type', 'Accuracy']]
+
+ # Use columns to display leaderboard and model details side-by-side
+ col1, col2 = st.columns([2, 1])
+
+ with col1:
+     st.subheader(f"Leaderboard for {dataset_choice.upper()} Version {dataset_version}")
+     st.dataframe(df_filtered, hide_index=True)  # hide_index is more reliable here than Styler.hide
+
+ with col2:
+     st.subheader("Model Details")
+     selected_model = st.selectbox("Select a Model", df_filtered["Model"].tolist())
+     model_details = df_filtered[df_filtered["Model"] == selected_model].iloc[0]
+     st.write(f"**Model:** {model_details['Model']}")
+     st.write(f"**Type:** {model_details['Type']}")
+     st.write(f"**Accuracy:** {model_details['Accuracy']:.2%}")
+     st.write(f"**Rank:** {model_details['Rank']}")
+
+ # Footer
+ st.markdown("---")
+ st.info("More dataset benchmarks will be added to this hub in the future.")
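A note on the ranking step above: rank(method='dense') gives tied models the same rank, with no gaps afterward. A minimal illustration, assuming only pandas:

import pandas as pd

acc = pd.Series([1.00, 1.00, 0.98, 0.96])
# Two models tied at 100% both get rank 1; the next model gets rank 2.
print(acc.rank(method="dense", ascending=False).astype(int).tolist())  # [1, 1, 2, 3]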
requirements.txt ADDED
@@ -0,0 +1,37 @@
+ altair==5.5.0
+ attrs==25.3.0
+ blinker==1.9.0
+ cachetools==5.5.2
+ certifi==2025.1.31
+ charset-normalizer==3.4.1
+ click==8.1.8
+ gitdb==4.0.12
+ GitPython==3.1.44
+ idna==3.10
+ Jinja2==3.1.6
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ MarkupSafe==3.0.2
+ narwhals==1.31.0
+ numpy==2.2.4
+ packaging==24.2
+ pandas==2.2.3
+ pillow==11.1.0
+ protobuf==5.29.4
+ pyarrow==19.0.1
+ pydeck==0.9.1
+ python-dateutil==2.9.0.post0
+ pytz==2025.1
+ referencing==0.36.2
+ requests==2.32.3
+ rpds-py==0.23.1
+ six==1.17.0
+ smmap==5.0.2
+ streamlit==1.43.2
+ tenacity==9.0.0
+ toml==0.10.2
+ tornado==6.4.2
+ typing_extensions==4.12.2
+ tzdata==2025.2
+ urllib3==2.3.0
+ watchdog==6.0.0
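To run the app locally (assuming a standard Python environment with the pinned versions above), install the dependencies and start it with the Streamlit CLI:

pip install -r requirements.txt
streamlit run app.py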