ash-98 committed
Commit 2638ca2 · 1 Parent(s): 7d7287d
Files changed (5)
  1. .gitignore +2 -0
  2. .streamlit/config.toml +5 -0
  3. Benchmark.csv +24 -0
  4. app.py +110 -0
  5. requirements.txt +37 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ venv
+ Dockerfile
.streamlit/config.toml ADDED
@@ -0,0 +1,5 @@
+ [theme]
+ primaryColor="#01d2fc"
+ backgroundColor="#252040"
+ secondaryBackgroundColor="#262626"
+ textColor="#f4f4f4"
Benchmark.csv ADDED
@@ -0,0 +1,24 @@
+ model name, source, v1 metric, v2 metric
+ OpenAI: GPT-4.5 (Preview),Proprietary Model,100%,97%
+ OpenAI: o3 Mini High,Proprietary Model,100%,96%
+ OpenAI: o3 Mini,Proprietary Model,100%,96%
+ OpenAI: GPT-4o,Proprietary Model,99.09%,95%
+ OpenAI: GPT-4o-mini,Proprietary Model,99.09%,97%
+ Anthropic: Claude 3.5 Sonnet,Proprietary Model,99.09%,97%
+ Anthropic: Claude 3.5 Haiku,Proprietary Model,100%,97%
+ Anthropic: Claude 3.7 Sonnet,Proprietary Model,99.09%,98%
+ Google: Gemma 3 27B ,Open Source,98.18%,95%
+ Google: Gemini Flash 2.0,Proprietary Model,100%,99%
+ Google: Gemini 2.0 Flash Lite,Proprietary Model,100%,97%
+ DeepSeek: R1,Open Source,100%,98%
+ DeepSeek: DeepSeek V3,Open Source,100%,97%
+ Mistral: Mistral Small 3.1 24B,Open Source,100%,97%
+ Mistral: Mistral Small 3,Open Source,99.09%,97%
+ Mistral Large 2411,Open Source,99.09%,96%
+ Meta: Llama 3.3 70B Instruct,Open Source,100%,97%
+ Meta: Llama 3.2 3B Instruct,Open Source,78.18%,75%
+ Qwen: QwQ 32B,Open Source,100.00%,96%
+ Microsoft: Phi 4,Proprietary Model,100%,97%
+ Microsoft: Phi-3.5 Mini 128K Instruct,Open Source,99.09%,97%
+ Microsoft: Phi-3 Mini 128K Instruct,Open Source,98.18%,98%
+
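A quick sanity check for this file (a minimal sketch, not part of the commit, assuming pandas is installed; it mirrors the cleaning that app.py applies below):

import pandas as pd

# Load the benchmark file and normalize headers the same way app.py does.
df = pd.read_csv("Benchmark.csv")
df = df.loc[:, ~df.columns.str.contains("Unnamed", na=False)]  # drop columns created by trailing commas
df.columns = df.columns.str.strip()
assert list(df.columns) == ["model name", "source", "v1 metric", "v2 metric"]

# Percentage strings such as "99.09%" should parse to floats in [0, 1].
v1 = pd.to_numeric(df["v1 metric"].str.replace("%", "").str.strip(), errors="coerce") / 100
assert v1.between(0, 1).all()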
app.py ADDED
@@ -0,0 +1,110 @@
+ import streamlit as st
+ import pandas as pd
+
+ st.set_page_config(page_title="Cyber Benchmark Hub: SECQA Leaderboard", layout="wide")
+
+ st.title("Cyber Benchmark Hub: SECQA Leaderboard")
+ st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
+
+ with st.sidebar:
+     st.image("https://cdn.prod.website-files.com/630f558f2a15ca1e88a2f774/631f1436ad7a0605fecc5e15_Logo.svg", use_container_width=True)
+     st.markdown("[Priam.ai](https://www.priam.ai/)")
+     st.divider()
+
+     dataset_categories = ["Multiple Choice"]
+     selected_category = st.selectbox("Select Dataset Category", dataset_categories, index=0)
+
+     datasets_by_category = {
+         "Multiple Choice": ["secQA"],
+     }
+     dataset_choice = st.selectbox("Select Dataset", datasets_by_category[selected_category], index=0)
+
+     st.divider()
+     st.header("Filters & Options")
+     dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+     # The model-type filter options come from the CSV, so a placeholder is
+     # created here and populated after the data is loaded.
+     source_filter_placeholder = st.empty()
+
+     st.markdown("---")
+     st.header("Test Parameters")
+     test_params = pd.DataFrame({
+         "Value": [0, 1, 0, 1, 0]
+     }, index=["Temperature", "n", "Presence Penalty", "Top_p", "Frequency Penalty"])
+     st.table(test_params)
+
+ # Determine the file path based on the dataset choice.
+ # For now, "secQA" is the only dataset and maps to "Benchmark.csv".
+ if dataset_choice == "secQA":
+     file_path = "Benchmark.csv"  # Ensure this file is uploaded in your Hugging Face Space
+ else:
+     file_path = "Benchmark.csv"  # Placeholder: update with actual file paths for future datasets
+
+ # Load and clean the CSV data; cached so the file is only re-read when it changes.
+ @st.cache_data
+ def load_data(file_path):
+     df = pd.read_csv(file_path)
+
+     # Remove any unnamed columns (caused by trailing commas)
+     df = df.loc[:, ~df.columns.str.contains('Unnamed', na=False)]
+
+     # Standardize column names
+     df.columns = df.columns.str.strip()
+     df.rename(columns={
+         "model name": "Model",
+         "source": "Type",
+         "v1 metric": "V1 Accuracy",
+         "v2 metric": "V2 Accuracy"
+     }, inplace=True)
+
+     # Convert percentage strings to floats (e.g., "100%" -> 1.0)
+     for col in ["V1 Accuracy", "V2 Accuracy"]:
+         df[col] = df[col].astype(str).str.replace("%", "").str.strip()
+         df[col] = pd.to_numeric(df[col], errors='coerce') / 100
+
+     return df
+
+ # Load dataset
+ df = load_data(file_path)
+
+ # Update the source filter with the actual options from the data
+ source_filter = source_filter_placeholder.multiselect(
+     "Select Model Type",
+     options=df["Type"].unique().tolist(),
+     default=df["Type"].unique().tolist()
+ )
+
+ # Apply filtering based on the sidebar selections; copy so the Accuracy
+ # column added below is not written onto a view of the original DataFrame.
+ df_filtered = (df[df["Type"].isin(source_filter)] if source_filter else df).copy()
+
+ # Choose the correct metric version and compute Accuracy
+ df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+ df_filtered = df_filtered[["Model", "Type", "Accuracy"]].dropna()  # Drop rows that failed to parse
+
+ # Sort by Accuracy descending
+ df_filtered = df_filtered.sort_values("Accuracy", ascending=False).reset_index(drop=True)
+
+ # Compute dense ranking so that models with equal accuracy share the same rank
+ df_filtered['Rank'] = df_filtered['Accuracy'].rank(method='dense', ascending=False).astype(int)
+ df_filtered = df_filtered[['Rank', 'Model', 'Type', 'Accuracy']]
+
+ # Use columns to display leaderboard and model details side-by-side
+ col1, col2 = st.columns([2, 1])
+
+ with col1:
+     st.subheader(f"Leaderboard for {dataset_choice.upper()} Version {dataset_version}")
+     st.dataframe(df_filtered, hide_index=True)  # hide_index is more reliable here than Styler.hide
+
+ with col2:
+     st.subheader("Model Details")
+     selected_model = st.selectbox("Select a Model", df_filtered["Model"].tolist())
+     model_details = df_filtered[df_filtered["Model"] == selected_model].iloc[0]
+     st.write(f"**Model:** {model_details['Model']}")
+     st.write(f"**Type:** {model_details['Type']}")
+     st.write(f"**Accuracy:** {model_details['Accuracy']:.2%}")
+     st.write(f"**Rank:** {model_details['Rank']}")
+
+ # Footer
+ st.markdown("---")
+ st.info("More dataset benchmarks will be added to this hub in the future.")
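A note on the ranking step above: rank(method='dense') gives tied models the same rank, with no gaps afterward. A minimal illustration, assuming only pandas:

import pandas as pd

acc = pd.Series([1.00, 1.00, 0.98, 0.96])
# Two models tied at 100% both get rank 1; the next model gets rank 2.
print(acc.rank(method="dense", ascending=False).astype(int).tolist())  # [1, 1, 2, 3]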
requirements.txt ADDED
@@ -0,0 +1,37 @@
+ altair==5.5.0
+ attrs==25.3.0
+ blinker==1.9.0
+ cachetools==5.5.2
+ certifi==2025.1.31
+ charset-normalizer==3.4.1
+ click==8.1.8
+ gitdb==4.0.12
+ GitPython==3.1.44
+ idna==3.10
+ Jinja2==3.1.6
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ MarkupSafe==3.0.2
+ narwhals==1.31.0
+ numpy==2.2.4
+ packaging==24.2
+ pandas==2.2.3
+ pillow==11.1.0
+ protobuf==5.29.4
+ pyarrow==19.0.1
+ pydeck==0.9.1
+ python-dateutil==2.9.0.post0
+ pytz==2025.1
+ referencing==0.36.2
+ requests==2.32.3
+ rpds-py==0.23.1
+ six==1.17.0
+ smmap==5.0.2
+ streamlit==1.43.2
+ tenacity==9.0.0
+ toml==0.10.2
+ tornado==6.4.2
+ typing_extensions==4.12.2
+ tzdata==2025.2
+ urllib3==2.3.0
+ watchdog==6.0.0
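To run the app locally (assuming a standard Python environment with the pinned versions above), install the dependencies and start it with the Streamlit CLI:

pip install -r requirements.txt
streamlit run app.py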