Jay
committed on
Commit · e4c9620
1 Parent(s): 774db71

feat: update new table

Changed files:
- app.py +49 -2
- assets/text.py +2 -1
- data/ChineseGuardBench.csv +33 -0
app.py CHANGED

@@ -9,13 +9,13 @@ from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWL
 ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", encoding='utf-8') # space separated values
 ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", encoding='utf-8') #
 
-
 ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", encoding='utf-8') #
 ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')
 
+ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding='utf-8') # new table
 
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
-
+# METRICS_GuardBench = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
 
 SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
 
@@ -70,6 +70,19 @@ def format_number(x):
     return float(f"{x:.3}")
 
 
+def get_dataset_new_csv(
+    model_size: List[str],
+):
+    df = ORIGINAL_DF_NEW[ORIGINAL_DF_NEW['Size'].isin(model_size)]
+    df = df.drop(columns="Size")
+
+    leaderboard_table = gr.components.Dataframe(
+        value=df,
+        interactive=False,
+        visible=True,
+    )
+    return leaderboard_table
+
 def get_dataset_csv(
     model_size: List[str],
 ):
@@ -146,6 +159,17 @@ def get_dataset_classfier_gen(
     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
 
+def get_ChineseGuardBench(
+    model_size: List[str],
+    main_choice: List[str],
+):
+    leaderboard_table = get_dataset_new_csv(model_size)
+    # elif main_choice != "Subclass":
+    #     subclass_choice = main_choice
+    #     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
+    return leaderboard_table
+
+
 def get_dataset_classfier_per(
     model_size: List[str],
     main_choice: List[str],
@@ -200,6 +224,11 @@ with gr.Blocks() as demo:
             dataframe_all_per = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
+
+        with gr.TabItem("π NEW", elem_id="od-benchmark-tab-table", id=7):
+            dataframe_all_guardbench = gr.components.Dataframe(
+                elem_id="leaderboard-table",
+            )
 
     # ----------------- modify text -----------------
     with gr.Row():
@@ -261,6 +290,24 @@ with gr.Blocks() as demo:
         outputs=dataframe_all_gen,
     )
 
+    # this is new results for ChineseGuardBench
+    # main_choice.change(
+    #     get_ChineseGuardBench,
+    #     inputs=[model_choice, main_choice],
+    #     outputs=dataframe_all_guardbench,
+    # )
+
+    model_choice.change(
+        get_ChineseGuardBench,
+        inputs=[model_choice, main_choice],
+        outputs=dataframe_all_guardbench,
+    )
+
+    demo.load(
+        fn=get_ChineseGuardBench,
+        inputs=[model_choice, main_choice],
+        outputs=dataframe_all_guardbench,
+    )
 
 demo.launch(share=True)
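Taken together, the app.py additions form one small pipeline: read data/ChineseGuardBench.csv once at import time, filter rows to the selected "Size" buckets, drop that helper column, and re-render the table whenever the size checkboxes change, plus once on page load so the new tab is populated. Below is a minimal, self-contained sketch of the same pattern, assuming only gradio and pandas; the sample frame, size buckets, and widget names are illustrative stand-ins, not the Space's real code (the committed get_ChineseGuardBench also accepts main_choice but ignores it, so the sketch drops that parameter).

from typing import List

import gradio as gr
import pandas as pd

# Stand-in for ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", ...);
# the rows below are placeholders, not benchmark results.
ORIGINAL_DF_NEW = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "Size": ["1B~5B", ">65B"],
    "F1": [89.63, 87.92],
})
SIZES = sorted(ORIGINAL_DF_NEW["Size"].unique())

def get_dataset_new_csv(model_size: List[str]):
    # Keep only the ticked size buckets, then hide the filter column.
    df = ORIGINAL_DF_NEW[ORIGINAL_DF_NEW["Size"].isin(model_size)]
    df = df.drop(columns="Size")
    return gr.components.Dataframe(value=df, interactive=False, visible=True)

with gr.Blocks() as demo:
    model_choice = gr.CheckboxGroup(choices=SIZES, value=SIZES, label="Model size")
    table = gr.components.Dataframe(elem_id="leaderboard-table")
    # Same wiring as the commit: refresh on filter change and once at startup.
    model_choice.change(get_dataset_new_csv, inputs=[model_choice], outputs=table)
    demo.load(fn=get_dataset_new_csv, inputs=[model_choice], outputs=table)

demo.launch()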
assets/text.py CHANGED

@@ -35,7 +35,8 @@ EVALUTION_TEXT= """
 We evaluate the models using two methods: perplexity(multiple choice) and generation.
 For perplexity, we select the label which is the lowest perplexity as the predicted results.
 For generation, we use the content generated by the model to make prediction.
-The following are the results of the evaluation.
+The following are the results of the evaluation.
+In the "New" table, we present evaluation results for generation on a meticulously curated new benchmark, with details of its processing to be introduced later.πππ
 </span> <br><br>
 
 
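For readers unfamiliar with the perplexity protocol this text describes: each candidate label is appended to the prompt, the model scores every resulting sequence, and the label whose sequence has the lowest perplexity is taken as the prediction. A rough sketch of that idea, assuming a Hugging Face causal LM; the model name and prompt are placeholders, not the Space's actual evaluation harness.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder model; any causal LM works for this illustration.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
model.eval()

def pick_label(prompt: str, labels: list[str]) -> str:
    """Score prompt+label for each candidate; return the lowest-perplexity label."""
    ppls = []
    for label in labels:
        ids = tok(prompt + label, return_tensors="pt").input_ids
        with torch.no_grad():
            # With labels=input_ids the model returns the mean token
            # cross-entropy; exp(loss) is the sequence perplexity.
            loss = model(ids, labels=ids).loss
        ppls.append(torch.exp(loss).item())
    return labels[ppls.index(min(ppls))]

print(pick_label("Query: ...\nIs this content safe or unsafe? Answer: ", ["safe", "unsafe"]))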
data/ChineseGuardBench.csv ADDED

@@ -0,0 +1,33 @@
+Model,Size,F1,Accuracy,Precision,Recall,FPR,FNR
+Deepexi-Guard-3B,1B~5B,89.63 ,89.72 ,85.53 ,94.15 ,14.24 ,5.85
+Qwen3-32B,~30B,88.54 ,89.25 ,89.08 ,88.02 ,9.64 ,11.98
+Qwen3-235B-A22B,>65B,87.92 ,88.96 ,90.86 ,85.17 ,7.66 ,14.83
+Qwen3-235B-A22B-Instruct-2507,>65B,87.81 ,89.13 ,93.27 ,82.96 ,5.35 ,17.04
+GLM-Z1-9B-0414,5B~10B,87.36 ,88.03 ,87.11 ,87.61 ,11.59 ,12.39
+Qwen2.5-72B-Instruct,>65B,86.81 ,88.27 ,92.50 ,81.79 ,5.93 ,18.21
+QwQ-32B,~30B,86.80 ,88.35 ,93.33 ,81.12 ,5.18 ,18.88
+Phi-4,10B~20B,85.95 ,86.88 ,86.90 ,85.02 ,11.45 ,14.98
+Gemma-3-27B-it,~30B,85.29 ,86.78 ,89.83 ,81.19 ,8.22 ,18.81
+DeepSeek-R1-0528,>65B,85.24 ,87.47 ,96.02 ,76.63 ,2.84 ,23.37
+Mistral-Small-3.2-24B-Instruct,~30B,85.07 ,87.03 ,93.14 ,78.29 ,5.15 ,21.71
+GLM-4-9B-chat,5B~10B,84.85 ,86.27 ,88.47 ,81.52 ,9.49 ,18.48
+MD-Judge-v0_2-internlm2_7B,5B~10B,84.63 ,85.88 ,87.03 ,82.37 ,10.98 ,17.63
+DeepSeek-R1-Distill-Qwen-32B,~30B,84.55 ,86.64 ,93.05 ,77.47 ,5.17 ,22.53
+Hunyuan-A13B-Instruct,>65B,84.32 ,86.21 ,90.97 ,78.58 ,6.98 ,21.42
+Moonlight-16B-A3B-Instruct,10B~20B,84.21 ,84.35 ,80.41 ,88.38 ,19.25 ,11.62
+GLM-Z1-32B-0414,~30B,83.40 ,85.75 ,92.63 ,75.85 ,5.40 ,24.15
+Qwen3-8B,5B~10B,83.05 ,85.51 ,92.69 ,75.23 ,5.30 ,24.77
+Qwen2.5-7B-Instruct,5B~10B,82.96 ,84.99 ,89.41 ,77.37 ,8.20 ,22.63
+Qwen2.5-1.5B-Instruct,1B~5B,79.48 ,77.08 ,68.83 ,94.03 ,38.07 ,5.97
+shieldgemma-2B,1B~5B,79.19 ,79.63 ,76.50 ,82.06 ,22.54 ,17.94
+Qwen2.5-3B-Instruct,1B~5B,79.05 ,77.57 ,70.69 ,89.66 ,33.25 ,10.34
+SHTEC_safety_fence_model_7B,5B~10B,78.44 ,82.48 ,93.54 ,67.54 ,4.17 ,32.46
+Qwen3-4B,1B~5B,78.16 ,82.50 ,95.12 ,66.33 ,3.04 ,33.67
+SmolLM3-3B,1B~5B,76.10 ,79.19 ,83.09 ,70.19 ,12.77 ,29.81
+ERNIE-4.5-21B-A3B-Paddle,~20B,75.21 ,80.58 ,94.58 ,62.42 ,3.20 ,37.58
+Qwen3-1.7B,1B~5B,74.46 ,79.34 ,89.36 ,63.82 ,6.79 ,36.18
+internlm2_5-7B-chat,5B~10B,71.52 ,78.49 ,95.34 ,57.22 ,2.50 ,42.78
+Llama-Guard-4-12B,10B~20B,65.66 ,74.64 ,90.99 ,51.36 ,4.54 ,48.64
+Llama-Guard-3-8B,5B~10B,59.33 ,72.44 ,97.80 ,42.58 ,0.86 ,57.42
+DeepSeek-R1-Distill-Qwen-7B,5B~10B,45.27 ,65.53 ,90.36 ,30.20 ,2.88 ,69.80
+Gemma-3n-E4B-it,5B~10B,44.05 ,64.88 ,88.80 ,29.29 ,3.30 ,70.71
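Because the new table stores Precision, Recall, F1, and FNR side by side (all in percent), its columns can be cross-checked: F1 = 2PR/(P+R) and FNR = 100 - Recall. For Deepexi-Guard-3B, 2 x 85.53 x 94.15 / (85.53 + 94.15) ≈ 89.63, which matches the stored F1. A small sanity-check script along those lines, reading the file the same way app.py does:

import pandas as pd

df = pd.read_csv("./data/ChineseGuardBench.csv", encoding="utf-8")
# The numeric cells carry trailing spaces ("89.63 ,..."), so coerce explicitly.
for col in ["F1", "Accuracy", "Precision", "Recall", "FPR", "FNR"]:
    df[col] = pd.to_numeric(df[col])

f1 = 2 * df["Precision"] * df["Recall"] / (df["Precision"] + df["Recall"])
# Both deviations should be tiny, on the order of the rounding in the stored columns.
print("max |F1 - 2PR/(P+R)|  :", (f1 - df["F1"]).abs().max())
print("max |FNR - (100 - R)| :", ((100 - df["Recall"]) - df["FNR"]).abs().max())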