Jay committed on
Commit
e4c9620
·
1 Parent(s): 774db71

feat: update new table

Files changed (3)
  1. app.py +49 -2
  2. assets/text.py +2 -1
  3. data/ChineseGuardBench.csv +33 -0
app.py CHANGED
@@ -9,13 +9,13 @@ from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWL
 ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", encoding='utf-8') # space separated values
 ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", encoding='utf-8') #
 
-
 ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", encoding='utf-8') #
 ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')
 
+ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding='utf-8') # new table
 
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
-
+# METRICS_GuardBench = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
 
 SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
 
@@ -70,6 +70,19 @@ def format_number(x):
     return float(f"{x:.3}")
 
 
+def get_dataset_new_csv(
+    model_size: List[str],
+):
+    df = ORIGINAL_DF_NEW[ORIGINAL_DF_NEW['Size'].isin(model_size)]
+    df = df.drop(columns="Size")
+
+    leaderboard_table = gr.components.Dataframe(
+        value=df,
+        interactive=False,
+        visible=True,
+    )
+    return leaderboard_table
+
 def get_dataset_csv(
     model_size: List[str],
 ):
@@ -146,6 +159,17 @@ def get_dataset_classfier_gen(
     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
 
+def get_ChineseGuardBench(
+    model_size: List[str],
+    main_choice: List[str],
+):
+    leaderboard_table = get_dataset_new_csv(model_size)
+    # elif main_choice != "Subclass":
+    #     subclass_choice = main_choice
+    #     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
+    return leaderboard_table
+
+
 def get_dataset_classfier_per(
     model_size: List[str],
     main_choice: List[str],
@@ -200,6 +224,11 @@ with gr.Blocks() as demo:
             dataframe_all_per = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
+
+        with gr.TabItem("🏅 NEW", elem_id="od-benchmark-tab-table", id=7):
+            dataframe_all_guardbench = gr.components.Dataframe(
+                elem_id="leaderboard-table",
+            )
 
     # ----------------- modify text -----------------
     with gr.Row():
@@ -261,6 +290,24 @@ with gr.Blocks() as demo:
         outputs=dataframe_all_gen,
     )
 
+    # this is new results for ChineseGuardBench
+    # main_choice.change(
+    #     get_ChineseGuardBench,
+    #     inputs=[model_choice, main_choice],
+    #     outputs=dataframe_all_guardbench,
+    # )
+
+    model_choice.change(
+        get_ChineseGuardBench,
+        inputs=[model_choice, main_choice],
+        outputs=dataframe_all_guardbench,
+    )
+
+    demo.load(
+        fn=get_ChineseGuardBench,
+        inputs=[model_choice, main_choice],
+        outputs=dataframe_all_guardbench,
+    )
 
 demo.launch(share=True)
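For readers following the diff: below is a minimal, self-contained sketch of the Size filter that get_dataset_new_csv applies, run against a two-row stand-in DataFrame (the real rows live in data/ChineseGuardBench.csv). The "Size" column and the "1B~5B" bucket come straight from the diff; everything else is illustrative.

import pandas as pd

# Stand-in for ORIGINAL_DF_NEW; real data comes from data/ChineseGuardBench.csv.
df = pd.DataFrame({
    "Model": ["Deepexi-Guard-3B", "Qwen3-32B"],
    "Size": ["1B~5B", "~30B"],
    "F1": [89.63, 88.54],
})

model_size = ["1B~5B"]                       # the checkbox values Gradio passes in
filtered = df[df["Size"].isin(model_size)]   # keep only the selected size buckets
filtered = filtered.drop(columns="Size")     # the tab hides the Size column
print(filtered)                              # only the Deepexi-Guard-3B row remains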
 
assets/text.py CHANGED
@@ -35,7 +35,8 @@ EVALUTION_TEXT= """
 We evaluate the models using two methods: perplexity(multiple choice) and generation.
 For perplexity, we select the label which is the lowest perplexity as the predicted results.
 For generation, we use the content generated by the model to make prediction.
-The following are the results of the evaluation. 👇👇👇
+The following are the results of the evaluation.
+In the "New" table, we present evaluation results for generation on a meticulously curated new benchmark, with details of its processing to be introduced later.👇👇👇
 </span> <br><br>
 
 
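EVALUTION_TEXT summarizes the perplexity protocol in one line; here is a hedged sketch of that "lowest perplexity wins" rule. score() is a hypothetical stand-in for a causal LM's mean per-token loss on prompt plus candidate label; it is not part of this repo.

import math
from typing import Callable, List

def pick_label(prompt: str, labels: List[str], score: Callable[[str, str], float]) -> str:
    # perplexity = exp(mean negative log-likelihood); the lowest value wins
    ppl = {label: math.exp(score(prompt, label)) for label in labels}
    return min(ppl, key=ppl.get)

# e.g. pick_label("Query: ... Is this text safe or unsafe?", ["safe", "unsafe"], lm_loss)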
 
data/ChineseGuardBench.csv ADDED
@@ -0,0 +1,33 @@
+Model,Size,F1,Accuracy,Precision,Recall,FPR,FNR
+Deepexi-Guard-3B,1B~5B,89.63,89.72,85.53,94.15,14.24,5.85
+Qwen3-32B,~30B,88.54,89.25,89.08,88.02,9.64,11.98
+Qwen3-235B-A22B,>65B,87.92,88.96,90.86,85.17,7.66,14.83
+Qwen3-235B-A22B-Instruct-2507,>65B,87.81,89.13,93.27,82.96,5.35,17.04
+GLM-Z1-9B-0414,5B~10B,87.36,88.03,87.11,87.61,11.59,12.39
+Qwen2.5-72B-Instruct,>65B,86.81,88.27,92.50,81.79,5.93,18.21
+QwQ-32B,~30B,86.80,88.35,93.33,81.12,5.18,18.88
+Phi-4,10B~20B,85.95,86.88,86.90,85.02,11.45,14.98
+Gemma-3-27B-it,~30B,85.29,86.78,89.83,81.19,8.22,18.81
+DeepSeek-R1-0528,>65B,85.24,87.47,96.02,76.63,2.84,23.37
+Mistral-Small-3.2-24B-Instruct,~30B,85.07,87.03,93.14,78.29,5.15,21.71
+GLM-4-9B-chat,5B~10B,84.85,86.27,88.47,81.52,9.49,18.48
+MD-Judge-v0_2-internlm2_7B,5B~10B,84.63,85.88,87.03,82.37,10.98,17.63
+DeepSeek-R1-Distill-Qwen-32B,~30B,84.55,86.64,93.05,77.47,5.17,22.53
+Hunyuan-A13B-Instruct,>65B,84.32,86.21,90.97,78.58,6.98,21.42
+Moonlight-16B-A3B-Instruct,10B~20B,84.21,84.35,80.41,88.38,19.25,11.62
+GLM-Z1-32B-0414,~30B,83.40,85.75,92.63,75.85,5.40,24.15
+Qwen3-8B,5B~10B,83.05,85.51,92.69,75.23,5.30,24.77
+Qwen2.5-7B-Instruct,5B~10B,82.96,84.99,89.41,77.37,8.20,22.63
+Qwen2.5-1.5B-Instruct,1B~5B,79.48,77.08,68.83,94.03,38.07,5.97
+shieldgemma-2B,1B~5B,79.19,79.63,76.50,82.06,22.54,17.94
+Qwen2.5-3B-Instruct,1B~5B,79.05,77.57,70.69,89.66,33.25,10.34
+SHTEC_safety_fence_model_7B,5B~10B,78.44,82.48,93.54,67.54,4.17,32.46
+Qwen3-4B,1B~5B,78.16,82.50,95.12,66.33,3.04,33.67
+SmolLM3-3B,1B~5B,76.10,79.19,83.09,70.19,12.77,29.81
+ERNIE-4.5-21B-A3B-Paddle,~20B,75.21,80.58,94.58,62.42,3.20,37.58
+Qwen3-1.7B,1B~5B,74.46,79.34,89.36,63.82,6.79,36.18
+internlm2_5-7B-chat,5B~10B,71.52,78.49,95.34,57.22,2.50,42.78
+Llama-Guard-4-12B,10B~20B,65.66,74.64,90.99,51.36,4.54,48.64
+Llama-Guard-3-8B,5B~10B,59.33,72.44,97.80,42.58,0.86,57.42
+DeepSeek-R1-Distill-Qwen-7B,5B~10B,45.27,65.53,90.36,30.20,2.88,69.80
+Gemma-3n-E4B-it,5B~10B,44.05,64.88,88.80,29.29,3.30,70.71
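A quick consistency check on the new CSV's columns, using the Deepexi-Guard-3B row and assuming "unsafe" is the positive class (which the arithmetic below supports): F1 is the harmonic mean of Precision and Recall, and FNR is 100 minus Recall.

p, r = 85.53, 94.15           # Precision and Recall from the first data row
f1 = 2 * p * r / (p + r)      # harmonic mean of precision and recall
print(round(f1, 2))           # 89.63, matching the F1 column
print(round(100 - r, 2))      # 5.85, matching the FNR column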