Jay
committed on
Commit · e4c9620
1 Parent(s): 774db71

feat: update new table

Changed files:
- app.py +49 -2
- assets/text.py +2 -1
- data/ChineseGuardBench.csv +33 -0
app.py CHANGED

@@ -9,13 +9,13 @@ from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWL
 ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", encoding='utf-8') # space separated values
 ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", encoding='utf-8') #
 
-
 ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", encoding='utf-8') #
 ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')
 
+ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding='utf-8') # new table
 
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
-
+# METRICS_GuardBench = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
 
 SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
 
@@ -70,6 +70,19 @@ def format_number(x):
     return float(f"{x:.3}")
 
 
+def get_dataset_new_csv(
+    model_size: List[str],
+):
+    df = ORIGINAL_DF_NEW[ORIGINAL_DF_NEW['Size'].isin(model_size)]
+    df = df.drop(columns="Size")
+
+    leaderboard_table = gr.components.Dataframe(
+        value=df,
+        interactive=False,
+        visible=True,
+    )
+    return leaderboard_table
+
 def get_dataset_csv(
     model_size: List[str],
 ):
@@ -146,6 +159,17 @@ def get_dataset_classfier_gen(
     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
 
+def get_ChineseGuardBench(
+    model_size: List[str],
+    main_choice: List[str],
+):
+    leaderboard_table = get_dataset_new_csv(model_size)
+    # elif main_choice != "Subclass":
+    #     subclass_choice = main_choice
+    #     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
+    return leaderboard_table
+
+
 def get_dataset_classfier_per(
     model_size: List[str],
     main_choice: List[str],
@@ -200,6 +224,11 @@ with gr.Blocks() as demo:
             dataframe_all_per = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
+
+        with gr.TabItem("π NEW", elem_id="od-benchmark-tab-table", id=7):
+            dataframe_all_guardbench = gr.components.Dataframe(
+                elem_id="leaderboard-table",
+            )
 
     # ----------------- modify text -----------------
     with gr.Row():
@@ -261,6 +290,24 @@ with gr.Blocks() as demo:
         outputs=dataframe_all_gen,
     )
 
+    # this is new results for ChineseGuardBench
+    # main_choice.change(
+    #     get_ChineseGuardBench,
+    #     inputs=[model_choice, main_choice],
+    #     outputs=dataframe_all_guardbench,
+    # )
+
+    model_choice.change(
+        get_ChineseGuardBench,
+        inputs=[model_choice, main_choice],
+        outputs=dataframe_all_guardbench,
+    )
+
+    demo.load(
+        fn=get_ChineseGuardBench,
+        inputs=[model_choice, main_choice],
+        outputs=dataframe_all_guardbench,
+    )
 
 demo.launch(share=True)
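Taken together, the app.py additions form one small pipeline: read data/ChineseGuardBench.csv once at import time, filter rows to the selected "Size" buckets, drop that helper column, and re-render the table whenever the size checkboxes change, plus once on page load so the new tab is populated. Below is a minimal, self-contained sketch of the same pattern, assuming only gradio and pandas; the sample frame, size buckets, and widget names are illustrative stand-ins, not the Space's real code (the committed get_ChineseGuardBench also accepts main_choice but ignores it, so the sketch drops that parameter).

from typing import List

import gradio as gr
import pandas as pd

# Stand-in for ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", ...);
# the rows below are placeholders, not benchmark results.
ORIGINAL_DF_NEW = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "Size": ["1B~5B", ">65B"],
    "F1": [89.63, 87.92],
})
SIZES = sorted(ORIGINAL_DF_NEW["Size"].unique())

def get_dataset_new_csv(model_size: List[str]):
    # Keep only the ticked size buckets, then hide the filter column.
    df = ORIGINAL_DF_NEW[ORIGINAL_DF_NEW["Size"].isin(model_size)]
    df = df.drop(columns="Size")
    return gr.components.Dataframe(value=df, interactive=False, visible=True)

with gr.Blocks() as demo:
    model_choice = gr.CheckboxGroup(choices=SIZES, value=SIZES, label="Model size")
    table = gr.components.Dataframe(elem_id="leaderboard-table")
    # Same wiring as the commit: refresh on filter change and once at startup.
    model_choice.change(get_dataset_new_csv, inputs=[model_choice], outputs=table)
    demo.load(fn=get_dataset_new_csv, inputs=[model_choice], outputs=table)

demo.launch()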
assets/text.py CHANGED

@@ -35,7 +35,8 @@ EVALUTION_TEXT= """
 We evaluate the models using two methods: perplexity(multiple choice) and generation.
 For perplexity, we select the label which is the lowest perplexity as the predicted results.
 For generation, we use the content generated by the model to make prediction.
-The following are the results of the evaluation.
+The following are the results of the evaluation.
+In the "New" table, we present evaluation results for generation on a meticulously curated new benchmark, with details of its processing to be introduced later.πππ
 </span> <br><br>
 
 
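For readers unfamiliar with the perplexity protocol this text describes: each candidate label is appended to the prompt, the model scores every resulting sequence, and the label whose sequence has the lowest perplexity is taken as the prediction. A rough sketch of that idea, assuming a Hugging Face causal LM; the model name and prompt are placeholders, not the Space's actual evaluation harness.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder model; any causal LM works for this illustration.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
model.eval()

def pick_label(prompt: str, labels: list[str]) -> str:
    """Score prompt+label for each candidate; return the lowest-perplexity label."""
    ppls = []
    for label in labels:
        ids = tok(prompt + label, return_tensors="pt").input_ids
        with torch.no_grad():
            # With labels=input_ids the model returns the mean token
            # cross-entropy; exp(loss) is the sequence perplexity.
            loss = model(ids, labels=ids).loss
        ppls.append(torch.exp(loss).item())
    return labels[ppls.index(min(ppls))]

print(pick_label("Query: ...\nIs this content safe or unsafe? Answer: ", ["safe", "unsafe"]))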
data/ChineseGuardBench.csv ADDED

@@ -0,0 +1,33 @@
+Model,Size,F1,Accuracy,Precision,Recall,FPR,FNR
+Deepexi-Guard-3B,1B~5B,89.63 ,89.72 ,85.53 ,94.15 ,14.24 ,5.85
+Qwen3-32B,~30B,88.54 ,89.25 ,89.08 ,88.02 ,9.64 ,11.98
+Qwen3-235B-A22B,>65B,87.92 ,88.96 ,90.86 ,85.17 ,7.66 ,14.83
+Qwen3-235B-A22B-Instruct-2507,>65B,87.81 ,89.13 ,93.27 ,82.96 ,5.35 ,17.04
+GLM-Z1-9B-0414,5B~10B,87.36 ,88.03 ,87.11 ,87.61 ,11.59 ,12.39
+Qwen2.5-72B-Instruct,>65B,86.81 ,88.27 ,92.50 ,81.79 ,5.93 ,18.21
+QwQ-32B,~30B,86.80 ,88.35 ,93.33 ,81.12 ,5.18 ,18.88
+Phi-4,10B~20B,85.95 ,86.88 ,86.90 ,85.02 ,11.45 ,14.98
+Gemma-3-27B-it,~30B,85.29 ,86.78 ,89.83 ,81.19 ,8.22 ,18.81
+DeepSeek-R1-0528,>65B,85.24 ,87.47 ,96.02 ,76.63 ,2.84 ,23.37
+Mistral-Small-3.2-24B-Instruct,~30B,85.07 ,87.03 ,93.14 ,78.29 ,5.15 ,21.71
+GLM-4-9B-chat,5B~10B,84.85 ,86.27 ,88.47 ,81.52 ,9.49 ,18.48
+MD-Judge-v0_2-internlm2_7B,5B~10B,84.63 ,85.88 ,87.03 ,82.37 ,10.98 ,17.63
+DeepSeek-R1-Distill-Qwen-32B,~30B,84.55 ,86.64 ,93.05 ,77.47 ,5.17 ,22.53
+Hunyuan-A13B-Instruct,>65B,84.32 ,86.21 ,90.97 ,78.58 ,6.98 ,21.42
+Moonlight-16B-A3B-Instruct,10B~20B,84.21 ,84.35 ,80.41 ,88.38 ,19.25 ,11.62
+GLM-Z1-32B-0414,~30B,83.40 ,85.75 ,92.63 ,75.85 ,5.40 ,24.15
+Qwen3-8B,5B~10B,83.05 ,85.51 ,92.69 ,75.23 ,5.30 ,24.77
+Qwen2.5-7B-Instruct,5B~10B,82.96 ,84.99 ,89.41 ,77.37 ,8.20 ,22.63
+Qwen2.5-1.5B-Instruct,1B~5B,79.48 ,77.08 ,68.83 ,94.03 ,38.07 ,5.97
+shieldgemma-2B,1B~5B,79.19 ,79.63 ,76.50 ,82.06 ,22.54 ,17.94
+Qwen2.5-3B-Instruct,1B~5B,79.05 ,77.57 ,70.69 ,89.66 ,33.25 ,10.34
+SHTEC_safety_fence_model_7B,5B~10B,78.44 ,82.48 ,93.54 ,67.54 ,4.17 ,32.46
+Qwen3-4B,1B~5B,78.16 ,82.50 ,95.12 ,66.33 ,3.04 ,33.67
+SmolLM3-3B,1B~5B,76.10 ,79.19 ,83.09 ,70.19 ,12.77 ,29.81
+ERNIE-4.5-21B-A3B-Paddle,~20B,75.21 ,80.58 ,94.58 ,62.42 ,3.20 ,37.58
+Qwen3-1.7B,1B~5B,74.46 ,79.34 ,89.36 ,63.82 ,6.79 ,36.18
+internlm2_5-7B-chat,5B~10B,71.52 ,78.49 ,95.34 ,57.22 ,2.50 ,42.78
+Llama-Guard-4-12B,10B~20B,65.66 ,74.64 ,90.99 ,51.36 ,4.54 ,48.64
+Llama-Guard-3-8B,5B~10B,59.33 ,72.44 ,97.80 ,42.58 ,0.86 ,57.42
+DeepSeek-R1-Distill-Qwen-7B,5B~10B,45.27 ,65.53 ,90.36 ,30.20 ,2.88 ,69.80
+Gemma-3n-E4B-it,5B~10B,44.05 ,64.88 ,88.80 ,29.29 ,3.30 ,70.71
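Because the new table stores Precision, Recall, F1, and FNR side by side (all in percent), its columns can be cross-checked: F1 = 2PR/(P+R) and FNR = 100 - Recall. For Deepexi-Guard-3B, 2 x 85.53 x 94.15 / (85.53 + 94.15) ≈ 89.63, which matches the stored F1. A small sanity-check script along those lines, reading the file the same way app.py does:

import pandas as pd

df = pd.read_csv("./data/ChineseGuardBench.csv", encoding="utf-8")
# The numeric cells carry trailing spaces ("89.63 ,..."), so coerce explicitly.
for col in ["F1", "Accuracy", "Precision", "Recall", "FPR", "FNR"]:
    df[col] = pd.to_numeric(df[col])

f1 = 2 * df["Precision"] * df["Recall"] / (df["Precision"] + df["Recall"])
# Both deviations should be tiny, on the order of the rounding in the stored columns.
print("max |F1 - 2PR/(P+R)|  :", (f1 - df["F1"]).abs().max())
print("max |FNR - (100 - R)| :", ((100 - df["Recall"]) - df["FNR"]).abs().max())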