Jay committed on
Commit b93bb99 · 1 Parent(s): 6fc8478

doc: update changelog

Files changed (3)
  1. app.py +3 -6
  2. assets/text.py +3 -4
  3. changelog.md +12 -1
app.py CHANGED
@@ -15,7 +15,6 @@ ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')
 ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding='utf-8') # new table
 
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
-# METRICS_GuardBench = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
 
 SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
 
@@ -165,9 +164,6 @@ def get_ChineseGuardBench(
     main_choice: List[str],
 ):
     leaderboard_table = get_dataset_new_csv(model_size)
-    # elif main_choice != "Subclass":
-    #     subclass_choice = main_choice
-    #     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
 
 
@@ -216,12 +212,12 @@ with gr.Blocks() as demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # ----------------- modify text -----------------
 
-        with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=6):
+        with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=5):
            dataframe_all_gen = gr.components.Dataframe(
                elem_id="leaderboard-table",
            )
 
-        with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=5):
+        with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=6):
            dataframe_all_per = gr.components.Dataframe(
                elem_id="leaderboard-table",
            )
@@ -292,6 +288,7 @@ with gr.Blocks() as demo:
     )
 
     # this is new results for ChineseGuardBench
+
     # main_choice.change(
     #     get_ChineseGuardBench,
     #     inputs=[model_choice, main_choice],
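Note: the commit keeps the `main_choice.change(...)` wiring for the new ChineseGuardBench table commented out, and `get_ChineseGuardBench` now ignores `main_choice`. Below is a minimal sketch of how that wiring could look if re-enabled; the checkbox/dropdown choices, the tab label and id, and the `dataframe_all_new` component are illustrative assumptions, not taken from the repository.

```python
# Hypothetical sketch only: get_ChineseGuardBench, model_choice and main_choice
# appear in the diff; the widget choices, tab id and dataframe_all_new are assumptions.
from typing import List

import gradio as gr
import pandas as pd


def get_dataset_new_csv(model_size: List[str]) -> pd.DataFrame:
    # Placeholder for the app's loader; it reads the new table added in this commit.
    return pd.read_csv("./data/ChineseGuardBench.csv", encoding="utf-8")


def get_ChineseGuardBench(model_size: List[str], main_choice: List[str]) -> pd.DataFrame:
    # Mirrors the diff: main_choice is accepted but currently unused.
    return get_dataset_new_csv(model_size)


with gr.Blocks() as demo:
    model_choice = gr.CheckboxGroup(choices=["<10B", ">=10B"], label="Model size")  # assumed choices
    main_choice = gr.Dropdown(choices=["All"], label="Main class")                  # assumed choices

    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("🏅 ChineseGuardBench", elem_id="od-benchmark-tab-table", id=7):  # assumed tab
            dataframe_all_new = gr.components.Dataframe(elem_id="leaderboard-table")

    # The wiring the commit leaves commented out: refresh the table when the selector changes.
    main_choice.change(
        get_ChineseGuardBench,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_new,
    )

demo.launch()
```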
assets/text.py CHANGED
@@ -34,14 +34,13 @@ EVALUTION_TEXT= """
 <span style="font-size:16px; font-family: 'Times New Roman', serif">
 We evaluate the models using two methods: perplexity(multiple choice) and generation.
 For perplexity, we select the label which is the lowest perplexity as the predicted results.
-For generation, we use the content generated by the model to make prediction.
-The following are the results of the evaluation.
-In the "New" table, we present evaluation results for generation on a meticulously curated new benchmark, with details of its processing to be introduced later.👇👇👇
+For generation, we use the content generated by the model to make prediction.
+In the "New" table, we present evaluation results for generation on a meticulously curated new benchmark, with details of its processing to be introduced later.
+The following are the results of the evaluation.👇👇👇
 </span> <br><br>
 
 
 """ # noqa
-
 REFERENCE_TEXT = """
 # References
 <span style="font-size:16px; font-family: 'Times New Roman', serif">
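The edit above only reorders the description, but the perplexity protocol it summarizes ("select the label with the lowest perplexity as the prediction") may be easier to read as code. Everything in the sketch below is an illustrative assumption: the model name, the prompt handling, and the safe/unsafe label set are not part of this commit or the leaderboard's published code.

```python
# Minimal sketch of the "lowest perplexity wins" scheme described in EVALUTION_TEXT.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-3B-Instruct"  # any causal LM; assumed for illustration
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()


def label_perplexity(prompt: str, label: str) -> float:
    """Perplexity of `label` as a continuation of `prompt` (lower = more likely)."""
    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
    full_ids = tokenizer(prompt + label, return_tensors="pt").input_ids
    labels = full_ids.clone()
    labels[:, : prompt_ids.shape[1]] = -100  # score only the label tokens
    with torch.no_grad():
        loss = model(full_ids, labels=labels).loss  # mean NLL over label tokens
    return torch.exp(loss).item()


def predict(prompt: str, labels=("safe", "unsafe")) -> str:
    # Pick the label whose continuation has the lowest perplexity.
    return min(labels, key=lambda lab: label_perplexity(prompt, lab))
```

Averaging the negative log-likelihood over only the label tokens keeps short and long labels comparable; exponentiating that mean gives the per-token perplexity that is minimized.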
changelog.md CHANGED
@@ -1,5 +1,6 @@
 # CHANGELOG
 
+
 ### 2024-7-16
 version: v1.0.0
 
@@ -66,4 +67,14 @@ version: v1.0.6
 - Deepseek-chat-v3-0324
 - Qwen3
 - Gemma-3
-- OpenThinker2
+- OpenThinker2
+
+### 2025-7-29
+version: v1.0.7
+
+changed:
+- [1]feat: Update the two models required by Deepexi.
+- Deepexi-Guard-3B
+- Qwen2.5-3B-Instruct
+
+- [2]feat: Update a new table ChineseGuardBench required by Deepxi.