Jay committed · Commit b93bb99 · 1 Parent(s): 6fc8478

doc: update changelog

Browse files:
- app.py +3 -6
- assets/text.py +3 -4
- changelog.md +12 -1
app.py
CHANGED
@@ -15,7 +15,6 @@ ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')
 ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding='utf-8') # new table
 
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
-# METRICS_GuardBench = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
 
 SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
 
@@ -165,9 +164,6 @@ def get_ChineseGuardBench(
     main_choice: List[str],
 ):
     leaderboard_table = get_dataset_new_csv(model_size)
-    # elif main_choice != "Subclass":
-    #     subclass_choice = main_choice
-    #     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
 
 
@@ -216,12 +212,12 @@ with gr.Blocks() as demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # ----------------- modify text -----------------
 
-        with gr.TabItem("π Generation", elem_id="od-benchmark-tab-table", id=
+        with gr.TabItem("π Generation", elem_id="od-benchmark-tab-table", id=5):
            dataframe_all_gen = gr.components.Dataframe(
                elem_id="leaderboard-table",
            )
 
-        with gr.TabItem("π Perplexity", elem_id="od-benchmark-tab-table", id=
+        with gr.TabItem("π Perplexity", elem_id="od-benchmark-tab-table", id=6):
            dataframe_all_per = gr.components.Dataframe(
                elem_id="leaderboard-table",
            )
@@ -292,6 +288,7 @@ with gr.Blocks() as demo:
     )
 
     # this is new results for ChineseGuardBench
+
     # main_choice.change(
    #     get_ChineseGuardBench,
    #     inputs=[model_choice, main_choice],
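For orientation, the sketch below shows how the listener this commit leaves commented out could be wired up end to end. It is a minimal sketch under stated assumptions, not the Space's actual code: the size buckets, the dropdown options, the `Size` column, and the body of `get_dataset_new_csv` are all hypothetical, since the diff shows only the function's signature and call site.

```python
from typing import List

import gradio as gr
import pandas as pd

ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding="utf-8")

def get_dataset_new_csv(model_size: List[str]) -> pd.DataFrame:
    # Hypothetical loader: filter on an assumed "Size" column; the real
    # implementation is not shown in the diff.
    if not model_size:
        return ORIGINAL_DF_NEW
    return ORIGINAL_DF_NEW[ORIGINAL_DF_NEW["Size"].isin(model_size)]

def get_ChineseGuardBench(model_size: List[str], main_choice: List[str]):
    # Matches the simplified version in the diff: main_choice is still
    # accepted but no longer branches into the subclass tables.
    return get_dataset_new_csv(model_size)

with gr.Blocks() as demo:
    model_choice = gr.CheckboxGroup(["<7B", "7B-13B", ">13B"], label="Model size")  # assumed buckets
    main_choice = gr.Dropdown(["Overall", "Subclass"], label="View")  # assumed options
    dataframe_new = gr.components.Dataframe(elem_id="leaderboard-table")
    # The listener the commit keeps commented out, re-enabled here:
    main_choice.change(
        get_ChineseGuardBench,
        inputs=[model_choice, main_choice],
        outputs=dataframe_new,
    )

demo.launch()
```

Binding `outputs` to the Dataframe component means any change to `main_choice` re-renders the table in place, which is presumably why the app routes both selectors through a single callback.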
assets/text.py
CHANGED
@@ -34,14 +34,13 @@ EVALUTION_TEXT= """
 <span style="font-size:16px; font-family: 'Times New Roman', serif">
 We evaluate the models using two methods: perplexity(multiple choice) and generation.
 For perplexity, we select the label which is the lowest perplexity as the predicted results.
-For generation, we use the content generated by the model to make prediction.
-
-
+For generation, we use the content generated by the model to make prediction.
+In the "New" table, we present evaluation results for generation on a meticulously curated new benchmark, with details of its processing to be introduced later.
+The following are the results of the evaluation.πππ
 </span> <br><br>
 
 
 """  # noqa
-
 REFERENCE_TEXT = """
 # References
 <span style="font-size:16px; font-family: 'Times New Roman', serif">
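The perplexity method described in this text has a standard implementation: score each candidate label by the causal-LM loss of the prompt followed by that label, and predict the label with the lowest perplexity. The sketch below is one plausible reading, not the authors' code; the model name (taken from the changelog), the prompt format, and the choice to score the whole sequence rather than only the label tokens are all assumptions.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative model; it appears in the changelog but the evaluated set is not shown here.
model_name = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

def label_perplexity(prompt: str, label: str) -> float:
    # Perplexity = exp(mean token-level cross-entropy) of "prompt + label".
    ids = tokenizer(prompt + label, return_tensors="pt").input_ids
    with torch.no_grad():
        loss = model(input_ids=ids, labels=ids).loss
    return torch.exp(loss).item()

def predict(prompt: str, labels: list[str]) -> str:
    # "Select the label which is the lowest perplexity as the predicted result."
    return min(labels, key=lambda lab: label_perplexity(prompt, lab))

# Hypothetical prompt format for a Chinese safety benchmark:
print(predict("判断下列文本是否安全。\n答案：", ["安全", "不安全"]))
```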
changelog.md
CHANGED
@@ -1,5 +1,6 @@
 # CHANGELOG
 
+
 ### 2024-7-16
 version: v1.0.0
 
@@ -66,4 +67,14 @@ version: v1.0.6
 - Deepseek-chat-v3-0324
 - Qwen3
 - Gemma-3
-- OpenThinker2
+- OpenThinker2
+
+### 2025-7-29
+version: v1.0.7
+
+changed:
+- [1]feat: Update the two models required by Deepexi.
+  - Deepexi-Guard-3B
+  - Qwen2.5-3B-Instruct
+
+- [2]feat: Update a new table ChineseGuardBench required by Deepexi.