sjdnjn committed
Commit 7a53008 · verified · 1 Parent(s): b9c3284

Update app.py

Files changed (1):
  1. app.py +178 -196

app.py CHANGED
@@ -1,204 +1,186 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
-
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-
-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-
-### Space initialisation
 try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
 try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-
-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
     )

-
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
 import gradio as gr
+import torch
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
 import pandas as pd
+import plotly.express as px
+import os  # used to check whether a file exists
+
+# --- 1. Model loading ---
+# Replace these with the models your team actually selected.
+# Remember: each student integrates at least one model so that commit contributions stay balanced.
+# Large models take noticeably longer to load and may require a higher Space hardware tier.
+# On a free Space, smaller models are recommended for testing.
+
+# --- Model 1: DistilGPT2 (small general-purpose text-generation model) ---
+# Responsible student: [牛正武]
 try:
+    model1_name = "distilbert/distilgpt2"
+    # device=0 means use the first GPU; -1 means run on the CPU when no GPU is available
+    generator1 = pipeline("text-generation", model=model1_name, device=0 if torch.cuda.is_available() else -1)
+    print(f"✅ Model 1 ({model1_name}) loaded successfully!")
+except Exception as e:
+    print(f"❌ Model 1 ({model1_name}) failed to load: {e}")
+    generator1 = None  # set the generator to None if loading fails
+
+# --- Model 2: GPT2 (general-purpose text-generation model) ---
+# Responsible student: [孙世纪·]
 try:
+    model2_name = "gpt2"  # another relatively small general-purpose text-generation model
+    generator2 = pipeline("text-generation", model=model2_name, device=0 if torch.cuda.is_available() else -1)
+    print(f"✅ Model 2 ({model2_name}) loaded successfully!")
+except Exception as e:
+    print(f"❌ Model 2 ({model2_name}) failed to load: {e}")
+    generator2 = None
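+# Note: pipeline(...) downloads the model weights from the Hugging Face Hub on first use and
+# caches them locally, so the first Space start-up is the slowest. AutoModelForCausalLM and
+# AutoTokenizer are imported above but are not used directly in this version of the script.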
+
+# --- [Optional] Model 3: add a third model here if needed ---
+# For example: a translation model, or a dedicated dialogue model
+# model3_name = "Helsinki-NLP/opus-mt-en-zh"  # an English-to-Chinese translation model
+# try:
+#     translator = pipeline("translation_en_to_zh", model=model3_name, device=0 if torch.cuda.is_available() else -1)
+#     print(f"✅ Model 3 ({model3_name}) loaded successfully!")
+# except Exception as e:
+#     print(f"❌ Model 3 ({model3_name}) failed to load: {e}")
+#     translator = None
+
+
+# --- 2. Inference function ---
+# Takes one shared user input and runs it through every model that loaded successfully.
+def generate_text_outputs(prompt, max_length=100):  # max_length parameter controls the generation length
+    output1 = "Model 1 is not loaded or generation failed."
+    output2 = "Model 2 is not loaded or generation failed."
+    # output3 = "Model 3 is not loaded or generation failed."  # if there is a third model
+
+    if generator1:
+        try:
+            # For text-generation models, max_new_tokens controls how much new text is produced
+            gen1_result = generator1(prompt, max_new_tokens=max_length, num_return_sequences=1, truncation=True)
+            output1 = gen1_result[0]['generated_text']
+            # Clean-up: remove the input portion and keep only the generated content
+            if output1.startswith(prompt):
+                output1 = output1[len(prompt):].strip()
+        except Exception as e:
+            output1 = f"Model 1 (DistilGPT2) generation error: {e}"
+
+    if generator2:
+        try:
+            gen2_result = generator2(prompt, max_new_tokens=max_length, num_return_sequences=1, truncation=True)
+            output2 = gen2_result[0]['generated_text']
+            if output2.startswith(prompt):
+                output2 = output2[len(prompt):].strip()
+        except Exception as e:
+            output2 = f"Model 2 (GPT2) generation error: {e}"
+
+    # # If there is a third model
+    # if translator:
+    #     try:
+    #         trans_result = translator(prompt)
+    #         output3 = trans_result[0]['translation_text']
+    #     except Exception as e:
+    #         output3 = f"Model 3 (translation model) generation error: {e}"
+
+    return output1, output2  # if there is a third model, output3 also needs to be returned here
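+# Illustrative usage (assuming both pipelines loaded successfully):
+#     out1, out2 = generate_text_outputs("Once upon a time", max_length=50)
+#     print(out1[:80]); print(out2[:80])
+# The returned tuple has one completion string per model, matching the Arena output boxes below.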
+
+
+# --- 3. GRACE evaluation data (sample data; replace with your actual evaluation results) ---
+# This data feeds the radar chart and table in the "LLM Benchmark" tab.
+# Scores are typically on a 1-5 scale; higher scores mean better performance.
+grace_data = {
+    "Dimension": ["Generalization", "Relevance", "Artistry", "Efficiency"],
+    # Replace with the model names and evaluation scores you actually used
+    "DistilGPT2": [3.5, 3.0, 2.8, 4.5],  # sample scores
+    "GPT2": [4.0, 3.8, 3.5, 4.0]  # sample scores
+    # "YourModel3Name": [4.2, 4.5, 4.0, 3.0]  # if there is a third model
+}
+grace_df = pd.DataFrame(grace_data)
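+# grace_df is wide-form: one row per GRACE dimension and one score column per model;
+# create_benchmark_tab() below draws each model column as a separate radar trace.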
+
+
+# --- 4. Gradio UI construction ---
+
+# Function that builds the LLM Benchmark tab content (30 points)
+def create_benchmark_tab():
+    # Build the radar chart for the first model column
+    fig = px.line_polar(grace_df, r=grace_df.columns[1], theta="Dimension", line_close=True,
+                        range_r=[0, 5], title="GRACE evaluation: model comparison")
+    # Add traces for the other models
+    for col in grace_df.columns[2:]:
+        fig.add_trace(px.line_polar(grace_df, r=col, theta="Dimension", line_close=True).data[0])
+
+    fig.update_traces(fill='toself', opacity=0.6)  # fill the area and add some transparency
+    fig.update_layout(
+        polar=dict(
+            radialaxis=dict(visible=True, range=[0, 5], tickvals=[1,2,3,4,5], ticktext=['1 pt','2 pts','3 pts','4 pts','5 pts'])  # show the tick labels
         ),
+        showlegend=True,  # show the legend
+        # title_font_size=20  # title font size
     )

+    return gr.Column(
+        gr.Markdown("## 📊 Model performance comparison (GRACE evaluation)"),
+        gr.Markdown("This page shows the evaluation results of our chosen models under the GRACE framework. Scores range from 1 to 5; higher is better."),
+        gr.Plot(fig, label="GRACE evaluation radar chart"),
+        gr.Markdown("### GRACE evaluation data"),
+        gr.DataFrame(grace_df, label="Detailed evaluation data")
+    )
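+# Note: depending on the installed Gradio version, layout containers such as gr.Column may not
+# accept child components as positional arguments; if this tab renders empty, an alternative is
+# to create the same components inside a `with gr.Column():` block and return that column.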
+
+# Function that builds the Arena tab content (40 points)
+def create_arena_tab():
+    with gr.Blocks() as arena_block:
+        gr.Markdown("## ⚔️ Arena: live model comparison")
+        gr.Markdown("Enter a piece of text here to see each model's output in real time and compare them directly.")
+
+        with gr.Row():
+            # Shared input box
+            user_input = gr.Textbox(label="Your input:", placeholder="Enter the text or instruction you want the models to process...", lines=3)
+            # Control for the generation length
+            gen_length_slider = gr.Slider(minimum=20, maximum=300, value=100, step=10, label="Maximum length of generated text")
+            generate_btn = gr.Button("🚀 Generate and compare")
+
+        with gr.Row():
+            # Model 1 output
+            output_model1 = gr.Textbox(label="Model 1 (DistilGPT2) output:", interactive=False, lines=10)
+            # Model 2 output
+            output_model2 = gr.Textbox(label="Model 2 (GPT2) output:", interactive=False, lines=10)
+            # # If there is a third model
+            # output_model3 = gr.Textbox(label="Model 3 (translation model) output:", interactive=False, lines=10)
+
+        # Bind the button click to the inference function
+        generate_btn.click(
+            fn=generate_text_outputs,
+            inputs=[user_input, gen_length_slider],
+            outputs=[output_model1, output_model2]  # also add output_model3 here if there is a third model
+        )
+    return arena_block
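+# The inputs list is passed to generate_text_outputs positionally (prompt, max_length), and the
+# returned (output1, output2) tuple is written into the two output textboxes in the same order.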
+
+# Function that builds the Report tab content (30 points)
+def create_report_tab():
+    report_md_path = "report.md"  # assumes your report Markdown file is named report.md
+
+    if os.path.exists(report_md_path):
+        with open(report_md_path, "r", encoding="utf-8") as f:
+            report_content = f.read()
+        return gr.Markdown(report_content)
+    else:
+        return gr.Markdown(f"## ❗ Error: report file '{report_md_path}' was not found.\nPlease make sure `report.md` has been created on the Files page.")
+
+# --- Gradio app UI definition ---
+with gr.Blocks(title="AI Model Comparison Project") as demo:
+    gr.Markdown("# 🤖 AI Model Comparison and Evaluation Platform")
+    gr.Markdown("This platform provides an interactive interface for comparing and analysing how different AI models perform on specific tasks.")
+
+    # Define the tabs
+    with gr.Tab("⚔️ Arena"):
+        # Call the builder and render it directly so the content loads correctly whenever the tab is opened
+        create_arena_tab().render()
+
+    with gr.Tab("📊 LLM Benchmark"):
+        create_benchmark_tab().render()
+
+    with gr.Tab("📝 Report"):
+        # Use gr.Markdown instead of gr.load to display the file content more directly
+        create_report_tab().render()
+
+# Launch the Gradio app
+if __name__ == "__main__":
+    demo.launch()
+
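+# demo.launch() starts the Gradio server; a Gradio-SDK Space typically runs app.py as the main
+# script, so the __main__ guard above still fires when the Space boots.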