File size: 9,306 Bytes
927e909 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Data-Viewer tab ---- polished and corrected version
"""
import gradio as gr
import pandas as pd
import json, random
from pathlib import Path
import re # 导入re模块
# ---------- Paths ----------
BASE_DIR = Path(__file__).resolve().parent.parent
DATA_VIEWER_FILE = BASE_DIR / "data" / "data_viewer.jsonl"
# ---------- Utilities ----------
def load_data_viewer_data() -> pd.DataFrame:
    """Load the data-viewer records from ``DATA_VIEWER_FILE`` into a DataFrame.

    The file is read as JSON Lines (one JSON object per physical line).
    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped.

    Returns:
        A DataFrame with one row per record and the ``id`` column cast to
        ``str``.  If the file does not exist, yields no usable record, or
        the records lack any of the required columns, an empty DataFrame
        carrying exactly the required columns is returned instead, so that
        callers can rely on the columns being present.
    """
    required = ["model_name", "id", "prompt", "article", "overall_score",
                "comprehensiveness_score", "insight_score",
                "instruction_following_score", "readability_score"]

    records = []
    if DATA_VIEWER_FILE.exists():
        # Iterate the file object rather than calling ``str.splitlines()``:
        # ``splitlines`` also breaks on characters such as U+2028, U+2029
        # and U+0085, which may appear unescaped inside a JSON string and
        # would cut such a record in two (both halves then fail to parse
        # and the record is silently lost).
        with DATA_VIEWER_FILE.open(encoding="utf-8") as handle:
            for line in handle:
                if not line.strip():
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    # Best effort: a malformed line must not abort the load.
                    continue
                # Only JSON objects can become rows; a stray scalar or list
                # would make the DataFrame construction below misbehave.
                if isinstance(record, dict):
                    records.append(record)

    df = pd.DataFrame(records)
    if df.empty or not all(col in df.columns for col in required):
        # Missing data or missing columns: hand back an empty frame that
        # still has every expected column, to avoid errors downstream.
        return pd.DataFrame(columns=required)

    df["id"] = df["id"].astype(str)
    return df
def make_user_task_markdown(item_id, prompt):
    """Return the Markdown shown in the "User Task" card.

    Args:
        item_id: Task identifier, interpolated after the "Task ID" label.
        prompt: Task description, interpolated after the "Description" label.

    Returns:
        A three-line Markdown string: the heading, the task-ID line and the
        description line, separated by single newlines.
    """
    sections = (
        "### User Task 🎯",
        f"**Task ID:** {item_id}",
        f"**Description:** {prompt}",
    )
    return "\n".join(sections)
def make_article_markdown(article: str) -> str:
    """Build the Markdown for the "Generated Article" card.

    A non-empty string is normalised in several ordered passes:

    1. runs of two or more newlines are collapsed to exactly two;
    2. table-like regions (text from a ``|`` to the end of its line plus
       the following ``|`` rows) are swapped for placeholders so the later
       passes cannot touch them;
    3. three regex fix-ups put ``* **Title**:`` style list items on their
       own lines;
    4. a blank line is inserted between two adjacent non-empty lines unless
       the first starts with ``*`` or ``#`` or the second starts with ``*``;
    5. the stashed tables are put back.

    Anything else (``None``, an empty string, a non-string value) skips the
    normalisation; ``None`` is rendered as an empty body.

    Args:
        article: The article text to display.

    Returns:
        The card heading followed by a newline and the (possibly
        normalised) article body.
    """
    heading = "### Generated Article 📖"

    # Guard clause: only non-empty strings are normalised.
    if not (article and isinstance(article, str)):
        passthrough = "" if article is None else article
        return f"{heading}\n{passthrough}"

    # Pass 1: normalise existing runs of newlines.
    text = re.sub(r'\n{2,}', '\n\n', article)

    # Pass 2: protect table regions behind numbered placeholders.
    stashed_tables = []

    def _stash(found):
        placeholder = f'__TABLE_PLACEHOLDER_{len(stashed_tables)}__'
        stashed_tables.append(found.group(1))
        return placeholder

    text = re.sub(r'(\|[^\n]*\n(?:[\|\s\-:]+\n)?(?:\|[^\n]*\n)*)', _stash, text)

    # Pass 3: list fix-ups, applied strictly in this order:
    #   - make sure "* **Title**:" is preceded by a line break,
    #   - split a "* **..." that directly follows another such item,
    #   - break before a "* **" that follows a "[123...]" citation marker.
    bullet_fixups = (
        (r'(?<!\n)\*\s*\*\*([^*]+?)\*\*:', r'\n\n* **\1**:'),
        (r'\*\s*\*\*([^*]+?)\*\*:\s*([^*]*?)\s*\*\s*\*\*', r'* **\1**: \2\n * **'),
        (r'(?<!\n)\[\d+[^\]]*\]\*\s*\*\*', r'\n\n* **'),
    )
    for pattern, replacement in bullet_fixups:
        text = re.sub(pattern, replacement, text)

    # Pass 4: turn lone line breaks between ordinary lines into paragraph
    # breaks, leaving list items and the line after a heading alone.  The
    # final row is paired with '' so it never gets a trailing blank line.
    rows = text.split('\n')
    spaced = []
    for current, upcoming in zip(rows, rows[1:] + ['']):
        spaced.append(current)
        here, ahead = current.strip(), upcoming.strip()
        if not here or not ahead:
            continue
        if here.startswith(('*', '#')) or ahead.startswith('*'):
            continue
        spaced.append('')
    text = '\n'.join(spaced)

    # Pass 5: restore the protected tables.
    for index, table in enumerate(stashed_tables):
        text = text.replace(f'__TABLE_PLACEHOLDER_{index}__', table)

    return f"{heading}\n{text}"
def make_scores_html(overall, comprehensiveness, insight, instruction, readability):
    """Render the five scores as one HTML card with equally sized columns.

    Args:
        overall: Value shown under "Overall Score".
        comprehensiveness: Value shown under "Comprehensiveness Score".
        insight: Value shown under "Insight Score".
        instruction: Value shown under "Instruction-Following Score".
        readability: Value shown under "Readability Score".

    Each value is interpolated as-is.  A missing value -- ``None`` or a
    NaN/NA scalar (what pandas uses for an absent cell) -- is shown as
    "N/A".

    Returns:
        An HTML fragment: an outer card ``<div>`` wrapping a flex row with
        one title/value cell per score.
    """
    labelled_scores = [
        ("Overall Score", overall),
        ("Comprehensiveness Score", comprehensiveness),
        ("Insight Score", insight),
        ("Instruction-Following Score", instruction),
        ("Readability Score", readability),
    ]

    def _display(score):
        # Checking only ``is None`` would let NaN through and print "nan".
        if score is None:
            return "N/A"
        if pd.api.types.is_scalar(score) and pd.isna(score):
            return "N/A"
        return score

    cells = [
        "\n"
        '<div style="text-align: center; padding: 8px 5px; flex-grow: 1; flex-basis: 0;">\n'
        f'<h4 style="margin: 0 0 6px 0; font-size: 1.2em; color: #4a4a4a; font-weight: 600;">{title}</h4>\n'
        f'<p style="margin: 0; font-size: 1.2em; font-weight: bold; color: #333;">{_display(score)}</p>\n'
        "</div>\n"
        for title, score in labelled_scores
    ]
    items_html = "".join(cells)

    # Outer container styled to mimic the .card class from the main CSS block.
    return (
        "\n"
        '<div style="background:#fff; border:1px solid #e0e0e0; border-radius:8px; padding: 18px 15px; margin:18px 0; box-shadow:0 2px 4px rgba(0,0,0,.06);">\n'
        '<div style="display: flex; justify-content: space-between; align-items: flex-start;">\n'
        f"{items_html}\n"
        "</div>\n"
        "</div>"
    )
# ---------- Build the tab ----------
def create_data_viewer_tab():
    """Build the "Data Viewer" tab: pick a model and a task, see the result.

    The tab contains a style block, two dropdowns (model, task), a card with
    the task prompt, a card with the generated article, and a row of scores.
    Data comes from ``load_data_viewer_data()``; when that returns an empty
    frame only a warning message is rendered.  Changing either dropdown
    re-renders the three output components.

    NOTE(review): presumably called from inside the application's
    ``gr.Blocks`` context -- confirm against the caller.

    Returns:
        None.  The components are created for their side effect on the
        surrounding Gradio layout.
    """
    with gr.Tab("🔍Data Viewer"):
        gr.HTML(
            """
<style>
.card{background:#fff;border:1px solid #e0e0e0;border-radius:8px;padding:22px 24px;margin:18px 0;box-shadow:0 2px 4px rgba(0,0,0,.06);}
.scrollable-sm{max-height:260px;overflow-y:auto;}
.scrollable-lg{max-height:700px;overflow-y:auto;} /* 调整高度为分数区域腾出空间 */
.card p{color:#424242 !important;line-height:1.75;margin:0 0 14px 0;text-align:justify;}
.card ul,.card ol{margin:12px 0 12px 24px;color:#424242 !important;}
.card li{margin:4px 0;color:#424242 !important;}
.card blockquote{border-left:4px solid #3498db;margin:18px 0;padding:14px 18px;background:#f8f9fa;font-style:italic;color:#555 !important;}
.card pre{background:#f8f8f8;color:#333 !important;padding:18px;border-radius:6px;overflow-x:auto;border:1px solid #e0e0e0;}
.card strong,.card b{font-weight:700 !important;}
.card::-webkit-scrollbar{width:10px}
.card::-webkit-scrollbar-track{background:#f5f5f5;border-radius:5px}
.card::-webkit-scrollbar-thumb{background:#c0c0c0;border-radius:5px}
.card::-webkit-scrollbar-thumb:hover{background:#a0a0a0}
</style>
"""
        )

        df = load_data_viewer_data()
        if df.empty:
            gr.Markdown("## ⚠️ 没有可用数据 \n请确认 `data/data_viewer.jsonl` 存在且字段齐全(包括所有分数)。")
            return

        # Rows without a model name cannot be selected and would make
        # ``sorted`` fail when NaN is compared with strings.
        models = sorted(df["model_name"].dropna().unique())

        # One dropdown entry per distinct (id, prompt) pair, ordered by the
        # numeric value of the id.  ``errors="coerce"`` maps a non-numeric
        # id to NaN (sorted last) instead of raising.
        tasks_df = (
            df[["id", "prompt"]]
            .drop_duplicates()
            .assign(id_num=lambda frame: pd.to_numeric(frame["id"], errors="coerce"))
            .sort_values("id_num")
        )

        task_choices = []
        for _, row in tasks_df.iterrows():
            prompt_text = row["prompt"]
            if not isinstance(prompt_text, str):
                # A missing or non-string prompt must not break slicing.
                prompt_text = "" if pd.isna(prompt_text) else str(prompt_text)
            # NOTE(review): ids up to 50 get a 30-character preview, all
            # others 60; the reason for the cut-off is not visible here --
            # presumably the two id ranges hold different kinds of prompts.
            limit = 30 if row["id_num"] <= 50 else 60
            preview = prompt_text[:limit] + ("…" if len(prompt_text) > limit else "")
            task_choices.append(f"{row['id']}. {preview}")

        init_model = random.choice(models) if models else None
        init_task = random.choice(task_choices) if task_choices else None

        def fetch(model, task_disp):
            """Return (task markdown, article markdown, scores HTML) for a selection."""
            if not model or not task_disp:
                msg = "请选择模型和任务。"
                return make_user_task_markdown("--", msg), make_article_markdown(msg), ""

            # Dropdown labels look like "<id>. <preview>"; recover the id.
            item_id = task_disp.split(".", 1)[0].strip()
            entry = df[(df["model_name"] == model) & (df["id"] == item_id)]
            if entry.empty:
                err = f"未找到模型 **{model}** 对应任务 **{item_id}** 的内容或分数。"
                return make_user_task_markdown(item_id, err), make_article_markdown(err), ""

            # Use the first matching row.
            record = entry.iloc[0]
            scores_content = make_scores_html(
                record["overall_score"],
                record["comprehensiveness_score"],
                record["insight_score"],
                record["instruction_following_score"],
                record["readability_score"],
            )
            return (
                make_user_task_markdown(item_id, record["prompt"]),
                make_article_markdown(record["article"]),
                scores_content,
            )

        # Initial content.  ``fetch`` already returns the "please select"
        # placeholders when either selection is missing, so no separate
        # branch is needed.
        init_user, init_article, init_scores = fetch(init_model, init_task)

        with gr.Row():
            model_dd = gr.Dropdown(label="Select Model", choices=models, value=init_model, interactive=True)
            task_dd = gr.Dropdown(label="Select Task", choices=task_choices, value=init_task, interactive=True)

        # The initial content is handed to the constructors through ``value``
        # instead of assigning ``.value`` on the finished components.
        user_md = gr.Markdown(value=init_user, elem_classes=["card", "scrollable-sm"])
        article_md = gr.Markdown(value=init_article, elem_classes=["card", "scrollable-lg"])
        scores_html = gr.HTML(value=init_scores)  # HTML component that shows the scores

        model_dd.change(fetch, inputs=[model_dd, task_dd], outputs=[user_md, article_md, scores_html])
        task_dd.change(fetch, inputs=[model_dd, task_dd], outputs=[user_md, article_md, scores_html])