#!/usr/bin/env python3
# -*- coding: utf-8 -*-
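"""Merge per-model raw article data with evaluation scores into one JSONL file.

Assumed layout (inferred from the paths used below; adjust to your repository):
    <project_root>/data/raw_data/<model_name>.jsonl                  raw articles, one model per file
    <project_root>/data/raw_results/<model_name>/raw_results.jsonl   scores for that model
    <project_root>/data/data_viewer.jsonl                            merged output
"""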
import json
from pathlib import Path


def load_scores_for_model(model_results_dir: Path):
    """Load per-article scores for one model from its raw_results.jsonl."""
    scores_by_id = {}
    raw_results_file = model_results_dir / "raw_results.jsonl"
    if not raw_results_file.exists():
        print(f"Warning: results file for model {model_results_dir.name} not found: {raw_results_file}")
        return scores_by_id
    print(f"  Loading scores from {model_results_dir.name}/raw_results.jsonl ...")
    with open(raw_results_file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            try:
                data = json.loads(line.strip())
                raw_id = data.get('id')
                # Check before casting: str(None) is the truthy string "None".
                if raw_id is None:
                    print(f"  Warning: line {i+1} of {model_results_dir.name} is missing an ID; skipped.")
                    continue
                article_id = str(raw_id)
                # Raw scores are on a 0-1 scale; rescale to 0-100, two decimals.
                overall_score_raw = data.get('overall_score', 0.0)
                comprehensiveness_score_raw = data.get('comprehensiveness', 0.0)
                insight_score_raw = data.get('insight', 0.0)
                instruction_score_raw = data.get('instruction_following', 0.0)
                readability_score_raw = data.get('readability', 0.0)
                scores_by_id[article_id] = {
                    'overall_score': f"{overall_score_raw * 100:.2f}",
                    'comprehensiveness_score': f"{comprehensiveness_score_raw * 100:.2f}",
                    'insight_score': f"{insight_score_raw * 100:.2f}",
                    'instruction_following_score': f"{instruction_score_raw * 100:.2f}",
                    'readability_score': f"{readability_score_raw * 100:.2f}"
                }
            except json.JSONDecodeError as e:
                print(f"  Error: failed to parse JSON (file: {model_results_dir.name}, line: {i+1}): {e}")
            except Exception as e:
                print(f"  Error: failed to process data (file: {model_results_dir.name}, line: {i+1}): {e}")
    print(f"  Loaded scores for {len(scores_by_id)} articles for model {model_results_dir.name}")
    return scores_by_id
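
# Illustrative only: each raw_results.jsonl line is assumed to carry 0-1 scores
# under the keys read above (inferred from the .get() calls), e.g.:
#   {"id": "42", "overall_score": 0.87, "comprehensiveness": 0.90,
#    "insight": 0.78, "instruction_following": 0.95, "readability": 0.88}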

def merge_jsonl_files():
    """Merge raw article data with per-model scores into data_viewer.jsonl."""
    project_root = Path(__file__).resolve().parent.parent
    raw_data_dir = project_root / "data" / "raw_data"
    raw_results_dir = project_root / "data" / "raw_results"
    output_file = project_root / "data" / "data_viewer.jsonl"
    input_files = list(raw_data_dir.glob("*.jsonl"))
    print(f"Found {len(input_files)} model JSONL files in {raw_data_dir}")
    if not input_files:
        print("No raw data files found; exiting.")
        return
    all_merged_data = []
    for raw_data_file in input_files:
        # The file stem doubles as the model name, e.g. raw_data/model-a.jsonl.
        model_name = raw_data_file.stem
        print(f"Processing raw data file: {raw_data_file.name} (model: {model_name})")
        model_results_dir = raw_results_dir / model_name
        if not model_results_dir.exists():
            print(f"  Warning: results directory for model {model_name} not found: {model_results_dir}")
            continue
        scores_for_current_model = load_scores_for_model(model_results_dir)
        processed_articles_count = 0
        with open(raw_data_file, 'r', encoding='utf-8') as f_raw:
            for i, line in enumerate(f_raw):
                try:
                    article_data = json.loads(line.strip())
                    raw_id = article_data.get('id')
                    # Same guard as above: cast only after the None check.
                    if raw_id is None:
                        print(f"  Warning: line {i+1} of {raw_data_file.name} is missing an ID; skipped.")
                        continue
                    article_id = str(raw_id)
                    article_scores = scores_for_current_model.get(article_id, {})
                    if not article_scores:
                        print(f"  Warning: no scores found in the results file for article ID {article_id} of model {model_name}.")
                    merged_item = {
                        'model_name': model_name,
                        'id': article_id,
                        'prompt': article_data.get('prompt'),
                        'article': article_data.get('article'),
                        'overall_score': article_scores.get('overall_score'),
                        'comprehensiveness_score': article_scores.get('comprehensiveness_score'),
                        'insight_score': article_scores.get('insight_score'),
                        'instruction_following_score': article_scores.get('instruction_following_score'),
                        'readability_score': article_scores.get('readability_score')
                    }
                    all_merged_data.append(merged_item)
                    processed_articles_count += 1
                except json.JSONDecodeError as e:
                    print(f"  Error: failed to parse raw data JSON (file: {raw_data_file.name}, line: {i+1}): {e}")
                except Exception as e:
                    print(f"  Error: failed to process raw data (file: {raw_data_file.name}, line: {i+1}): {e}")
        print(f"  Processed {processed_articles_count} articles for model {model_name}.")
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for item in all_merged_data:
            # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable in the output.
            f_out.write(json.dumps(item, ensure_ascii=False) + '\n')
    print(f"\nSuccessfully merged and saved to: {output_file}, {len(all_merged_data)} records in total")

if __name__ == "__main__":
    merge_jsonl_files()
    print("All files processed!")