#!/usr/bin/env python3
# -*- coding: utf-8 -*-
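"""Aggregate per-model JSONL evaluation results into a ranked leaderboard CSV.

For each model, the script averages the overall score and four head-to-head
dimension scores (comprehensiveness, insight, instruction following,
readability), scales them to 0-100, and writes them to data/leaderboard.csv.
"""
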
import json
import csv
from pathlib import Path

def calculate_dimension_score(target_score, reference_score):
    """Compute the relative (target vs. reference) score for one dimension."""
    total = target_score + reference_score
    # Guard against a zero denominator (both weighted averages are 0): treat it as a tie.
    return target_score / total if total > 0 else 0.5
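# Example (illustrative numbers only): a target weighted average of 7.5 against
# a reference of 2.5 gives 7.5 / (7.5 + 2.5) = 0.75, i.e. the target wins 75%
# of the head-to-head comparison on that dimension; 0.5 is a tie.
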
def process_model_data(model_file):
    """Process a single model's result file and return its average scores."""
    model_name = model_file.stem
    print(f"Processing model: {model_name}")
    overall_scores = []
    comprehensiveness_scores = []
    insight_scores = []
    instruction_following_scores = []
    readability_scores = []
    with open(model_file, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line.strip())
                # Overall score
                overall_score = data.get('overall_score', 0)
                overall_scores.append(overall_score)
                # Scores for the four dimensions
                target_comp = data.get('target_comprehensiveness_weighted_avg', 0)
                ref_comp = data.get('reference_comprehensiveness_weighted_avg', 0)
                comp_score = calculate_dimension_score(target_comp, ref_comp)
                comprehensiveness_scores.append(comp_score)
                target_insight = data.get('target_insight_weighted_avg', 0)
                ref_insight = data.get('reference_insight_weighted_avg', 0)
                insight_score = calculate_dimension_score(target_insight, ref_insight)
                insight_scores.append(insight_score)
                target_instruction = data.get('target_instruction_following_weighted_avg', 0)
                ref_instruction = data.get('reference_instruction_following_weighted_avg', 0)
                instruction_score = calculate_dimension_score(target_instruction, ref_instruction)
                instruction_following_scores.append(instruction_score)
                target_readability = data.get('target_readability_weighted_avg', 0)
                ref_readability = data.get('reference_readability_weighted_avg', 0)
                readability_score = calculate_dimension_score(target_readability, ref_readability)
                readability_scores.append(readability_score)
            except json.JSONDecodeError as e:
                print(f"Failed to parse JSON (model: {model_name}): {e}")
                continue
            except Exception as e:
                print(f"Failed to process record (model: {model_name}): {e}")
                continue
    # Guard against files with no valid records before averaging.
    if not overall_scores:
        raise ValueError(f"No valid records found in {model_file.name}")
    # Compute averages
    avg_overall = sum(overall_scores) / len(overall_scores)
    avg_comprehensiveness = sum(comprehensiveness_scores) / len(comprehensiveness_scores)
    avg_insight = sum(insight_scores) / len(insight_scores)
    avg_instruction_following = sum(instruction_following_scores) / len(instruction_following_scores)
    avg_readability = sum(readability_scores) / len(readability_scores)
    print(f" - Processed {len(overall_scores)} records")
    print(f" - Overall score: {avg_overall:.4f}")
    return {
        'model': model_name,
        'overall_score': avg_overall * 100,
        'comprehensiveness': avg_comprehensiveness * 100,
        'insight': avg_insight * 100,
        'instruction_following': avg_instruction_following * 100,
        'readability': avg_readability * 100
    }
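# A raw_results record is expected to look roughly like this (field names are
# taken from the .get() calls above; the values are illustrative only):
# {"overall_score": 0.62,
#  "target_comprehensiveness_weighted_avg": 7.1,
#  "reference_comprehensiveness_weighted_avg": 5.3,
#  "target_insight_weighted_avg": 6.4,
#  ... and likewise for instruction_following and readability}
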
def rank_leaderboard():
    """Compute the leaderboard and save it to CSV."""
    # Directory layout
    project_root = Path(__file__).parent.parent
    input_dir = project_root / "data" / "raw_results"
    output_file = project_root / "data" / "leaderboard.csv"
    # Collect all JSONL result files
    input_files = list(input_dir.glob("*.jsonl"))
    print(f"Found {len(input_files)} model result files")
    if not input_files:
        print("No JSONL files found")
        return
    # Process each model's data
    model_results = []
    for input_file in input_files:
        try:
            result = process_model_data(input_file)
            model_results.append(result)
        except Exception as e:
            print(f"Failed to process file {input_file.name}: {e}")
            continue
    # Sort by overall score (descending)
    model_results.sort(key=lambda x: x['overall_score'], reverse=True)
    # Write the CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['model', 'overall_score', 'comprehensiveness', 'insight', 'instruction_following', 'readability']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # Header row
        writer.writeheader()
        # Data rows
        for result in model_results:
            writer.writerow({
                'model': result['model'],
                'overall_score': f"{result['overall_score']:.2f}",
                'comprehensiveness': f"{result['comprehensiveness']:.2f}",
                'insight': f"{result['insight']:.2f}",
                'instruction_following': f"{result['instruction_following']:.2f}",
                'readability': f"{result['readability']:.2f}"
            })
    print(f"\nLeaderboard saved to: {output_file}")
    print(f"Processed {len(model_results)} models")


if __name__ == "__main__":
    rank_leaderboard()
    print("Leaderboard computation complete!")