|
|
|
|
|
|
|
import json |
|
import os |
|
import csv |
|
from pathlib import Path |
|
from collections import defaultdict |
|
|
|
|
|
def calculate_dimension_score(target_score, reference_score):
    """Compute the target model's relative score on one dimension.

    The score is the target's share of the combined total,
    ``target / (target + reference)``, which lies in [0, 1].

    Args:
        target_score: Weighted-average score of the target model.
        reference_score: Weighted-average score of the reference model.

    Returns:
        float: Relative score in [0, 1]. When both inputs are 0 the result
        is 0.5 (a tie) — the original expression would raise
        ZeroDivisionError in that case.
    """
    total = target_score + reference_score
    if total == 0:
        # Both models scored zero on this dimension: treat as a tie
        # instead of dividing by zero.
        return 0.5
    return target_score / total
|
|
|
|
|
def process_model_data(model_file):
    """Aggregate the per-record scores in one model's JSONL results file.

    Each line of the file is a JSON object carrying an ``overall_score``
    plus ``target_<dim>_weighted_avg`` / ``reference_<dim>_weighted_avg``
    pairs for four dimensions. Malformed lines are logged and skipped.

    Args:
        model_file (Path): Path to the model's ``.jsonl`` results file;
            the file stem is used as the model name.

    Returns:
        dict: Keys ``model``, ``overall_score``, ``comprehensiveness``,
        ``insight``, ``instruction_following``, ``readability``; all
        numeric values are averages scaled to a 0–100 range.

    Raises:
        ValueError: If the file contains no parsable records (the
        original code would raise a confusing ZeroDivisionError here).
    """
    model_name = model_file.stem
    print(f"正在处理模型: {model_name}")

    # The four relative dimensions share the same JSON key pattern, so a
    # single loop replaces the original four copy-pasted stanzas.
    dimensions = ('comprehensiveness', 'insight',
                  'instruction_following', 'readability')
    overall_scores = []
    dim_scores = {dim: [] for dim in dimensions}

    with open(model_file, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line.strip())

                overall_scores.append(data.get('overall_score', 0))

                for dim in dimensions:
                    target = data.get(f'target_{dim}_weighted_avg', 0)
                    reference = data.get(f'reference_{dim}_weighted_avg', 0)
                    dim_scores[dim].append(
                        calculate_dimension_score(target, reference))

            except json.JSONDecodeError as e:
                print(f"解析JSON时出错 (模型: {model_name}): {e}")
                continue
            except Exception as e:
                # Best-effort per-record handling: one bad record must not
                # abort the whole file.
                print(f"处理数据时出错 (模型: {model_name}): {e}")
                continue

    if not overall_scores:
        # Empty or fully unparsable file — raise a clear error so the
        # caller's per-file try/except reports it instead of crashing on
        # a division by zero.
        raise ValueError(f"no valid records in {model_file}")

    avg_overall = sum(overall_scores) / len(overall_scores)
    averages = {dim: sum(scores) / len(scores)
                for dim, scores in dim_scores.items()}

    print(f" - 处理了 {len(overall_scores)} 条记录")
    print(f" - 总分: {avg_overall:.4f}")

    # Scale every average to 0–100 for the leaderboard.
    result = {'model': model_name, 'overall_score': avg_overall * 100}
    for dim in dimensions:
        result[dim] = averages[dim] * 100
    return result
|
|
|
|
|
def rank_leaderboard():
    """Build the leaderboard CSV from every model's raw JSONL results.

    Scans ``<project_root>/data/raw_results`` for ``*.jsonl`` files, runs
    each through ``process_model_data``, sorts the models by descending
    overall score, and writes ``<project_root>/data/leaderboard.csv`` with
    all numeric columns formatted to two decimal places. Files that fail
    to process are logged and skipped.
    """
    root = Path(__file__).parent.parent
    results_dir = root / "data" / "raw_results"
    csv_path = root / "data" / "leaderboard.csv"

    jsonl_files = list(results_dir.glob("*.jsonl"))
    print(f"找到 {len(jsonl_files)} 个模型结果文件")
    if not jsonl_files:
        print("未找到任何JSONL文件")
        return

    # Best-effort aggregation: one broken file must not sink the rest.
    rows = []
    for path in jsonl_files:
        try:
            rows.append(process_model_data(path))
        except Exception as e:
            print(f"处理文件 {path.name} 时出错: {e}")

    # Highest overall score first.
    rows.sort(key=lambda row: row['overall_score'], reverse=True)

    columns = ['model', 'overall_score', 'comprehensiveness',
               'insight', 'instruction_following', 'readability']
    with open(csv_path, 'w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        for row in rows:
            record = {'model': row['model']}
            record.update({col: f"{row[col]:.2f}" for col in columns[1:]})
            writer.writerow(record)

    print(f"\n排行榜已保存到: {csv_path}")
    print(f"共处理了 {len(rows)} 个模型")
|
|
|
|
|
# Script entry point: build the leaderboard CSV, then report completion.
if __name__ == "__main__":

    rank_leaderboard()

    print("排行榜计算完成!")
|
|