DeepResearch-Leaderboard / utils /rank_leaderboard.py
Ayanami0730's picture
Add DeepResearch Bench application with LFS support
927e909
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import os
import csv
from pathlib import Path
from collections import defaultdict
def calculate_dimension_score(target_score, reference_score):
    """Return the target's share of the combined target and reference score.

    Args:
        target_score: Score of the model being evaluated on one dimension.
        reference_score: Score of the reference on the same dimension.

    Returns:
        ``target_score / (target_score + reference_score)``.

    Raises:
        ZeroDivisionError: If the two scores sum to zero.
    """
    combined = target_score + reference_score
    return target_score / combined
def process_model_data(model_file):
    """Aggregate one model's JSONL results into averaged leaderboard scores.

    Every non-blank line of ``model_file`` is parsed as a JSON object. The
    overall score is read from ``overall_score``. Each of the four dimension
    scores is computed by ``calculate_dimension_score`` from the record's
    ``target_<dimension>_weighted_avg`` and
    ``reference_<dimension>_weighted_avg`` values (missing keys default to 0).

    A record that cannot be parsed or scored is reported and skipped as a
    whole, so all five averages are taken over the same set of records.

    Args:
        model_file: Path object of the JSONL file; its ``stem`` is used as
            the model name.

    Returns:
        A dict with the keys ``model``, ``overall_score``,
        ``comprehensiveness``, ``insight``, ``instruction_following`` and
        ``readability``. Each score is the per-record average multiplied
        by 100.

    Raises:
        ValueError: If the file contains no usable record.
    """
    model_name = model_file.stem
    print(f"正在处理模型: {model_name}")

    dimensions = (
        'comprehensiveness',
        'insight',
        'instruction_following',
        'readability',
    )
    overall_scores = []
    dimension_scores = {name: [] for name in dimensions}

    with open(model_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            try:
                data = json.loads(line)
                overall_score = data.get('overall_score', 0)
                # Score every dimension before storing anything, so a
                # failure on one dimension cannot leave a partial record
                # behind and skew the averages.
                record_scores = {
                    name: calculate_dimension_score(
                        data.get(f'target_{name}_weighted_avg', 0),
                        data.get(f'reference_{name}_weighted_avg', 0),
                    )
                    for name in dimensions
                }
            except json.JSONDecodeError as e:
                print(f"解析JSON时出错 (模型: {model_name}): {e}")
                continue
            except Exception as e:
                # Deliberate best-effort: one bad record must not discard
                # the rest of the file.
                print(f"处理数据时出错 (模型: {model_name}): {e}")
                continue

            overall_scores.append(overall_score)
            for name, score in record_scores.items():
                dimension_scores[name].append(score)

    record_count = len(overall_scores)
    if record_count == 0:
        # Fail with a clear message instead of a division by zero below.
        raise ValueError(f"模型 {model_name} 的结果文件中没有有效记录")

    # Compute the averages.
    avg_overall = sum(overall_scores) / record_count
    averages = {
        name: sum(scores) / record_count
        for name, scores in dimension_scores.items()
    }

    print(f" - 处理了 {record_count} 条记录")
    print(f" - 总分: {avg_overall:.4f}")

    return {
        'model': model_name,
        'overall_score': avg_overall * 100,
        'comprehensiveness': averages['comprehensiveness'] * 100,
        'insight': averages['insight'] * 100,
        'instruction_following': averages['instruction_following'] * 100,
        'readability': averages['readability'] * 100,
    }
def rank_leaderboard():
    """Rank all models found in data/raw_results and write data/leaderboard.csv.

    The project root is taken to be two directories above this file. Each
    ``*.jsonl`` file in ``<root>/data/raw_results`` is passed to
    ``process_model_data``; a file that raises is reported and left out.
    The remaining results are ordered by ``overall_score`` from highest to
    lowest and written to ``<root>/data/leaderboard.csv`` with every score
    formatted to two decimal places.

    Returns without writing anything when no JSONL file is found.
    """
    # Resolve the input and output locations.
    root = Path(__file__).parent.parent
    results_dir = root / "data" / "raw_results"
    csv_path = root / "data" / "leaderboard.csv"

    # Collect every JSONL result file.
    jsonl_files = list(results_dir.glob("*.jsonl"))
    print(f"找到 {len(jsonl_files)} 个模型结果文件")
    if not jsonl_files:
        print("未找到任何JSONL文件")
        return

    # Summarise each model; a failing file is reported and skipped.
    summaries = []
    for jsonl_file in jsonl_files:
        try:
            summaries.append(process_model_data(jsonl_file))
        except Exception as err:
            print(f"处理文件 {jsonl_file.name} 时出错: {err}")

    # Highest overall score first.
    ranked = sorted(summaries, key=lambda row: row['overall_score'], reverse=True)

    columns = [
        'model',
        'overall_score',
        'comprehensiveness',
        'insight',
        'instruction_following',
        'readability',
    ]
    score_columns = columns[1:]

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        # The model name is written as-is; scores are fixed to 2 decimals.
        writer.writerows(
            {
                'model': row['model'],
                **{col: f"{row[col]:.2f}" for col in score_columns},
            }
            for row in ranked
        )

    print(f"\n排行榜已保存到: {csv_path}")
    print(f"共处理了 {len(ranked)} 个模型")
# Script entry point: build the leaderboard, then report completion.
if __name__ == "__main__":
    rank_leaderboard()
    print("排行榜计算完成!")