Spaces:

Ayanami0730
/

DeepResearch-Leaderboard

Running

App Files Files Community

DeepResearch-Leaderboard / utils /rank_leaderboard.py

Ayanami0730

Add DeepResearch Bench application with LFS support

927e909 7 days ago

raw

history blame contribute delete

5.41 kB

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	import json
	import os
	import csv
	from pathlib import Path
	from collections import defaultdict


	def calculate_dimension_score(target_score, reference_score):
	    """Return the target's share of the combined target + reference score.

	    Args:
	        target_score: Score of the evaluated model for one dimension.
	        reference_score: Score of the reference for the same dimension.

	    Returns:
	        ``target_score / (target_score + reference_score)``, or ``0.0`` when
	        the two scores sum to zero and the ratio is therefore undefined.
	    """
	    total = target_score + reference_score
	    if total == 0:
	        # Records with missing fields are read as 0 by the caller, so a 0/0
	        # input is realistic; report "no share" instead of raising
	        # ZeroDivisionError.
	        return 0.0
	    return target_score / total


	def process_model_data(model_file):
	    """Average the per-record scores stored in one model's JSONL result file.

	    Each non-blank line is parsed as a JSON object. Its ``overall_score`` is
	    taken as-is, and for each of the four dimensions the target/reference
	    weighted averages are turned into a ratio by
	    ``calculate_dimension_score``. Missing fields default to 0.

	    A line that fails to parse, or whose scores cannot be computed, is
	    reported and skipped as a whole, so every average is taken over the same
	    set of records.

	    Args:
	        model_file: ``pathlib.Path``-like object (``.stem`` is used as the
	            model name) pointing to a UTF-8 encoded JSONL file.

	    Returns:
	        dict with the keys ``model``, ``overall_score``,
	        ``comprehensiveness``, ``insight``, ``instruction_following`` and
	        ``readability``; every score is the mean over the valid records
	        multiplied by 100.

	    Raises:
	        ValueError: If the file contains no valid record.
	    """
	    model_name = model_file.stem
	    print(f"正在处理模型: {model_name}")

	    dimensions = (
	        'comprehensiveness',
	        'insight',
	        'instruction_following',
	        'readability',
	    )
	    overall_scores = []
	    dimension_scores = {name: [] for name in dimensions}

	    with open(model_file, 'r', encoding='utf-8') as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                # Blank lines are not records; skip them quietly.
	                continue

	            try:
	                data = json.loads(line)
	                overall_score = data.get('overall_score', 0)
	                record_scores = {
	                    name: calculate_dimension_score(
	                        data.get(f'target_{name}_weighted_avg', 0),
	                        data.get(f'reference_{name}_weighted_avg', 0),
	                    )
	                    for name in dimensions
	                }
	            except json.JSONDecodeError as e:
	                print(f"解析JSON时出错 (模型: {model_name}): {e}")
	                continue
	            except Exception as e:
	                print(f"处理数据时出错 (模型: {model_name}): {e}")
	                continue

	            # Append only after the whole record was computed, so a failure
	            # part-way through never leaves the lists with different lengths.
	            overall_scores.append(overall_score)
	            for name, score in record_scores.items():
	                dimension_scores[name].append(score)

	    record_count = len(overall_scores)
	    if record_count == 0:
	        # Fail with a clear message instead of a bare ZeroDivisionError.
	        raise ValueError(f"文件 {model_file} 中没有有效记录")

	    avg_overall = sum(overall_scores) / record_count
	    print(f" - 处理了 {record_count} 条记录")
	    print(f" - 总分: {avg_overall:.4f}")

	    result = {'model': model_name, 'overall_score': avg_overall * 100}
	    for name in dimensions:
	        result[name] = sum(dimension_scores[name]) / record_count * 100
	    return result


	def rank_leaderboard():
	    """Build the leaderboard from all raw result files and save it as CSV.

	    Reads every ``*.jsonl`` file in ``<project root>/data/raw_results``,
	    where the project root is the parent of this file's directory, scores
	    each one with ``process_model_data``, orders the models by overall score
	    (highest first) and writes ``<project root>/data/leaderboard.csv`` with
	    all scores formatted to two decimals. A file whose processing raises is
	    reported and left out. Returns ``None``; nothing is written when no
	    input file is found.
	    """
	    data_dir = Path(__file__).parent.parent / "data"
	    output_file = data_dir / "leaderboard.csv"

	    # Collect the per-model result files.
	    input_files = list((data_dir / "raw_results").glob("*.jsonl"))
	    print(f"找到 {len(input_files)} 个模型结果文件")
	    if len(input_files) == 0:
	        print("未找到任何JSONL文件")
	        return

	    # Score each model; one bad file must not abort the whole run.
	    model_results = []
	    for path in input_files:
	        try:
	            model_results.append(process_model_data(path))
	        except Exception as e:
	            print(f"处理文件 {path.name} 时出错: {e}")

	    # Highest overall score first.
	    ranked = sorted(
	        model_results,
	        key=lambda entry: entry['overall_score'],
	        reverse=True,
	    )

	    fieldnames = [
	        'model',
	        'overall_score',
	        'comprehensiveness',
	        'insight',
	        'instruction_following',
	        'readability',
	    ]
	    score_columns = fieldnames[1:]

	    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
	        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
	        writer.writeheader()
	        for entry in ranked:
	            row = {'model': entry['model']}
	            for column in score_columns:
	                row[column] = f"{entry[column]:.2f}"
	            writer.writerow(row)

	    print(f"\n排行榜已保存到: {output_file}")
	    print(f"共处理了 {len(ranked)} 个模型")


	if __name__ == "__main__":
	    # Script entry point: build the leaderboard, then report completion.
	    rank_leaderboard()
	    print("排行榜计算完成！")