#!/usr/bin/env python3
# -*- coding: utf-8 -*-
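"""Aggregate per-model JSONL evaluation results into a ranked leaderboard CSV.

For each model, the script averages the overall score and four head-to-head
dimension scores (comprehensiveness, insight, instruction following,
readability), scales them to 0-100, and writes them to data/leaderboard.csv.
"""
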
import json
import csv
from pathlib import Path

def calculate_dimension_score(target_score, reference_score):
    """Compute the relative (target vs. reference) score for one dimension."""
    total = target_score + reference_score
    # Guard against a zero denominator (both weighted averages are 0): treat it as a tie.
    return target_score / total if total > 0 else 0.5
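# Example (illustrative numbers only): a target weighted average of 7.5 against
# a reference of 2.5 gives 7.5 / (7.5 + 2.5) = 0.75, i.e. the target wins 75%
# of the head-to-head comparison on that dimension; 0.5 is a tie.
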
def process_model_data(model_file):
    """Process a single model's result file and return its average scores."""
    model_name = model_file.stem
    print(f"Processing model: {model_name}")
    overall_scores = []
    comprehensiveness_scores = []
    insight_scores = []
    instruction_following_scores = []
    readability_scores = []
    with open(model_file, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line.strip())
                # Overall score
                overall_score = data.get('overall_score', 0)
                overall_scores.append(overall_score)
                # Scores for the four dimensions
                target_comp = data.get('target_comprehensiveness_weighted_avg', 0)
                ref_comp = data.get('reference_comprehensiveness_weighted_avg', 0)
                comp_score = calculate_dimension_score(target_comp, ref_comp)
                comprehensiveness_scores.append(comp_score)
                target_insight = data.get('target_insight_weighted_avg', 0)
                ref_insight = data.get('reference_insight_weighted_avg', 0)
                insight_score = calculate_dimension_score(target_insight, ref_insight)
                insight_scores.append(insight_score)
                target_instruction = data.get('target_instruction_following_weighted_avg', 0)
                ref_instruction = data.get('reference_instruction_following_weighted_avg', 0)
                instruction_score = calculate_dimension_score(target_instruction, ref_instruction)
                instruction_following_scores.append(instruction_score)
                target_readability = data.get('target_readability_weighted_avg', 0)
                ref_readability = data.get('reference_readability_weighted_avg', 0)
                readability_score = calculate_dimension_score(target_readability, ref_readability)
                readability_scores.append(readability_score)
            except json.JSONDecodeError as e:
                print(f"Failed to parse JSON (model: {model_name}): {e}")
                continue
            except Exception as e:
                print(f"Failed to process record (model: {model_name}): {e}")
                continue
    # Guard against files with no valid records before averaging.
    if not overall_scores:
        raise ValueError(f"No valid records found in {model_file.name}")
    # Compute averages
    avg_overall = sum(overall_scores) / len(overall_scores)
    avg_comprehensiveness = sum(comprehensiveness_scores) / len(comprehensiveness_scores)
    avg_insight = sum(insight_scores) / len(insight_scores)
    avg_instruction_following = sum(instruction_following_scores) / len(instruction_following_scores)
    avg_readability = sum(readability_scores) / len(readability_scores)
    print(f" - Processed {len(overall_scores)} records")
    print(f" - Overall score: {avg_overall:.4f}")
    return {
        'model': model_name,
        'overall_score': avg_overall * 100,
        'comprehensiveness': avg_comprehensiveness * 100,
        'insight': avg_insight * 100,
        'instruction_following': avg_instruction_following * 100,
        'readability': avg_readability * 100
    }
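# A raw_results record is expected to look roughly like this (field names are
# taken from the .get() calls above; the values are illustrative only):
# {"overall_score": 0.62,
#  "target_comprehensiveness_weighted_avg": 7.1,
#  "reference_comprehensiveness_weighted_avg": 5.3,
#  "target_insight_weighted_avg": 6.4,
#  ... and likewise for instruction_following and readability}
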
def rank_leaderboard():
    """Compute the leaderboard and save it to CSV."""
    # Directory layout
    project_root = Path(__file__).parent.parent
    input_dir = project_root / "data" / "raw_results"
    output_file = project_root / "data" / "leaderboard.csv"
    # Collect all JSONL result files
    input_files = list(input_dir.glob("*.jsonl"))
    print(f"Found {len(input_files)} model result files")
    if not input_files:
        print("No JSONL files found")
        return
    # Process each model's data
    model_results = []
    for input_file in input_files:
        try:
            result = process_model_data(input_file)
            model_results.append(result)
        except Exception as e:
            print(f"Failed to process file {input_file.name}: {e}")
            continue
    # Sort by overall score (descending)
    model_results.sort(key=lambda x: x['overall_score'], reverse=True)
    # Write the CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['model', 'overall_score', 'comprehensiveness', 'insight', 'instruction_following', 'readability']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # Header row
        writer.writeheader()
        # Data rows
        for result in model_results:
            writer.writerow({
                'model': result['model'],
                'overall_score': f"{result['overall_score']:.2f}",
                'comprehensiveness': f"{result['comprehensiveness']:.2f}",
                'insight': f"{result['insight']:.2f}",
                'instruction_following': f"{result['instruction_following']:.2f}",
                'readability': f"{result['readability']:.2f}"
            })
    print(f"\nLeaderboard saved to: {output_file}")
    print(f"Processed {len(model_results)} models")


if __name__ == "__main__":
    rank_leaderboard()
    print("Leaderboard computation complete!")