File size: 5,413 Bytes
927e909
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import os
import csv
from pathlib import Path
from collections import defaultdict


def calculate_dimension_score(target_score, reference_score):
    """Return the target's share of the combined target + reference score.

    Args:
        target_score: Score of the model under evaluation for one dimension.
        reference_score: Score of the reference for the same dimension.

    Returns:
        ``target_score / (target_score + reference_score)``.

    Raises:
        ZeroDivisionError: If the two scores sum to zero.
    """
    combined = target_score + reference_score
    share = target_score / combined
    return share


def process_model_data(model_file):
    """Aggregate the per-record scores of one model's JSONL result file.

    Each non-blank line is parsed as a JSON object. From it the function
    reads ``overall_score`` and, for each of the four dimensions, the
    ``target_<dimension>_weighted_avg`` / ``reference_<dimension>_weighted_avg``
    pair, which is turned into a ratio via ``calculate_dimension_score``.
    Missing keys default to 0.

    A record contributes to the averages only if *all* of its scores could be
    computed, so every average is taken over the same set of records.
    Records that fail to parse or to score are reported and skipped.

    Args:
        model_file: ``pathlib.Path`` of the JSONL file; its stem is used as
            the model name.

    Returns:
        A dict with the keys ``model``, ``overall_score``,
        ``comprehensiveness``, ``insight``, ``instruction_following`` and
        ``readability``. Each score is the mean over valid records
        multiplied by 100.

    Raises:
        ValueError: If the file contains no valid record.
        OSError: If the file cannot be opened or read.
    """
    model_name = model_file.stem
    print(f"正在处理模型: {model_name}")

    dimensions = (
        'comprehensiveness',
        'insight',
        'instruction_following',
        'readability',
    )
    overall_scores = []
    dimension_scores = {dimension: [] for dimension in dimensions}

    with open(model_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records.
                continue

            try:
                data = json.loads(line)
                overall_score = data.get('overall_score', 0)
                # Compute every dimension before storing anything, so a
                # failure on a later dimension cannot leave the score lists
                # with different lengths.
                record_scores = {
                    dimension: calculate_dimension_score(
                        data.get(f'target_{dimension}_weighted_avg', 0),
                        data.get(f'reference_{dimension}_weighted_avg', 0),
                    )
                    for dimension in dimensions
                }
            except json.JSONDecodeError as e:
                print(f"解析JSON时出错 (模型: {model_name}): {e}")
                continue
            except Exception as e:
                # Best-effort: one malformed record (wrong types, zero
                # denominator, non-object JSON) must not abort the file.
                print(f"处理数据时出错 (模型: {model_name}): {e}")
                continue

            overall_scores.append(overall_score)
            for dimension, score in record_scores.items():
                dimension_scores[dimension].append(score)

    record_count = len(overall_scores)
    if record_count == 0:
        # Fail with a clear message instead of an opaque division by zero.
        raise ValueError(f"模型 {model_name} 没有可用的有效记录")

    # Compute the averages over the shared set of valid records.
    avg_overall = sum(overall_scores) / record_count
    averages = {
        dimension: sum(scores) / record_count
        for dimension, scores in dimension_scores.items()
    }
    print(f"  - 处理了 {record_count} 条记录")
    print(f"  - 总分: {avg_overall:.4f}")

    # NOTE(review): scores are scaled by 100, presumably because the raw
    # values are fractions in [0, 1] -- confirm against the data producer.
    return {
        'model': model_name,
        'overall_score': avg_overall * 100,
        'comprehensiveness': averages['comprehensiveness'] * 100,
        'insight': averages['insight'] * 100,
        'instruction_following': averages['instruction_following'] * 100,
        'readability': averages['readability'] * 100
    }


def rank_leaderboard():
    """Build the leaderboard from all model result files and save it as CSV.

    Looks for ``*.jsonl`` files in ``<project_root>/data/raw_results`` (the
    project root being the parent of this file's directory), aggregates each
    one with ``process_model_data``, sorts the models by ``overall_score``
    in descending order and writes them to
    ``<project_root>/data/leaderboard.csv`` with scores formatted to two
    decimals. Files that fail to process are reported and skipped. If no
    JSONL file is found, nothing is written.
    """
    # Resolve the input and output locations relative to the project root.
    root = Path(__file__).parent.parent
    results_dir = root / "data" / "raw_results"
    csv_path = root / "data" / "leaderboard.csv"

    # Collect every JSONL result file.
    jsonl_files = list(results_dir.glob("*.jsonl"))
    print(f"找到 {len(jsonl_files)} 个模型结果文件")

    if len(jsonl_files) == 0:
        print("未找到任何JSONL文件")
        return

    # Aggregate each model; a failing file is logged and left out.
    rows = []
    for jsonl_file in jsonl_files:
        try:
            rows.append(process_model_data(jsonl_file))
        except Exception as err:
            print(f"处理文件 {jsonl_file.name} 时出错: {err}")

    # Highest overall score first.
    ranked = sorted(rows, key=lambda row: row['overall_score'], reverse=True)

    # Write the header followed by one formatted row per model.
    score_columns = ['overall_score', 'comprehensiveness', 'insight', 'instruction_following', 'readability']
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['model'] + score_columns)
        writer.writeheader()
        for row in ranked:
            formatted = {'model': row['model']}
            for column in score_columns:
                formatted[column] = f"{row[column]:.2f}"
            writer.writerow(formatted)

    print(f"\n排行榜已保存到: {csv_path}")
    print(f"共处理了 {len(ranked)} 个模型")


# Script entry point: build the leaderboard CSV, then report completion.
if __name__ == "__main__":
    rank_leaderboard()
    # Printed unconditionally, including when rank_leaderboard() returned
    # early because no JSONL files were found.
    print("排行榜计算完成!")