#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
from pathlib import Path


def calculate_dimension_score(target_score, reference_score):
    """计算单个维度的分数,与rank_leaderboard.py中的逻辑一致"""
    if (target_score + reference_score) == 0:  # 避免除以零
        return 0.0
    return target_score / (target_score + reference_score)
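

# Illustrative sanity check (not part of the pipeline): with a target weighted
# average of 3.0 and a reference weighted average of 1.0 for a dimension, the
# target earns 3.0 / (3.0 + 1.0) = 0.75, i.e. 75% of the pairwise credit;
# equal averages yield 0.5, and two zeros yield 0.0 via the guard above.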


def load_scores_for_model(model_results_file_path: Path):
    """为单个模型加载所有文章的评分数据"""
    scores_by_id = {}
    if not model_results_file_path.exists():
        print(f"警告: 未找到模型 {model_results_file_path.stem} 的结果文件: {model_results_file_path}")
        return scores_by_id

    print(f"  正在从 {model_results_file_path.name} 加载分数...")
    with open(model_results_file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            try:
                data = json.loads(line.strip())
                raw_id = data.get('id')
                if raw_id is None or raw_id == "":
                    print(f"    Warning: line {i+1} of {model_results_file_path.name} has no ID; skipped.")
                    continue
                article_id = str(raw_id)  # normalize the ID to a string for matching

                # Read overall_score as a raw value assumed to lie in [0, 1]
                # and scale it to 0-100 for display; adjust this if your data
                # is already on a 0-100 scale.
                overall_score_raw = data.get('overall_score', 0.0)
                overall_score_scaled = overall_score_raw * 100

                # Compute the four per-dimension scores
                comp_score_raw = calculate_dimension_score(
                    data.get('target_comprehensiveness_weighted_avg', 0),
                    data.get('reference_comprehensiveness_weighted_avg', 0)
                )
                insight_score_raw = calculate_dimension_score(
                    data.get('target_insight_weighted_avg', 0),
                    data.get('reference_insight_weighted_avg', 0)
                )
                instruction_score_raw = calculate_dimension_score(
                    data.get('target_instruction_following_weighted_avg', 0),
                    data.get('reference_instruction_following_weighted_avg', 0)
                )
                readability_score_raw = calculate_dimension_score(
                    data.get('target_readability_weighted_avg', 0),
                    data.get('reference_readability_weighted_avg', 0)
                )
                
                scores_by_id[article_id] = {
                    'overall_score': f"{overall_score_scaled:.2f}",
                    'comprehensiveness_score': f"{comp_score_raw * 100:.2f}",
                    'insight_score': f"{insight_score_raw * 100:.2f}",
                    'instruction_following_score': f"{instruction_score_raw * 100:.2f}",
                    'readability_score': f"{readability_score_raw * 100:.2f}"
                }
            except json.JSONDecodeError as e:
                print(f"    Error: failed to parse JSON (file: {model_results_file_path.name}, line: {i+1}): {e}")
            except Exception as e:
                print(f"    Error: failed to process data (file: {model_results_file_path.name}, line: {i+1}): {e}")
    print(f"  Loaded scores for {len(scores_by_id)} articles for model {model_results_file_path.stem}")
    return scores_by_id
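

# For orientation, each line of a raw results file is assumed to look roughly
# like this (field names are taken from the lookups above; values are made up):
#   {"id": "42", "overall_score": 0.61,
#    "target_comprehensiveness_weighted_avg": 3.2,
#    "reference_comprehensiveness_weighted_avg": 2.8,
#    "target_insight_weighted_avg": 3.0,
#    "reference_insight_weighted_avg": 3.0, ...}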


def merge_jsonl_files():
    # Directory layout
    project_root = Path(__file__).resolve().parent.parent
    raw_data_dir = project_root / "data" / "raw_data"         # directory with the original article content
    raw_results_dir = project_root / "data" / "raw_results"   # directory with the scoring results
    output_file = project_root / "data" / "data_viewer.jsonl"
    
    # Collect all raw-data JSONL files (one per model)
    input_files = list(raw_data_dir.glob("*.jsonl"))
    print(f"Found {len(input_files)} model JSONL files in {raw_data_dir}")
    
    if not input_files:
        print("未找到任何原始数据文件,已退出。")
        return

    # Truncate the output file up front; the final write below opens it in
    # 'w' mode anyway, so this is mainly a safeguard against stale data.
    with open(output_file, 'w', encoding='utf-8') as f:
        pass  # create or truncate the file
    
    all_merged_data = []
    
    for raw_data_file in input_files:
        model_name = raw_data_file.stem
        print(f"正在处理原始数据文件: {raw_data_file.name} (模型: {model_name})")
        
        # Load this model's score data
        model_results_file = raw_results_dir / f"{model_name}.jsonl"
        scores_for_current_model = load_scores_for_model(model_results_file)
        
        processed_articles_count = 0
        with open(raw_data_file, 'r', encoding='utf-8') as f_raw:
            for i, line in enumerate(f_raw):
                try:
                    article_data = json.loads(line.strip())
                    raw_id = article_data.get('id')
                    if raw_id is None or raw_id == "":
                        print(f"  Warning: line {i+1} of {raw_data_file.name} has no ID; skipped.")
                        continue
                    article_id = str(raw_id)  # normalize the ID to a string for matching
                    
                    # Look up this article's scores among the loaded results
                    article_scores = scores_for_current_model.get(article_id, {})
                    if not article_scores:
                        print(f"  Warning: no scores found in the results file for article ID {article_id} (model {model_name}).")

                    merged_item = {
                        'model_name': model_name,
                        'id': article_id,
                        'prompt': article_data.get('prompt'),
                        'article': article_data.get('article'),
                        'overall_score': article_scores.get('overall_score'),  # may be None
                        'comprehensiveness_score': article_scores.get('comprehensiveness_score'),
                        'insight_score': article_scores.get('insight_score'),
                        'instruction_following_score': article_scores.get('instruction_following_score'),
                        'readability_score': article_scores.get('readability_score')
                    }
                    all_merged_data.append(merged_item)
                    processed_articles_count += 1
                except json.JSONDecodeError as e:
                    print(f"  Error: failed to parse raw-data JSON (file: {raw_data_file.name}, line: {i+1}): {e}")
                except Exception as e:
                    print(f"  Error: failed to process raw data (file: {raw_data_file.name}, line: {i+1}): {e}")
        print(f"  Processed {processed_articles_count} articles for model {model_name}.")
    
    # Write all merged records in a single pass
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for item in all_merged_data:
            f_out.write(json.dumps(item, ensure_ascii=False) + '\n')
    
    print(f"\n成功合并并保存到: {output_file}, 共 {len(all_merged_data)} 条记录")

if __name__ == "__main__":
    merge_jsonl_files()
    print("所有文件处理完成!")