#!/usr/bin/env python3
# -*- coding: utf-8 -*-
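"""Merge each model's raw articles with its judge scores into one JSONL file.

Directory layout assumed by the paths below (relative to the project root,
the parent of this script's directory; an inference from the code, not a
documented contract):

    data/raw_data/<model_name>.jsonl                  # articles, one JSON object per line
    data/raw_results/<model_name>/raw_results.jsonl   # scores, one JSON object per line
    data/data_viewer.jsonl                            # merged output
"""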

import json
from pathlib import Path


def load_scores_for_model(model_results_dir: Path):
    """Load per-article scores from a model's raw_results.jsonl, keyed by article ID."""
    scores_by_id = {}
    raw_results_file = model_results_dir / "raw_results.jsonl"

    if not raw_results_file.exists():
        print(f"Warning: results file for model {model_results_dir.name} not found: {raw_results_file}")
        return scores_by_id

    print(f"  正在从 {model_results_dir.name}/raw_results.jsonl 加载分数...")
    with open(raw_results_file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            try:
                data = json.loads(line.strip())
                article_id = str(data.get('id'))
                if not article_id:
                    print(f"    警告: {model_results_dir.name}{i+1} 行缺少ID,已跳过。")
                    continue

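                # Scores in the results file appear to be fractions in [0, 1]
                # (an inference from the *100 rescaling below); they are
                # rescaled to 0-100 and formatted to two decimals for display.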
                overall_score_raw = data.get('overall_score', 0.0) 
                overall_score_scaled = overall_score_raw * 100

                comprehensiveness_score_raw = data.get('comprehensiveness', 0.0)
                insight_score_raw = data.get('insight', 0.0)
                instruction_score_raw = data.get('instruction_following', 0.0)
                readability_score_raw = data.get('readability', 0.0)
                
                scores_by_id[article_id] = {
                    'overall_score': f"{overall_score_scaled:.2f}",
                    'comprehensiveness_score': f"{comprehensiveness_score_raw * 100:.2f}",
                    'insight_score': f"{insight_score_raw * 100:.2f}",
                    'instruction_following_score': f"{instruction_score_raw * 100:.2f}",
                    'readability_score': f"{readability_score_raw * 100:.2f}"
                }
            except json.JSONDecodeError as e:
                print(f"    Error: failed to parse JSON (file: {model_results_dir.name}, line: {i+1}): {e}")
            except Exception as e:
                print(f"    Error: failed to process data (file: {model_results_dir.name}, line: {i+1}): {e}")
    print(f"  Loaded scores for {len(scores_by_id)} articles for model {model_results_dir.name}.")
    return scores_by_id
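
# A raw_results.jsonl line is expected to look roughly like the following
# (schema inferred from the keys read above; values are illustrative):
#   {"id": "123", "overall_score": 0.87, "comprehensiveness": 0.90,
#    "insight": 0.78, "instruction_following": 0.85, "readability": 0.92}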


def merge_jsonl_files():
    """Join each model's raw articles with its scores and write one merged JSONL file."""
    project_root = Path(__file__).resolve().parent.parent
    raw_data_dir = project_root / "data" / "raw_data"
    raw_results_dir = project_root / "data" / "raw_results"
    output_file = project_root / "data" / "data_viewer.jsonl"

    input_files = list(raw_data_dir.glob("*.jsonl"))
    print(f"Found {len(input_files)} model JSONL files in {raw_data_dir}")

    if not input_files:
        print("No raw data files found; exiting.")
        return
    
    all_merged_data = []
    
    for raw_data_file in input_files:
        model_name = raw_data_file.stem
        print(f"正在处理原始数据文件: {raw_data_file.name} (模型: {model_name})")
        
        model_results_dir = raw_results_dir / model_name
        if not model_results_dir.exists():
            print(f"  警告: 未找到模型 {model_name} 对应的结果文件夹: {model_results_dir}")
            continue
            
        scores_for_current_model = load_scores_for_model(model_results_dir)
        
        processed_articles_count = 0
        with open(raw_data_file, 'r', encoding='utf-8') as f_raw:
            for i, line in enumerate(f_raw):
                try:
                    article_data = json.loads(line.strip())
                    raw_id = article_data.get('id')

                    # Same guard as in load_scores_for_model: str(None) is
                    # truthy, so check the raw value before converting.
                    if raw_id is None:
                        print(f"  Warning: line {i+1} of {raw_data_file.name} has no ID; skipped.")
                        continue
                    article_id = str(raw_id)
                    
                    article_scores = scores_for_current_model.get(article_id, {})
                    if not article_scores:
                        print(f"  警告: 模型 {model_name} 的文章ID {article_id} 未在结果文件中找到分数。")

                    merged_item = {
                        'model_name': model_name,
                        'id': article_id,
                        'prompt': article_data.get('prompt'),
                        'article': article_data.get('article'),
                        'overall_score': article_scores.get('overall_score'),
                        'comprehensiveness_score': article_scores.get('comprehensiveness_score'),
                        'insight_score': article_scores.get('insight_score'),
                        'instruction_following_score': article_scores.get('instruction_following_score'),
                        'readability_score': article_scores.get('readability_score')
                    }
                    all_merged_data.append(merged_item)
                    processed_articles_count += 1
                except json.JSONDecodeError as e:
                    print(f"  错误: 解析原始数据JSON时出错 (文件: {raw_data_file.name}, 行: {i+1}): {e}")
                except Exception as e:
                    print(f"  错误: 处理原始数据时出错 (文件: {raw_data_file.name}, 行: {i+1}): {e}")
        print(f"  为模型 {model_name} 处理了 {processed_articles_count} 篇文章数据。")
    
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for item in all_merged_data:
            f_out.write(json.dumps(item, ensure_ascii=False) + '\n')
    
    print(f"\n成功合并并保存到: {output_file}, 共 {len(all_merged_data)} 条记录")

if __name__ == "__main__":
    merge_jsonl_files()
    print("所有文件处理完成!")