File size: 4,389 Bytes
6fe7180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import pickle
import pandas as pd
from pathlib import Path
import os
import glob
import re
import json

def process_pickle_results(pickle_path: Path, output_path: Path) -> None:
    """Flatten the 'results' list stored in a pickle file into a CSV file.

    The unpickled object must provide ``.get('results')`` returning a list of
    per-item mappings. One CSV row is written per item with three columns:

    * ``index`` - position of the item in the 'results' list.
    * ``id`` - ``str()`` of the first element of the item's 'id' entry.
    * ``consensuality_scores`` - the item's 'consensuality_scores' entry as a
      JSON object string when it is a ``pandas.Series``, otherwise empty.

    Args:
        pickle_path: Pickle file to read.
        output_path: CSV file to write. Missing parent directories are created.

    Raises:
        ValueError: If the 'results' entry is not a list.
    """
    columns = ['index', 'id', 'consensuality_scores']

    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only use this on pickle files that come from a trusted source.
    with open(pickle_path, 'rb') as f:
        data = pickle.load(f)

    # Earlier revisions also read the 'metadata/reranking_model' and
    # 'metadata/rsa_iterations' keys here but never used them.
    results = data.get('results')
    if not isinstance(results, list):
        raise ValueError("The 'results' key is not a list. Please check the pickle file structure.")

    # Only id and consensuality scores are exported. The commented-out code
    # this replaces listed further optional per-result fields (best_base,
    # best_rsa, speaker_df, listener_df, initial_listener,
    # language_model_proba_df, initial_consensuality_scores, gold,
    # rationality, text_candidates) that could be added as extra columns.
    csv_data = []
    for index, result in enumerate(results):
        scores = result.get('consensuality_scores')
        csv_data.append({
            'index': index,
            'id': str(result.get('id')[0]),
            # JSON-encode so the per-key scores survive as a single CSV cell.
            'consensuality_scores': (
                json.dumps(scores.to_dict()) if isinstance(scores, pd.Series) else None
            ),
        })

    # Passing the column list keeps the header row even when 'results' is
    # empty; for non-empty input it matches the row-dict key order.
    df = pd.DataFrame(csv_data, columns=columns)

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Results saved to '{output_path}'.")


if __name__ == "__main__":
    # Batch-convert every pickle in <script dir>/glimpse/output to a CSV in
    # <script dir>/data. All paths are anchored at the script's own directory
    # so the result does not depend on the current working directory.
    BASE_DIR = Path(__file__).resolve().parent

    input_dir = BASE_DIR / "glimpse" / "output"
    output_dir = BASE_DIR / "data"
    output_dir.mkdir(parents=True, exist_ok=True)

    # To convert a single pickle instead, call process_pickle_results()
    # directly with its path and e.g. output_dir / "GLIMPSE_results_from_pk.csv".

    # Sorted by os.path.getctime, ascending. Files that share a year tag are
    # written to the same CSV, so the one sorted last overwrites the others.
    pickle_files = sorted(glob.glob(str(input_dir / "*.pk")), key=os.path.getctime)
    if not pickle_files:
        print(f"No pickle files found in '{input_dir}'.")

    for pickle_file in pickle_files:
        # The tag is the first run of four digits in the file name.
        # NOTE(review): a timestamp in the name would be picked up if no
        # earlier four-digit run exists - confirm the naming scheme.
        year_match = re.search(r'(\d{4})', os.path.basename(pickle_file))
        year_tag = year_match.group(1) if year_match else 'unknown_year'
        output_file = output_dir / f"GLIMPSE_results_{year_tag}.csv"

        print(f"Using pickle file: {pickle_file}, saving as {output_file}")
        process_pickle_results(Path(pickle_file), output_file)