Spaces:

Sina1138
/

ReView

Sleeping

App Files Files Community

ReView / glimpse-ui /glimpse_pk_csv_converter.py

Sina1138

Super-squash branch 'main' using huggingface_hub

6fe7180 about 1 month ago

raw

history blame contribute delete

4.39 kB

	import pickle
	import pandas as pd
	from pathlib import Path
	import os
	import glob
	import re
	import json

	def process_pickle_results(pickle_path: Path, output_path: Path):
	    """Flatten a results pickle into a CSV file.

	    The pickle is loaded and its ``'results'`` entry (which must be a list)
	    is written out as one CSV row per element, with three columns:

	    * ``index`` -- position of the element in the ``'results'`` list.
	    * ``id`` -- ``str()`` of the first item of the element's ``'id'`` entry.
	    * ``consensuality_scores`` -- JSON-encoded dict of the element's
	      ``'consensuality_scores'`` when that value is a ``pandas.Series``;
	      left empty otherwise.

	    Args:
	        pickle_path: Path of the pickle file to read.
	        output_path: Path of the CSV file to write. Missing parent
	            directories are created.

	    Raises:
	        ValueError: If the ``'results'`` entry of the pickle is not a list.
	    """
	    # NOTE(security): pickle.load can execute arbitrary code embedded in the
	    # file. Only use this on pickle files from a trusted source.
	    with open(pickle_path, 'rb') as f:
	        data = pickle.load(f)

	    results = data.get('results')
	    if not isinstance(results, list):
	        raise ValueError("The 'results' key is not a list. Please check the pickle file structure.")

	    # Fixed column list so that an empty 'results' list still yields a CSV
	    # with a header row instead of a header-less (unreadable) file.
	    columns = ['index', 'id', 'consensuality_scores']

	    csv_data = []
	    for index, result in enumerate(results):
	        scores = result.get('consensuality_scores')
	        csv_data.append({
	            'index': index,
	            'id': str(result.get('id')[0]),
	            # JSON (not a Python dict repr) so the cell can be parsed back
	            # with json.loads after reading the CSV.
	            'consensuality_scores': (
	                json.dumps(scores.to_dict()) if isinstance(scores, pd.Series) else None
	            ),
	        })

	    # Make sure the destination directory exists before writing.
	    output_path = Path(output_path)
	    output_path.parent.mkdir(parents=True, exist_ok=True)

	    df = pd.DataFrame(csv_data, columns=columns)
	    df.to_csv(output_path, index=False)
	    print(f"Results saved to '{output_path}'.")


	if __name__ == "__main__":
	    BASE_DIR = Path(__file__).resolve().parent

	    # Convert every pickle file in <BASE_DIR>/glimpse/output into its own CSV
	    # under <BASE_DIR>/data. Both directories are resolved from this file's
	    # location so the script behaves the same from any working directory.
	    # To convert a single file instead, call
	    # process_pickle_results(pickle_file, output_file) directly.
	    input_dir = BASE_DIR / "glimpse" / "output"
	    output_dir = BASE_DIR / "data"
	    output_dir.mkdir(parents=True, exist_ok=True)

	    # Oldest first (by ctime): when two files map to the same output name,
	    # the most recently created one is written last and therefore wins.
	    pickle_files = sorted(input_dir.glob("*.pk"), key=os.path.getctime)
	    if not pickle_files:
	        print(f"No pickle files found in '{input_dir}'.")

	    for pickle_file in pickle_files:
	        # The first run of four digits in the file name is used as the tag
	        # for the output file name.
	        year_match = re.search(r'(\d{4})', pickle_file.name)
	        year_tag = year_match.group(1) if year_match else 'unknown_year'
	        output_file = output_dir / f"GLIMPSE_results_{year_tag}.csv"

	        print(f"Using pickle file: {pickle_file}, saving as {output_file}")
	        process_pickle_results(pickle_file, output_file)