Spaces:

Sina1138
/

ReView

Sleeping

App Files Files Community

ReView / glimpse-ui /scored_reviews_builder.py

Sina1138

Super-squash branch 'main' using huggingface_hub

6fe7180 about 1 month ago

raw

history blame contribute delete

6.01 kB

	import pandas as pd
	import nltk
	import ast
	from pathlib import Path
	from tqdm import tqdm
	import json

	BASE_DIR = Path(__file__).resolve().parent

	from glimpse.glimpse.data_loading.Glimpse_tokenizer import glimpse_tokenizer

	# def tokenize_sentences(text: str) -> list:
	# # same tokenization as in the original glimpse code
	# text = text.replace('-----', '\n')
	# sentences = nltk.sent_tokenize(text)
	# sentences = [sentence for sentence in sentences if sentence != ""]
	# return sentences


	def _sentence_values_by_id(df: pd.DataFrame, value_column: str) -> dict:
	    """Index a per-sentence score table as ``{id: {sentence: value}}``.

	    Rows are visited in file order, so when an ID lists the same sentence more
	    than once the last value wins.
	    """
	    lookup = {}
	    for row_id, sentence, value in zip(df["id"], df["sentence"], df[value_column]):
	        lookup.setdefault(row_id, {})[sentence] = value
	    return lookup


	def preprocessed_scores(
	    original_csv_path: Path,
	    scored_csv_path: Path,
	    polarity_csv_path: Path,
	    topic_csv_path: Path,
	) -> dict:
	    """Merge consensuality, polarity and topic scores per sentence and review ID.

	    Columns read: ``id`` and ``text`` from the original file; ``id`` and
	    ``consensuality_scores`` (a JSON string, expected to decode to an object
	    keyed by sentence) from the scored file; ``id``, ``sentence`` and
	    ``polarity`` / ``topic`` from the polarity and topic files.

	    Args:
	        original_csv_path: CSV holding the review texts.
	        scored_csv_path: CSV holding the JSON-encoded consensuality scores.
	        polarity_csv_path: CSV holding one polarity value per sentence.
	        topic_csv_path: CSV holding one topic value per sentence.

	    Returns:
	        ``{review_id: [scored_sentences, ...]}`` with one list element per
	        processed row of the original file. Each ``scored_sentences`` maps a
	        sentence to a dict holding whichever of the keys ``"consensuality"``,
	        ``"polarity"`` and ``"topic"`` are available for it; sentences with no
	        score at all are left out. Rows whose ID is absent from the scored or
	        polarity file, or whose consensuality scores cannot be parsed, are
	        skipped and contribute nothing to the result.
	    """
	    original_df = pd.read_csv(original_csv_path)
	    scored_df = pd.read_csv(scored_csv_path)
	    polarity_df = pd.read_csv(polarity_csv_path)
	    topic_df = pd.read_csv(topic_csv_path)

	    # Build every per-ID lookup once, instead of re-scanning the score tables
	    # for each review row.
	    consensuality_by_id = {}
	    for row_id, raw_scores in zip(scored_df["id"], scored_df["consensuality_scores"]):
	        # Only the first row per ID is used.
	        consensuality_by_id.setdefault(row_id, raw_scores)
	    polarity_by_id = _sentence_values_by_id(polarity_df, "polarity")
	    topic_by_id = _sentence_values_by_id(topic_df, "topic")

	    scored_reviews = {}

	    for _, row in tqdm(original_df.iterrows(), total=len(original_df)):
	        review_id = row["id"]
	        review_text = row["text"]

	        # A missing (NaN) ID identifies no review, so never treat it as a match.
	        if (
	            pd.isna(review_id)
	            or review_id not in consensuality_by_id
	            or review_id not in polarity_by_id
	        ):
	            continue

	        # Get consensuality scores
	        consensuality_scores_str = consensuality_by_id[review_id]
	        try:
	            consensuality_scores_dict = json.loads(consensuality_scores_str)
	        except (TypeError, ValueError) as e:
	            # TypeError: the cell is not a string (e.g. an empty cell read as NaN);
	            # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
	            print(f"Error parsing consensuality scores for ID {review_id}: {e}")
	            print("Problematic string:", consensuality_scores_str)
	            continue  # skip this problematic entry

	        # Labels are listed in the order the keys appear in each sentence's dict.
	        score_sources = (
	            ("consensuality", consensuality_scores_dict),
	            ("polarity", polarity_by_id[review_id]),
	            # Topic scores are optional: an ID without topic rows is still kept.
	            ("topic", topic_by_id.get(review_id, {})),
	        )

	        scored_sentences = {}
	        for sentence in glimpse_tokenizer(review_text):
	            sentence_data = {
	                label: scores[sentence]
	                for label, scores in score_sources
	                if sentence in scores
	            }
	            if sentence_data:
	                scored_sentences[sentence] = sentence_data

	        # Create the per-ID list only now, so that a row skipped above does not
	        # leave an empty list behind for its ID.
	        scored_reviews.setdefault(review_id, []).append(scored_sentences)

	    return scored_reviews


	def save_all_scored_reviews(
	    start_year: int = 2017,
	    end_year: int = 2021,
	    input_dir: Path = BASE_DIR / "glimpse" / "data" / "processed",
	    scored_csv_dir: Path = BASE_DIR / "data",
	    polarity_dir: Path = BASE_DIR / "data" / "polarity_scored",
	    topic_dir: Path = BASE_DIR / "data" / "topic_scored",
	    output_csv_path: Path = BASE_DIR / "data" / "preprocessed_scored_reviews.csv",
	):
	    """Score every year in the range and store all results in a single CSV.

	    For each year from ``start_year`` to ``end_year`` (inclusive) the four
	    per-year input paths are derived from the directory arguments and passed
	    to ``preprocessed_scores``. A year whose processing raises is reported on
	    stdout and left out of the output. The CSV gets one row per successful
	    year with the columns ``year`` and ``scored_dict``.

	    Args:
	        start_year: First year to process.
	        end_year: Last year to process (inclusive).
	        input_dir: Directory containing ``all_reviews_<year>.csv``.
	        scored_csv_dir: Directory containing ``GLIMPSE_results_<year>.csv``.
	        polarity_dir: Directory containing ``polarity_scored_reviews_<year>.csv``.
	        topic_dir: Directory containing ``topic_scored_reviews_<year>.csv``.
	        output_csv_path: Destination of the combined CSV.
	    """
	    yearly_records = []

	    for year in range(start_year, end_year + 1):
	        print(f"Processing {year}...")
	        try:
	            # Path construction stays inside the try so that any failure for
	            # this year is reported and the remaining years still run.
	            per_year_scores = preprocessed_scores(
	                original_csv_path=input_dir / f"all_reviews_{year}.csv",
	                polarity_csv_path=polarity_dir / f"polarity_scored_reviews_{year}.csv",
	                topic_csv_path=topic_dir / f"topic_scored_reviews_{year}.csv",
	                scored_csv_path=scored_csv_dir / f"GLIMPSE_results_{year}.csv",
	            )
	        except Exception as exc:
	            print(f"Skipped {year} due to error: {exc}")
	        else:
	            yearly_records.append({"year": year, "scored_dict": per_year_scores})

	    pd.DataFrame(yearly_records).to_csv(output_csv_path, index=False)
	    print(f"All scored reviews saved to '{output_csv_path}'.")


	def load_scored_reviews(csv_path: Path = BASE_DIR / "data" / "preprocessed_scored_reviews.csv") -> tuple:
	    """Load the combined scored-review CSV.

	    The ``scored_dict`` column holds Python literals as text; each cell is
	    turned back into an object with ``ast.literal_eval``.

	    Args:
	        csv_path: CSV with the columns ``year`` and ``scored_dict``.

	    Returns:
	        ``(years, frame)``: the values of the ``year`` column as a list, and
	        the loaded DataFrame with ``scored_dict`` already evaluated.
	    """
	    frame = pd.read_csv(csv_path)
	    evaluated = frame["scored_dict"].map(ast.literal_eval)
	    frame["scored_dict"] = evaluated
	    return frame["year"].tolist(), frame


	# Script entry point: rebuild the combined CSV with the default paths, load it
	# back, and print part of one entry as a sanity check.
	# NOTE(review): the indentation of this block was lost in this copy of the
	# file; restore the nesting from the original before running it.
	if __name__ == "__main__":
	save_all_scored_reviews()
	years, all_scored_reviews_df = load_scored_reviews()

	# Debugging sample output
	sample_year = 2017

	sample_df = all_scored_reviews_df[all_scored_reviews_df["year"] == sample_year]
	# .iloc[0] raises IndexError when there is no row for sample_year, e.g. when
	# that year was skipped by save_all_scored_reviews.
	review_dict = sample_df["scored_dict"].iloc[0]

	print(f"\n=== Sample Review from {sample_year} ===")
	# review_dict maps a review ID to a list of {sentence: {score_name: value}} dicts.
	for review_id, sentence_data_list in review_dict.items():
	print(f"\nReview ID: {review_id}")
	for sentence_dict in sentence_data_list:
	for sentence, data in sentence_dict.items():
	print(f" Sentence: {sentence}")
	for key, value in data.items():
	print(f" → {key}: {value}")
	# NOTE(review): presumably this break belongs to the `for sentence_dict` loop
	# (stop after the first list element) and the next one to the `for review_id`
	# loop (stop after the first ID) — confirm against the original file.
	break # print only the first review's sentences
	break # only one review


	# --- Testing code ---
	# scored_reviews_2017 = all_scored_reviews_df[all_scored_reviews_df["year"] == 2017]
	# print(scored_reviews_2017)
	# scored_reviews_2017 = scored_reviews_2017["scored_dict"].iloc[0]
	# # scored_reviews_2017 = ast.literal_eval(scored_reviews_2017)
	# print(type(scored_reviews_2017))
	# print(scored_reviews_2017.keys())
	# sample = scored_reviews_2017["https://openreview.net/forum?id=r1rhWnZkg"]
	# print(sample[0])

	# print(years)
	# for id in scored_reviews_2017.keys():
	# print(len(scored_reviews_2017[id]))