"""Combine per-sentence consensuality, polarity and topic scores for reviews.

For each year, the original review CSV is joined with three score CSVs (all
keyed by an ``id`` column) and the result is written as one row per year to a
single CSV that ``load_scored_reviews`` can read back.
"""

import ast
import json
from pathlib import Path

import nltk
import pandas as pd
from tqdm import tqdm

from glimpse.glimpse.data_loading.Glimpse_tokenizer import glimpse_tokenizer

BASE_DIR = Path(__file__).resolve().parent


# def tokenize_sentences(text: str) -> list:
#     # same tokenization as in the original glimpse code
#     text = text.replace('-----', '\n')
#     sentences = nltk.sent_tokenize(text)
#     sentences = [sentence for sentence in sentences if sentence != ""]
#     return sentences


def _sentence_values_by_id(df: pd.DataFrame, value_column: str) -> dict:
    """Group ``df`` into ``{id: {sentence: value}}`` taken from ``value_column``.

    Rows are visited in file order, so a later row overwrites an earlier one
    that has the same id and sentence.
    """
    grouped = {}
    for row_id, sentence, value in zip(df["id"], df["sentence"], df[value_column]):
        grouped.setdefault(row_id, {})[sentence] = value
    return grouped


def preprocessed_scores(
    original_csv_path: Path,
    scored_csv_path: Path,
    polarity_csv_path: Path,
    topic_csv_path: Path,
) -> dict:
    """Collect the available scores for every sentence of every review.

    Args:
        original_csv_path: CSV with ``id`` and ``text`` columns, one review
            per row.
        scored_csv_path: CSV with ``id`` and ``consensuality_scores`` columns;
            the latter is parsed as JSON and looked up by sentence.
        polarity_csv_path: CSV with ``id``, ``sentence`` and ``polarity``
            columns.
        topic_csv_path: CSV with ``id``, ``sentence`` and ``topic`` columns.

    Returns:
        ``{id: [scored_sentences, ...]}`` with one ``scored_sentences`` dict
        per review row. Each maps a sentence to a dict holding whichever of
        the ``"consensuality"``, ``"polarity"`` and ``"topic"`` values were
        found for it; sentences with no score at all are left out.

        Reviews whose id is absent from the consensuality or polarity file
        are skipped entirely. Topic scores are optional.
    """
    original_df = pd.read_csv(original_csv_path)
    scored_df = pd.read_csv(scored_csv_path)
    polarity_df = pd.read_csv(polarity_csv_path)
    topic_df = pd.read_csv(topic_csv_path)

    # Build the per-id lookups once instead of re-scanning each DataFrame for
    # every review row.
    consensuality_by_id = {}
    for row_id, raw_scores in zip(scored_df["id"], scored_df["consensuality_scores"]):
        # Only the first row per id is used.
        consensuality_by_id.setdefault(row_id, raw_scores)
    polarity_by_id = _sentence_values_by_id(polarity_df, "polarity")
    topic_by_id = _sentence_values_by_id(topic_df, "topic")

    scored_reviews = {}
    review_rows = zip(original_df["id"], original_df["text"])
    for review_id, review_text in tqdm(review_rows, total=len(original_df)):
        # A missing id can never match a score row.
        if pd.isna(review_id):
            continue
        if review_id not in consensuality_by_id or review_id not in polarity_by_id:
            continue

        # The entry is created before parsing, so an id whose consensuality
        # JSON is unreadable still appears in the result with an empty list.
        review_entries = scored_reviews.setdefault(review_id, [])

        # Get consensuality scores
        consensuality_scores_str = consensuality_by_id[review_id]
        try:
            consensuality_scores_dict = json.loads(consensuality_scores_str)
        except (TypeError, ValueError) as e:
            # TypeError: the cell is not a string (e.g. an empty CSV cell);
            # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
            print(f"Error parsing consensuality scores for ID {review_id}: {e}")
            print("Problematic string:", consensuality_scores_str)
            continue  # skip this problematic entry

        score_sources = (
            ("consensuality", consensuality_scores_dict),
            ("polarity", polarity_by_id[review_id]),
            ("topic", topic_by_id.get(review_id, {})),
        )

        scored_sentences = {}
        for sentence in glimpse_tokenizer(review_text):
            sentence_data = {
                label: scores[sentence]
                for label, scores in score_sources
                if sentence in scores
            }
            if sentence_data:
                scored_sentences[sentence] = sentence_data

        review_entries.append(scored_sentences)

    return scored_reviews


def save_all_scored_reviews(
    start_year: int = 2017,
    end_year: int = 2021,
    input_dir: Path = BASE_DIR / "glimpse" / "data" / "processed",
    scored_csv_dir: Path = BASE_DIR / "data",
    polarity_dir: Path = BASE_DIR / "data" / "polarity_scored",
    topic_dir: Path = BASE_DIR / "data" / "topic_scored",
    output_csv_path: Path = BASE_DIR / "data" / "preprocessed_scored_reviews.csv",
) -> None:
    """Run ``preprocessed_scores`` for each year and save the results to a CSV.

    The output has one row per successfully processed year, with columns
    ``year`` and ``scored_dict``. A year whose processing raises (for example
    because an input file is missing) is reported on stdout and skipped.

    Args:
        start_year: First year to process (inclusive).
        end_year: Last year to process (inclusive).
        input_dir: Directory holding ``all_reviews_<year>.csv``.
        scored_csv_dir: Directory holding ``GLIMPSE_results_<year>.csv``.
        polarity_dir: Directory holding ``polarity_scored_reviews_<year>.csv``.
        topic_dir: Directory holding ``topic_scored_reviews_<year>.csv``.
        output_csv_path: Destination CSV; its parent directory is created if
            it does not exist.
    """
    all_scored_reviews = []

    for year in range(start_year, end_year + 1):
        print(f"Processing {year}...")

        original_csv_path = input_dir / f"all_reviews_{year}.csv"
        polarity_csv_path = polarity_dir / f"polarity_scored_reviews_{year}.csv"
        topic_csv_path = topic_dir / f"topic_scored_reviews_{year}.csv"
        scored_csv_path = scored_csv_dir / f"GLIMPSE_results_{year}.csv"

        try:
            scored_reviews = preprocessed_scores(
                original_csv_path, scored_csv_path, polarity_csv_path, topic_csv_path
            )
        except Exception as e:
            # Deliberate best-effort: one bad year must not abort the others.
            print(f"Skipped {year} due to error: {e}")
            continue

        all_scored_reviews.append({"year": year, "scored_dict": scored_reviews})

    # Fixing the columns keeps the header in the file even when every year was
    # skipped; a column-less CSV cannot be read back by pd.read_csv.
    df = pd.DataFrame(all_scored_reviews, columns=["year", "scored_dict"])

    # NOTE(review): to_csv stores each dict as its Python repr. Values without
    # a literal form (e.g. a NaN score is written as ``nan``) will make
    # ast.literal_eval fail in load_scored_reviews - confirm that the score
    # files contain no missing values.
    Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_csv_path, index=False)
    print(f"All scored reviews saved to '{output_csv_path}'.")


def load_scored_reviews(
    csv_path: Path = BASE_DIR / "data" / "preprocessed_scored_reviews.csv",
) -> tuple:
    """Load the CSV written by ``save_all_scored_reviews``.

    Args:
        csv_path: CSV with ``year`` and ``scored_dict`` columns.

    Returns:
        ``(years, df)`` where ``years`` is the list of values in the ``year``
        column and ``df`` is the DataFrame with ``scored_dict`` converted from
        its text form back into Python objects via ``ast.literal_eval``.
    """
    df = pd.read_csv(csv_path)
    df["scored_dict"] = df["scored_dict"].apply(ast.literal_eval)
    years = df["year"].tolist()
    return years, df


if __name__ == "__main__":
    save_all_scored_reviews()
    years, all_scored_reviews_df = load_scored_reviews()

    # Debugging sample output
    sample_year = 2017
    sample_df = all_scored_reviews_df[all_scored_reviews_df["year"] == sample_year]

    if sample_df.empty:
        # The sample year may have been skipped during processing.
        print(f"\nNo scored reviews available for {sample_year}.")
    else:
        review_dict = sample_df["scored_dict"].iloc[0]

        print(f"\n=== Sample Review from {sample_year} ===")
        for review_id, sentence_data_list in review_dict.items():
            print(f"\nReview ID: {review_id}")
            for sentence_dict in sentence_data_list:
                for sentence, data in sentence_dict.items():
                    print(f" Sentence: {sentence}")
                    for key, value in data.items():
                        print(f" → {key}: {value}")
                break  # print only the first review's sentences
            break  # only one review

    # --- Testing code ---
    # scored_reviews_2017 = all_scored_reviews_df[all_scored_reviews_df["year"] == 2017]
    # print(scored_reviews_2017)
    # scored_reviews_2017 = scored_reviews_2017["scored_dict"].iloc[0]
    # # scored_reviews_2017 = ast.literal_eval(scored_reviews_2017)
    # print(type(scored_reviews_2017))
    # print(scored_reviews_2017.keys())
    # sample = scored_reviews_2017["https://openreview.net/forum?id=r1rhWnZkg"]
    # print(sample[0])
    # print(years)
    # for id in scored_reviews_2017.keys():
    #     print(len(scored_reviews_2017[id]))