"""Build sentence-level CSV files from the DISAPERE review dataset."""

import json
import os
from pathlib import Path

import pandas as pd

# Directory two levels above the resolved location of this file.
BASE_DIR = Path(__file__).resolve().parent.parent

# Shared parent of the input and output folders.
_DISAPERE_ROOT = BASE_DIR.joinpath("data", "DISAPERE-main")

# Input folder; the export code below joins a split name ("train", "dev",
# "test") onto it.
base_path = _DISAPERE_ROOT.joinpath("DISAPERE", "final_dataset")

# Folder the generated CSV files are written to.
output_path = _DISAPERE_ROOT.joinpath("SELFExtractedData")

###################################################################################
###################################################################################
# EXTRACTING POLARITY SENTENCES FROM DISAPERE DATASET
# (disabled; kept for reference)
# def extract_polarity_sentences(json_dir):
#     data = []
#     for filename in os.listdir(json_dir):
#         if filename.endswith(".json"):
#             with open(os.path.join(json_dir, filename), "r") as f:
#                 thread = json.load(f)
#             for sentence in thread.get("review_sentences", []):
#                 text = sentence.get("text", "").strip()
#                 polarity = sentence.get("polarity")
#                 if text:
#                     if polarity == "pol_positive":
#                         label = 2
#                     elif polarity == "pol_negative":
#                         label = 0
#                     else:
#                         label = 1
#                     data.append({"text": text, "label": label})
#     return pd.DataFrame(data)

# # Extract and save each split
# for split in ["train", "dev", "test"]:
#     df = extract_polarity_sentences(os.path.join(base_path, split))
#     out_file = os.path.join(output_path, f"disapere_polarity_{split}.csv")
#     df.to_csv(out_file, index=False)
#     print(f"{split.capitalize()} saved to {out_file}: {len(df)} samples")
###################################################################################
###################################################################################
# 2.
# EXTRACTING TOPIC SENTENCES FROM DISAPERE DATASET
#
# === Topic Label Mapping ===
# NOTE(review): the numbered listing directly below does NOT match the ids
# produced by ``label_map``; it looks like an earlier 13-class scheme that
# mixed review actions and aspects -- confirm before relying on it.
# 1: "Structuring"
# 0: "Evaluative"
# 2: "Request"
# 3: "Fact"
# 4: "Social"
# 5: "Other"
# 6: "Substance"
# 7: "Clarity"
# 8: "Soundness/Correctness"
# 9: "Originality"
# 10: "Motivation/Impact"
# 11: "Meaningful Comparison"
# 12: "Replicability"
#
# Ids actually written to the CSVs (position in ``topic_classes``):
# 0: "asp_substance"
# 1: "asp_clarity"
# 2: "asp_soundness-correctness"
# 3: "asp_originality"
# 4: "asp_impact"
# 5: "asp_comparison"
# 6: "asp_replicability"
# 7: "None"

# Final topic classes
topic_classes = [
    "asp_substance",
    "asp_clarity",
    "asp_soundness-correctness",
    "asp_originality",
    "asp_impact",
    "asp_comparison",
    "asp_replicability",
    "None",  # This is used for sentences that do not match any specific topic
    # "arg-structuring_summary"
]

# Class name -> integer id, assigned in list order.
label_map = {label: idx for idx, label in enumerate(topic_classes)}


def extract_topic_sentences(json_dir):
    """Collect labelled review sentences from every ``*.json`` file in a folder.

    Each file is parsed as a JSON object. For every entry under its
    ``"review_sentences"`` key, the stripped ``"text"`` is paired with the id
    of its ``"aspect"`` in ``label_map``. Aspects that are missing or not in
    ``label_map`` get the id of the ``"None"`` class. Sentences whose text is
    missing, null or blank are skipped.

    Args:
        json_dir: Directory containing the JSON files of one split.

    Returns:
        A ``pandas.DataFrame`` with columns ``"text"`` and ``"label"``, one
        row per kept sentence. The columns are present even when no sentence
        was found, so the resulting CSV always has a header.

    Raises:
        OSError: If ``json_dir`` or one of its files cannot be read.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    fallback_label = label_map["None"]
    rows = []
    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order of the output reproducible between runs and machines.
    for filename in sorted(os.listdir(json_dir)):
        if not filename.endswith(".json"):
            continue
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(os.path.join(json_dir, filename), "r", encoding="utf-8") as f:
            thread = json.load(f)
        # ``or`` also covers keys that are present but null.
        for sentence in thread.get("review_sentences") or []:
            text = (sentence.get("text") or "").strip()
            if not text:
                continue
            aspect = sentence.get("aspect", "")
            rows.append(
                {"text": text, "label": label_map.get(aspect, fallback_label)}
            )
    return pd.DataFrame(rows, columns=["text", "label"])


def main():
    """Extract the train/dev/test splits and write one CSV file per split."""
    # DataFrame.to_csv() does not create missing parent directories.
    output_path.mkdir(parents=True, exist_ok=True)
    # Extract and save each split
    for split in ["train", "dev", "test"]:
        df = extract_topic_sentences(os.path.join(base_path, split))
        out_file = os.path.join(output_path, f"disapere_topic_{split}.csv")
        df.to_csv(out_file, index=False)
        print(f"{split.capitalize()} saved to {out_file}: {len(df)} samples")


# Only run the export when executed as a script, not when imported.
if __name__ == "__main__":
    main()

###################################################################################
###################################################################################