import os
import json
import pandas as pd
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent
base_path = BASE_DIR / "data" / "DISAPERE-main" / "DISAPERE" / "final_dataset"
output_path = BASE_DIR / "data" / "DISAPERE-main" / "SELFExtractedData"
output_path.mkdir(parents=True, exist_ok=True)  # make sure the output directory exists before writing CSVs
###################################################################################
###################################################################################
# 1. EXTRACTING POLARITY SENTENCES FROM DISAPERE DATASET
# def extract_polarity_sentences(json_dir):
#     data = []
#     for filename in os.listdir(json_dir):
#         if filename.endswith(".json"):
#             with open(os.path.join(json_dir, filename), "r") as f:
#                 thread = json.load(f)
#             for sentence in thread.get("review_sentences", []):
#                 text = sentence.get("text", "").strip()
#                 polarity = sentence.get("polarity")
#                 if text:
#                     if polarity == "pol_positive":
#                         label = 2
#                     elif polarity == "pol_negative":
#                         label = 0
#                     else:
#                         label = 1
#                     data.append({"text": text, "label": label})
#     return pd.DataFrame(data)
#
# # Extract and save each split
# for split in ["train", "dev", "test"]:
#     df = extract_polarity_sentences(os.path.join(base_path, split))
#     out_file = os.path.join(output_path, f"disapere_polarity_{split}.csv")
#     df.to_csv(out_file, index=False)
#     print(f"{split.capitalize()} saved to {out_file}: {len(df)} samples")
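#
# # Optional sanity check (sketch): only meaningful once the polarity block above
# # has been uncommented and run, so that the disapere_polarity_*.csv files exist.
# # It reloads each split and prints the label distribution
# # (0 = negative, 1 = neutral/other, 2 = positive, per the mapping above).
# for split in ["train", "dev", "test"]:
#     csv_file = os.path.join(output_path, f"disapere_polarity_{split}.csv")
#     if os.path.exists(csv_file):
#         counts = pd.read_csv(csv_file)["label"].value_counts().sort_index()
#         print(f"{split}: {counts.to_dict()}")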
###################################################################################
###################################################################################
# 2. EXTRACTING TOPIC SENTENCES FROM DISAPERE DATASET
#
# === Topic Label Mapping (as produced by label_map below) ===
# 0: "asp_substance"             -> Substance
# 1: "asp_clarity"               -> Clarity
# 2: "asp_soundness-correctness" -> Soundness/Correctness
# 3: "asp_originality"           -> Originality
# 4: "asp_impact"                -> Motivation/Impact
# 5: "asp_comparison"            -> Meaningful Comparison
# 6: "asp_replicability"         -> Replicability
# 7: "None"                      -> no specific aspect annotated
# Final topic classes
topic_classes = [
    "asp_substance",
    "asp_clarity",
    "asp_soundness-correctness",
    "asp_originality",
    "asp_impact",
    "asp_comparison",
    "asp_replicability",
    "None",  # used for sentences that do not match any specific topic
    # "arg-structuring_summary"
]
label_map = {label: idx for idx, label in enumerate(topic_classes)}
def extract_topic_sentences(json_dir):
    data = []
    for filename in os.listdir(json_dir):
        if filename.endswith(".json"):
            with open(os.path.join(json_dir, filename), "r") as f:
                thread = json.load(f)
            for sentence in thread.get("review_sentences", []):
                text = sentence.get("text", "").strip()
                aspect = sentence.get("aspect", "")
                # fine_action = sentence.get("fine_review_action", "")
                # Decide label source: fall back to "None" when the aspect is empty
                # or not one of the final topic classes
                topic = aspect if aspect in label_map else "None"
                if text and topic in label_map:
                    label = label_map[topic]
                    data.append({"text": text, "label": label})
    return pd.DataFrame(data)
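# Minimal illustration (hypothetical sentence dict, not taken from the dataset files)
# of the mapping/fallback logic used inside extract_topic_sentences():
# example = {"text": "The experiments are thorough.", "aspect": "asp_substance"}
# topic = example["aspect"] if example["aspect"] in label_map else "None"
# label_map[topic]   # -> 0 ("asp_substance")
# label_map["None"]  # -> 7, the fallback label for empty or unmapped aspects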
# Extract and save each split
for split in ["train", "dev", "test"]:
    df = extract_topic_sentences(base_path / split)
    out_file = output_path / f"disapere_topic_{split}.csv"
    df.to_csv(out_file, index=False)
    print(f"{split.capitalize()} saved to {out_file}: {len(df)} samples")
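# Optional sanity check (sketch, assumes the loop above completed without errors):
# reload one split and confirm every label falls inside the range of topic_classes.
# check_df = pd.read_csv(output_path / "disapere_topic_train.csv")
# assert check_df["label"].between(0, len(topic_classes) - 1).all()
# print(check_df["label"].value_counts().sort_index())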
###################################################################################
###################################################################################