ReView / glimpse-ui /data /ExtractDISAPEREData.py
Sina1138's picture
Super-squash branch 'main' using huggingface_hub
6fe7180
import os
import json
import pandas as pd
from pathlib import Path
# Directory one level above the folder that contains this script.
BASE_DIR = Path(__file__).resolve().parent.parent

# Root of the unpacked DISAPERE repository, shared by the two paths below.
_DISAPERE_ROOT = BASE_DIR.joinpath("data", "DISAPERE-main")

# Input: folder holding one sub-directory of JSON files per dataset split.
base_path = _DISAPERE_ROOT.joinpath("DISAPERE", "final_dataset")

# Output: folder that receives the generated CSV files.
output_path = _DISAPERE_ROOT.joinpath("SELFExtractedData")
###################################################################################
###################################################################################
# EXTRACTING POLARITY SENTENCES FROM DISAPERE DATASET
# def extract_polarity_sentences(json_dir):
# data = []
# for filename in os.listdir(json_dir):
# if filename.endswith(".json"):
# with open(os.path.join(json_dir, filename), "r") as f:
# thread = json.load(f)
# for sentence in thread.get("review_sentences", []):
# text = sentence.get("text", "").strip()
# polarity = sentence.get("polarity")
# if text:
# if polarity == "pol_positive":
# label = 2
# elif polarity == "pol_negative":
# label = 0
# else:
# label = 1
# data.append({"text": text, "label": label})
# return pd.DataFrame(data)
# # Extract and save each split
# for split in ["train", "dev", "test"]:
# df = extract_polarity_sentences(os.path.join(base_path, split))
# out_file = os.path.join(output_path, f"disapere_polarity_{split}.csv")
# df.to_csv(out_file, index=False)
# print(f"{split.capitalize()} saved to {out_file}: {len(df)} samples")
###################################################################################
###################################################################################
# 2. EXTRACTING TOPIC SENTENCES FROM DISAPERE DATASET
#
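# === Topic Label Mapping ===
# Label ids are the positions of the tags in `topic_classes` below:
# 0: "asp_substance" (Substance)
# 1: "asp_clarity" (Clarity)
# 2: "asp_soundness-correctness" (Soundness/Correctness)
# 3: "asp_originality" (Originality)
# 4: "asp_impact" (Motivation/Impact)
# 5: "asp_comparison" (Meaningful Comparison)
# 6: "asp_replicability" (Replicability)
# 7: "None" (sentence has no matching aspect)
#
# NOTE: an earlier 13-class scheme is NOT what the code below produces:
# 0: "Evaluative", 1: "Structuring", 2: "Request", 3: "Fact", 4: "Social",
# 5: "Other", followed by the seven aspects above as ids 6-12.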
# Final topic classes
# Aspect tags used as classification targets. A tag's position in this list
# is its integer class id, so the order must not be changed casually.
topic_classes = [
    "asp_substance",
    "asp_clarity",
    "asp_soundness-correctness",
    "asp_originality",
    "asp_impact",
    "asp_comparison",
    "asp_replicability",
    # Catch-all bucket for sentences whose aspect is none of the tags above.
    "None",
    # Disabled candidate class:
    # "arg-structuring_summary"
]

# Lookup table: aspect tag -> integer class id.
label_map = dict(zip(topic_classes, range(len(topic_classes))))
def extract_topic_sentences(json_dir, labels=None):
    """Collect labelled review sentences from every JSON file in a directory.

    Each ``*.json`` file is expected to hold a mapping with a
    ``"review_sentences"`` list; every entry contributes its stripped
    ``"text"`` and a class id derived from its ``"aspect"`` tag.

    Args:
        json_dir: Directory containing the thread JSON files. Entries that do
            not end in ``.json`` are ignored.
        labels: Optional mapping from aspect tag to integer class id. When
            omitted, the module-level ``label_map`` is used. Aspects missing
            from the mapping fall back to the ``"None"`` key; if the mapping
            has no ``"None"`` key, such sentences are skipped.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``label``, one
        row per non-empty sentence. The columns are present even when no
        sentence was found.
    """
    mapping = label_map if labels is None else labels
    rows = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the result reproducible across runs and platforms.
    for filename in sorted(os.listdir(json_dir)):
        if not filename.endswith(".json"):
            continue

        # Read as UTF-8 explicitly rather than relying on the platform's
        # default text encoding.
        with open(os.path.join(json_dir, filename), "r", encoding="utf-8") as f:
            thread = json.load(f)

        # `or` guards against keys that are present but null in the JSON.
        for sentence in thread.get("review_sentences") or []:
            text = (sentence.get("text") or "").strip()
            aspect = sentence.get("aspect", "")
            # fine_action = sentence.get("fine_review_action", "")

            # Unknown or missing aspects are routed to the catch-all class.
            topic = aspect if aspect in mapping else "None"
            if text and topic in mapping:
                rows.append({"text": text, "label": mapping[topic]})

    # Explicit columns keep the schema stable for an empty result, so the
    # CSV written from it still gets a header row.
    return pd.DataFrame(rows, columns=["text", "label"])
# Extract every dataset split and write one CSV per split.
# The output directory is created up front: DataFrame.to_csv() does not create
# missing parent directories, so the first run on a fresh checkout would fail.
output_path.mkdir(parents=True, exist_ok=True)
for split in ["train", "dev", "test"]:
    df = extract_topic_sentences(os.path.join(base_path, split))
    out_file = os.path.join(output_path, f"disapere_topic_{split}.csv")
    df.to_csv(out_file, index=False)
    print(f"{split.capitalize()} saved to {out_file}: {len(df)} samples")
###################################################################################
###################################################################################