# Spaces: Sina1138 / ReView (Running) | File size: 3,992 Bytes | 6fe7180

import os
import json
import pandas as pd
from pathlib import Path

# Root used for all data paths: the parent of the directory containing this file.
BASE_DIR = Path(__file__).resolve().parent.parent
# Directory holding the per-split sub-folders of DISAPERE JSON files.
base_path = BASE_DIR.joinpath("data", "DISAPERE-main", "DISAPERE", "final_dataset")
# Directory that receives the extracted CSV files.
output_path = BASE_DIR.joinpath("data", "DISAPERE-main", "SELFExtractedData")

###################################################################################
###################################################################################

# 1. EXTRACTING POLARITY SENTENCES FROM DISAPERE DATASET
# (disabled: the code in this section is commented out)

# def extract_polarity_sentences(json_dir):
#     data = []
#     for filename in os.listdir(json_dir):
#         if filename.endswith(".json"):
#             with open(os.path.join(json_dir, filename), "r") as f:
#                 thread = json.load(f)
#                 for sentence in thread.get("review_sentences", []):
#                     text = sentence.get("text", "").strip()
#                     polarity = sentence.get("polarity")
#                     if text:
#                         if polarity == "pol_positive":
#                             label = 2
#                         elif polarity == "pol_negative":
#                             label = 0
#                         else:
#                             label = 1
#                         data.append({"text": text, "label": label})
#     return pd.DataFrame(data)

# # Extract and save each split
# for split in ["train", "dev", "test"]:
#     df = extract_polarity_sentences(os.path.join(base_path, split))
#     out_file = os.path.join(output_path, f"disapere_polarity_{split}.csv")
#     df.to_csv(out_file, index=False)
#     print(f"{split.capitalize()} saved to {out_file}: {len(df)} samples")


###################################################################################
###################################################################################

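# 2. EXTRACTING TOPIC SENTENCES FROM DISAPERE DATASET
#
# === Topic Label Mapping ===
# NOTE: this 13-entry list does NOT match the labels written by the code
# below. The CSV labels are the index of each entry in `topic_classes`
# (0-7, see `label_map`).
# 1: "Structuring"
# 0: "Evaluative"
# 2: "Request"
# 3: "Fact"
# 4: "Social"
# 5: "Other"
# 6: "Substance"
# 7: "Clarity"
# 8: "Soundness/Correctness"
# 9: "Originality"
# 10: "Motivation/Impact"
# 11: "Meaningful Comparison"
# 12: "Replicability"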

# Aspect classes used as classification targets. The position of a class in
# this list is the integer label assigned to it.
topic_classes = [
    "asp_substance",
    "asp_clarity",
    "asp_soundness-correctness",
    "asp_originality",
    "asp_impact",
    "asp_comparison",
    "asp_replicability",
    # Catch-all class for sentences that do not match any specific topic.
    "None",
    # "arg-structuring_summary"
]

# Class name -> integer label (its index in topic_classes).
label_map = dict(zip(topic_classes, range(len(topic_classes))))

def extract_topic_sentences(json_dir):
    """Collect labelled review sentences from every ``.json`` file in a directory.

    Each file is parsed as JSON and its ``"review_sentences"`` list (if
    present) is scanned. A sentence's ``"aspect"`` value is translated to an
    integer through the module-level ``label_map``; aspects that are not keys
    of that map fall back to the ``"None"`` class. Sentences whose text is
    missing, null or blank after stripping are skipped.

    Args:
        json_dir: Path of the directory that holds the ``.json`` files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` (stripped sentence
        text) and ``label`` (integer class id), one row per kept sentence.
        Both columns are present even when no sentence was found, so a CSV
        written from the result always has a header.
    """
    rows = []
    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order of the result reproducible across runs and machines.
    for filename in sorted(os.listdir(json_dir)):
        if not filename.endswith(".json"):
            continue
        # Explicit encoding: without it open() uses the locale's default,
        # which is not UTF-8 on every platform.
        with open(os.path.join(json_dir, filename), "r", encoding="utf-8") as f:
            thread = json.load(f)
        for sentence in thread.get("review_sentences", []):
            # `or ""` also covers an explicit null text, not just a missing key.
            text = (sentence.get("text") or "").strip()
            if not text:
                continue
            aspect = sentence.get("aspect", "")
            # fine_action = sentence.get("fine_review_action", "")
            # Unknown aspects are folded into the catch-all "None" class.
            label = label_map.get(aspect, label_map["None"])
            rows.append({"text": text, "label": label})
    return pd.DataFrame(rows, columns=["text", "label"])

# Extract and save each split
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs(output_path, exist_ok=True)
for split in ["train", "dev", "test"]:
    # Each split lives in its own sub-directory of base_path.
    df = extract_topic_sentences(os.path.join(base_path, split))
    out_file = os.path.join(output_path, f"disapere_topic_{split}.csv")
    df.to_csv(out_file, index=False)
    print(f"{split.capitalize()} saved to {out_file}: {len(df)} samples")

###################################################################################
###################################################################################