File size: 3,992 Bytes
6fe7180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import json
import pandas as pd
from pathlib import Path

# Project root: two directory levels above this file's resolved location.
BASE_DIR = Path(__file__).resolve().parent.parent
# Directory holding the DISAPERE split folders that are read as input.
base_path = BASE_DIR.joinpath("data", "DISAPERE-main", "DISAPERE", "final_dataset")
# Directory that receives the extracted CSV files.
output_path = BASE_DIR.joinpath("data", "DISAPERE-main", "SELFExtractedData")

###################################################################################
###################################################################################

# 1. EXTRACTING POLARITY SENTENCES FROM DISAPERE DATASET
#    (currently disabled: the implementation below is kept commented out)

# def extract_polarity_sentences(json_dir):
#     data = []
#     for filename in os.listdir(json_dir):
#         if filename.endswith(".json"):
#             with open(os.path.join(json_dir, filename), "r") as f:
#                 thread = json.load(f)
#                 for sentence in thread.get("review_sentences", []):
#                     text = sentence.get("text", "").strip()
#                     polarity = sentence.get("polarity")
#                     if text:
#                         if polarity == "pol_positive":
#                             label = 2
#                         elif polarity == "pol_negative":
#                             label = 0
#                         else:
#                             label = 1
#                         data.append({"text": text, "label": label})
#     return pd.DataFrame(data)

# # Extract and save each split
# for split in ["train", "dev", "test"]:
#     df = extract_polarity_sentences(os.path.join(base_path, split))
#     out_file = os.path.join(output_path, f"disapere_polarity_{split}.csv")
#     df.to_csv(out_file, index=False)
#     print(f"{split.capitalize()} saved to {out_file}: {len(df)} samples")


###################################################################################
###################################################################################

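# 2. EXTRACTING TOPIC SENTENCES FROM DISAPERE DATASET
#
# === Topic Label Mapping ===
# NOTE(review): the indices in this table are NOT the labels written to the
# CSV files. The extraction code below numbers labels by position in
# `topic_classes` (0-7), so e.g. "asp_substance" is emitted as 0, not 6.
# This table appears to describe an earlier/broader label scheme -- TODO
# confirm and update or remove.
# 0: "Evaluative"
# 1: "Structuring"
# 2: "Request"
# 3: "Fact"
# 4: "Social"
# 5: "Other"
# 6: "Substance"
# 7: "Clarity"
# 8: "Soundness/Correctness"
# 9: "Originality"
# 10: "Motivation/Impact"
# 11: "Meaningful Comparison"
# 12: "Replicability"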

# Aspect tags recognised by the topic extractor, in label order: a tag's
# position in this list is the integer label assigned to it in `label_map`.
topic_classes = [
    "asp_substance",
    "asp_clarity",
    "asp_soundness-correctness",
    "asp_originality",
    "asp_impact",
    "asp_comparison",
    "asp_replicability",
    # Catch-all bucket for sentences that do not match any specific topic.
    "None",
    # "arg-structuring_summary"  (disabled)
]

# Tag -> integer label, derived from the tag's position in `topic_classes`.
label_map = dict(zip(topic_classes, range(len(topic_classes))))

def extract_topic_sentences(json_dir, labels=None):
    """Collect labelled review sentences from every ``*.json`` file in a directory.

    Each JSON file is expected to hold an object with a ``"review_sentences"``
    list; every entry contributes its stripped ``"text"`` and a label looked
    up from its ``"aspect"`` value. Aspects that are not keys of the mapping
    fall back to the ``"None"`` class; if the mapping has no ``"None"`` key
    such sentences are skipped. Sentences whose text is missing, ``null`` or
    blank are skipped.

    Args:
        json_dir: Directory containing the thread JSON files. Entries whose
            name does not end in ``.json`` are ignored.
        labels: Optional mapping from aspect tag to integer label. Defaults
            to the module-level ``label_map``.

    Returns:
        A ``pandas.DataFrame`` with columns ``text`` and ``label`` (present
        even when no sentence was collected), one row per kept sentence,
        ordered by file name and then by position within the file.

    Raises:
        OSError: If ``json_dir`` or one of its files cannot be read.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    mapping = label_map if labels is None else labels
    rows = []
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the resulting frame (and of the CSV built from it) is reproducible.
    for filename in sorted(os.listdir(json_dir)):
        if not filename.endswith(".json"):
            continue
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale-dependent default encoding.
        with open(os.path.join(json_dir, filename), "r", encoding="utf-8") as f:
            thread = json.load(f)
        for sentence in thread.get("review_sentences", []):
            # `or ""` also covers an explicit null text value.
            text = (sentence.get("text") or "").strip()
            if not text:
                continue
            aspect = sentence.get("aspect", "")
            topic = aspect if aspect in mapping else "None"
            # The fallback class itself may have been removed from the mapping.
            if topic in mapping:
                rows.append({"text": text, "label": mapping[topic]})
    # Fix the columns so an empty result still has the expected schema.
    return pd.DataFrame(rows, columns=["text", "label"])

# Extract and save each split.
# The output directory may not exist yet, and DataFrame.to_csv does not create
# missing parent directories, so make sure it is there before writing.
os.makedirs(output_path, exist_ok=True)
for split in ["train", "dev", "test"]:
    # Each split lives in its own sub-directory of `base_path`.
    df = extract_topic_sentences(os.path.join(base_path, split))
    out_file = os.path.join(output_path, f"disapere_topic_{split}.csv")
    # index=False keeps the CSV to the frame's own columns only.
    df.to_csv(out_file, index=False)
    print(f"{split.capitalize()} saved to {out_file}: {len(df)} samples")

###################################################################################
###################################################################################