File size: 1,696 Bytes
718e236
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# prep_datasets.py
# One-time exporter: saves PIQA + HellaSwag to clean JSONL for offline use.

from datasets import load_dataset
import json, os

# Output directory for the exported JSONL files; created up front so the
# write_jsonl calls below never fail on a missing directory.
OUT_DIR = "data"
os.makedirs(OUT_DIR, exist_ok=True)

def write_jsonl(path, rows):
    """Write each dict in *rows* to *path* as one JSON object per line (JSONL).

    Non-ASCII characters are written verbatim (ensure_ascii=False) and the
    file is always UTF-8 encoded.
    """
    with open(path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in rows
        )

# --- PIQA ---
# Fetch the PIQA dataset via the `datasets` library (downloads on first run,
# then served from the local HF cache).
print("Downloading PIQA…")
piqa = load_dataset("piqa")
def piqa_clean(split):
    """Normalize a PIQA split into plain JSON-serializable dicts.

    Missing or falsy text fields become "", and the label is coerced to int
    (defaulting to 0 when the key is absent).
    """
    return [
        {
            "goal": example.get("goal") or "",
            "sol1": example.get("sol1") or "",
            "sol2": example.get("sol2") or "",
            "label": int(example.get("label", 0)),
        }
        for example in split
    ]

# Export the train and validation splits only (the splits this script uses).
piqa_train = piqa["train"]
piqa_valid = piqa["validation"]

print("Writing PIQA JSONL…")
write_jsonl(os.path.join(OUT_DIR, "piqa_train.jsonl"), piqa_clean(piqa_train))
write_jsonl(os.path.join(OUT_DIR, "piqa_valid.jsonl"), piqa_clean(piqa_valid))

# --- HellaSwag ---
# Fetch HellaSwag via the `datasets` library (cached locally after the
# first download).
print("Downloading HellaSwag…")
hs = load_dataset("hellaswag")
def hs_clean(split):
    """Normalize a HellaSwag split into plain JSON-serializable dicts.

    Missing or falsy text fields become "", endings become a plain list,
    and the label is coerced to int, defaulting to 0 when absent or empty.

    Note: HellaSwag's "label" field is a string ("0".."3") and is empty ("")
    on the unlabeled test split; the original `int(ex.get("label", 0))`
    raised ValueError on "" and TypeError on None, so the coercion is
    guarded here.
    """
    out = []
    for ex in split:
        raw_label = ex.get("label")
        out.append({
            # keep both ctx and ctx_a to be safe (some variants use both)
            "ctx": ex.get("ctx") or "",
            "ctx_a": ex.get("ctx_a") or "",
            "endings": list(ex.get("endings") or []),
            # guard: "" / None / missing all map to 0 instead of raising
            "label": int(raw_label) if raw_label not in (None, "") else 0,
        })
    return out

# Export the train and validation splits only (the splits this script uses).
hs_train = hs["train"]
hs_valid = hs["validation"]

print("Writing HellaSwag JSONL…")
write_jsonl(os.path.join(OUT_DIR, "hellaswag_train.jsonl"), hs_clean(hs_train))
write_jsonl(os.path.join(OUT_DIR, "hellaswag_valid.jsonl"), hs_clean(hs_valid))

print("Done. Files created in ./data")