# prep_datasets.py
# One-time exporter: saves PIQA + HellaSwag to clean JSONL for offline use.
#
# Run as a script: `python prep_datasets.py`. All downloading and file writing
# happens in main(), behind the __main__ guard, so importing this module only
# defines the helpers below and has no side effects.

import json
import os

OUT_DIR = "data"


def write_jsonl(path, rows):
    """Write *rows* to *path* as JSON Lines (one JSON document per line).

    The file is created or truncated and encoded as UTF-8; non-ASCII
    characters are written as-is instead of being ASCII-escaped.

    Args:
        path: Destination file path.
        rows: Iterable of JSON-serializable objects.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in rows)


def piqa_clean(split):
    """Return the PIQA examples in *split* as a list of plain dicts.

    Each result has the keys ``goal``, ``sol1``, ``sol2`` and ``label``.
    Missing or falsy text fields become ``""``; ``label`` is coerced with
    ``int()`` and is 0 when the key is absent.

    Args:
        split: Iterable of mapping-like examples (anything with ``.get``).
    """
    return [
        {
            "goal": ex.get("goal") or "",
            "sol1": ex.get("sol1") or "",
            "sol2": ex.get("sol2") or "",
            "label": int(ex.get("label", 0)),
        }
        for ex in split
    ]


def hs_clean(split):
    """Return the HellaSwag examples in *split* as a list of plain dicts.

    Each result has the keys ``ctx``, ``ctx_a``, ``endings`` and ``label``.
    Missing or falsy text fields become ``""``, ``endings`` is copied into a
    list (empty when missing or falsy), and ``label`` is coerced with
    ``int()`` and is 0 when the key is absent.

    Args:
        split: Iterable of mapping-like examples (anything with ``.get``).
    """
    return [
        {
            # keep both ctx and ctx_a to be safe (some variants use both)
            "ctx": ex.get("ctx") or "",
            "ctx_a": ex.get("ctx_a") or "",
            "endings": list(ex.get("endings") or []),
            "label": int(ex.get("label", 0)),
        }
        for ex in split
    ]


def main():
    """Download PIQA and HellaSwag and export train/validation to OUT_DIR."""
    # Imported here rather than at module top: only main() needs the
    # third-party `datasets` package, so the pure helpers above stay
    # importable without it.
    from datasets import load_dataset

    os.makedirs(OUT_DIR, exist_ok=True)

    # --- PIQA ---
    print("Downloading PIQA…")
    piqa = load_dataset("piqa")
    # Look up both splits before writing anything, so a missing split fails
    # before any file is created.
    piqa_train = piqa["train"]
    piqa_valid = piqa["validation"]
    print("Writing PIQA JSONL…")
    write_jsonl(os.path.join(OUT_DIR, "piqa_train.jsonl"), piqa_clean(piqa_train))
    write_jsonl(os.path.join(OUT_DIR, "piqa_valid.jsonl"), piqa_clean(piqa_valid))

    # --- HellaSwag ---
    print("Downloading HellaSwag…")
    hs = load_dataset("hellaswag")
    hs_train = hs["train"]
    hs_valid = hs["validation"]
    print("Writing HellaSwag JSONL…")
    write_jsonl(os.path.join(OUT_DIR, "hellaswag_train.jsonl"), hs_clean(hs_train))
    write_jsonl(os.path.join(OUT_DIR, "hellaswag_valid.jsonl"), hs_clean(hs_valid))

    print("Done. Files created in ./data")


if __name__ == "__main__":
    main()