# data_utils.py — local-first dataset loaders + hashing vectorizer
from typing import List, Tuple
import os, json, hashlib
import numpy as np
try:
    from datasets import load_dataset  # optional, used only as fallback
except Exception:
    load_dataset = None
# -----------------------------
# Hashing vectorizer (unigram + bigram)
# -----------------------------
def _stable_hash(s: str, seed: int) -> int:
    # Deterministic across processes (the built-in hash() is salted per
    # interpreter run unless PYTHONHASHSEED is fixed), and it uses `seed`.
    digest = hashlib.blake2b(f"{seed}:{s}".encode("utf-8"), digest_size=8).digest()
    return int.from_bytes(digest, "little")
def hash_vectorize(texts: List[str], n_features: int = 4096, seed: int = 1234) -> np.ndarray:
    """Hash unigram + bigram tokens into an L2-normalized bag-of-features matrix."""
    n = len(texts)
    X = np.zeros((n, n_features), dtype=np.float32)
    for i, t in enumerate(texts):
        if not t:
            continue
        toks = t.lower().split()
        prev = None
        for tok in toks:
            h1 = _stable_hash(tok, seed) % n_features
            X[i, h1] += 1.0
            if prev is not None:
                bg = prev + "_" + tok
                h2 = _stable_hash(bg, seed) % n_features
                X[i, h2] += 1.0
            prev = tok
        norm = float(np.linalg.norm(X[i])) + 1e-8
        X[i] /= norm
    return X
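# Example (sketch, not used by the loaders): vectorize two short strings.
#   X = hash_vectorize(["the cat sat", "a dog ran"], n_features=64)
#   X.shape -> (2, 64); each non-empty row has (approximately) unit L2 norm.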
# -----------------------------
# Utilities for local JSONL
# -----------------------------
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
def _read_jsonl(path: str):
    out = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                out.append(json.loads(line))
    return out
def _has_local(*names: str) -> bool:
    return all(os.path.exists(os.path.join(DATA_DIR, n)) for n in names)
# -----------------------------
# PIQA loader (pair-expanded)
# -----------------------------
def load_piqa(subset: int = 800, seed: int = 42):
    """
    Returns:
        Xtr_txt, ytr, Xva_txt, yva
    For each original PIQA example, we emit TWO rows:
        [goal + sol1] with label 1 if sol1 is correct, else 0
        [goal + sol2] with label 1 if sol2 is correct, else 0
    """
    rng = np.random.RandomState(seed)
    # Prefer local files
    tr_name, va_name = "piqa_train.jsonl", "piqa_valid.jsonl"
    if _has_local(tr_name, va_name):
        tr = _read_jsonl(os.path.join(DATA_DIR, tr_name))
        va = _read_jsonl(os.path.join(DATA_DIR, va_name))
    else:
        # Fallback to datasets (if available)
        if load_dataset is None:
            raise RuntimeError("PIQA local files not found and 'datasets' not installed.")
        ds = load_dataset("piqa")
        tr, va = list(ds["train"]), list(ds["validation"])
    # subsample without replacement
    idx_tr = rng.choice(len(tr), size=min(subset, len(tr)), replace=False)
    idx_va = rng.choice(len(va), size=min(max(subset // 4, 200), len(va)), replace=False)
    def pack(rows, idxs):
        X_text, y = [], []
        for k in idxs:
            p = rows[k]
            stem = (p.get("goal") or "").strip()
            sol1 = (p.get("sol1") or "").strip()
            sol2 = (p.get("sol2") or "").strip()
            label = int(p.get("label", 0))
            X_text.append(f"{stem} {sol1}"); y.append(1 if label == 0 else 0)
            X_text.append(f"{stem} {sol2}"); y.append(1 if label == 1 else 0)
        return X_text, np.array(y, dtype=np.int64)
    Xtr_txt, ytr = pack(tr, idx_tr)
    Xva_txt, yva = pack(va, idx_va)
    return Xtr_txt, ytr, Xva_txt, yva
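# Example (sketch; assumes the train split has at least `subset` examples):
#   Xtr_txt, ytr, _, _ = load_piqa(subset=100)
#   len(Xtr_txt) == 200 and int(ytr.sum()) == 100  # two rows per example, one correct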
# -----------------------------
# HellaSwag loader (4-way expanded)
# -----------------------------
def load_hellaswag(subset: int = 800, seed: int = 42):
    """
    Returns:
        Xtr_txt, ytr, Xva_txt, yva
    For each example, we emit FOUR rows:
        [context + ending_i] with label 1 if i is the correct ending, else 0
    """
    rng = np.random.RandomState(seed)
    tr_name, va_name = "hellaswag_train.jsonl", "hellaswag_valid.jsonl"
    if _has_local(tr_name, va_name):
        tr = _read_jsonl(os.path.join(DATA_DIR, tr_name))
        va = _read_jsonl(os.path.join(DATA_DIR, va_name))
    else:
        if load_dataset is None:
            raise RuntimeError("HellaSwag local files not found and 'datasets' not installed.")
        ds = load_dataset("hellaswag")
        tr, va = list(ds["train"]), list(ds["validation"])
    idx_tr = rng.choice(len(tr), size=min(subset, len(tr)), replace=False)
    idx_va = rng.choice(len(va), size=min(max(subset // 4, 200), len(va)), replace=False)
    def pack(rows, idxs):
        X_text, y = [], []
        for k in idxs:
            p = rows[k]
            ctx = f"{(p.get('ctx') or '')} {(p.get('ctx_a') or '')}".strip()
            endings = p.get("endings") or []
            label = int(p.get("label", 0))
            for i, e in enumerate(endings):
                X_text.append(f"{ctx} {e}".strip())
                y.append(1 if i == label else 0)
        return X_text, np.array(y, dtype=np.int64)
    Xtr_txt, ytr = pack(tr, idx_tr)
    Xva_txt, yva = pack(va, idx_va)
    return Xtr_txt, ytr, Xva_txt, yva
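# -----------------------------
# Smoke test (a minimal sketch; it assumes either the local JSONL files under
# ./data or the optional `datasets` fallback are available)
# -----------------------------
if __name__ == "__main__":
    Xtr_txt, ytr, Xva_txt, yva = load_piqa(subset=50)
    Xtr = hash_vectorize(Xtr_txt, n_features=1024)
    print("PIQA      train:", Xtr.shape, "positives:", int(ytr.sum()))
    Xtr_txt, ytr, Xva_txt, yva = load_hellaswag(subset=50)
    Xtr = hash_vectorize(Xtr_txt, n_features=1024)
    print("HellaSwag train:", Xtr.shape, "positives:", int(ytr.sum()))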