|
import json, sys |
|
from pathlib import Path |
|
from typing import List, Dict, Any |
|
|
|
|
|
|
|
|
|
|
|
def load_gaia_questions(path: Path) -> set[str]:
    """Return the set of question strings found in a GAIA file.

    The file is parsed as a single regular JSON document and iterated as an
    array of records.  Each record's ``"question"`` value is stripped of
    surrounding whitespace before it is added to the result.

    Args:
        path: Location of the GAIA JSON file (read as UTF-8).

    Returns:
        The distinct, stripped, non-empty question strings.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with path.open("r", encoding="utf-8") as f:
        data: List[Dict[str, Any]] = json.load(f)

    questions: set[str] = set()
    for rec in data:
        # Tolerate malformed entries instead of crashing on .get()/.strip().
        if not isinstance(rec, dict):
            continue
        question = rec.get("question")
        if not isinstance(question, str):
            continue
        text = question.strip()
        # Filter AFTER stripping: a whitespace-only question must not put ""
        # into the set, or blank questions elsewhere would count as a match.
        if text:
            questions.add(text)
    return questions
|
|
|
|
|
def load_validation_records(path: Path) -> List[Dict[str, Any]]:
    """Return the full dict records from a newline-delimited JSON file.

    Every non-blank line is parsed as an independent JSON document.  Lines
    that fail to parse, or that parse to something other than a JSON object,
    are reported on stderr and skipped so that one bad line does not abort
    the whole load.

    Args:
        path: Location of the newline-delimited JSON file (read as UTF-8).

    Returns:
        The parsed objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    records: List[Dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as f:
        for lineno, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                continue
            # Keep the try body minimal: only the parse itself can raise here.
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(
                    f"Warning: could not parse line {lineno} in {path}: {e}",
                    file=sys.stderr,
                )
                continue
            # Valid JSON that is not an object (e.g. ``42`` or ``[1, 2]``)
            # would break the declared return type and callers using .get().
            if not isinstance(obj, dict):
                print(
                    f"Warning: skipping non-object record on line {lineno} in {path}",
                    file=sys.stderr,
                )
                continue
            records.append(obj)
    return records
|
|
|
|
|
|
|
|
|
|
|
def main(
    gaia_path: str = "question_set/gaia_questions.json",
    validation_path: str = "question_set/validation.json",
    output_path: str = "question_set/common_questions.json",
) -> None:
    """Write the validation records whose question also appears in the GAIA file.

    A validation record is kept when its ``"question"`` (or, failing that,
    ``"Question"``) value, stripped of surrounding whitespace, is a member of
    the set returned by ``load_gaia_questions``.  Kept records are written
    unchanged, as an indented JSON array, to ``output_path``; the parent
    directory is created if needed.

    Args:
        gaia_path: Path of the GAIA questions file.
        validation_path: Path of the validation records file.
        output_path: Path of the JSON file to write.

    Raises:
        SystemExit: If either input file does not exist.
    """
    gaia_file = Path(gaia_path)
    val_file = Path(validation_path)

    # Fail early, with a readable message, when an input is missing.
    for required in (gaia_file, val_file):
        if not required.exists():
            sys.exit(f"Error: {required} not found")

    gaia_questions = load_gaia_questions(gaia_file)
    validation_recs = load_validation_records(val_file)

    common: List[Dict[str, Any]] = []
    for rec in validation_recs:
        # Skip anything that is not a mapping rather than crash on .get().
        if not isinstance(rec, dict):
            continue
        q = rec.get("question") or rec.get("Question")
        # A non-string question cannot be stripped or matched.
        if not isinstance(q, str):
            continue
        text = q.strip()
        # Require non-empty text so a blank question never matches a blank
        # entry that may be present in the GAIA set.
        if text and text in gaia_questions:
            common.append(rec)

    out_path = Path(output_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with out_path.open("w", encoding="utf-8") as f:
        json.dump(common, f, indent=2, ensure_ascii=False)

    print(f"Extracted {len(common)} record(s) – full validation fields kept –> {out_path}")
|
|
|
|
|
if __name__ == "__main__":
    # Optional positional arguments, in order:
    #   gaia_path, validation_path, output_path
    # Any that are omitted fall back to main()'s defaults.
    cli_args = sys.argv[1:]
    if len(cli_args) > 3:
        # main() takes at most three parameters; give a usage message
        # instead of letting the call fail with a TypeError traceback.
        sys.exit(f"Usage: {sys.argv[0]} [gaia_path] [validation_path] [output_path]")
    main(*cli_args)
|
|