firstAI / scripts /generate_ultrachat_sft.py
ndc8
Add scripts for converting and generating UltraChat-style SFT dataset
7ecd130
raw
history blame
6.89 kB
#!/usr/bin/env python3
"""
Generate a synthetic UltraChat-style SFT dataset.
Each JSONL line has:
{
"prompt": "<first user message>",
"messages": [{"content": "...", "role": "user"}, {"content": "...", "role": "assistant"}, ...],
"prompt_id": "<sha256 hex of prompt>"
}
Usage:
python scripts/generate_ultrachat_sft.py --out sample_data/train_sft.jsonl --n 500
Only uses Python standard library.
"""
from __future__ import annotations
import argparse
import hashlib
import json
import os
import random
import sys
from typing import Any, Dict, List, Sequence
# Subject areas a dialogue can be about. One entry is drawn per dialogue, so
# the order here affects which topic a given seed produces.
TOPICS: List[str] = [
    "travel planning",
    "cooking and recipes",
    "software engineering",
    "data science",
    "machine learning",
    "mathematics",
    "history",
    "literature",
    "productivity",
    "fitness and health",
    "economics",
    "marketing",
    "photography",
    "music theory",
    "language learning",
    "gardening",
    "home improvement",
    "career advice",
    "public speaking",
    "time management",
    "networking",
    "resume writing",
    "interview prep",
    "cloud computing",
    "kubernetes",
    "devops",
    "cybersecurity",
    "robotics",
    "electronics",
    "astronomy",
    "climate and environment",
    "education",
    "parenting",
    "pet care",
    "mental wellness",
    "philosophy",
    "ethics",
    "UX design",
    "UI design",
    "copywriting",
    "game design",
    "board games",
    "coffee brewing",
    "tea brewing",
    "urban planning",
    "transportation",
    "sports science",
    "statistics",
]
# Templates for the first user message. Each contains a ``{topic}``
# placeholder that is filled in with ``str.format(topic=...)``.
USER_OPENERS = [
    "Could you help me with {topic}?",
    "What are the key steps to get started with {topic}?",
    "Give me a concise plan to improve at {topic} over 4 weeks.",
    "Explain a few common pitfalls in {topic} and how to avoid them.",
    "Draft a checklist for beginners in {topic}.",
    "Compare two approaches commonly used in {topic} and when to choose each.",
    # The dash below is a real em dash (U+2014); it had been corrupted into
    # mojibake, which leaked into generated prompts and their prompt_id hashes.
    "I have 2 hours to learn about {topic} today—what should I do?",
    "What metrics matter most in {topic} and how do I track them?",
    "Summarize an actionable framework for {topic} with examples.",
]
# Second-turn user messages. make_dialogue() checks the chosen text for the
# substrings "example" and "template" to decide which reply to generate.
FOLLOW_UPS: List[str] = [
    "Nice. Can you add 3 concrete examples?",
    "Could you turn that into a step-by-step guide?",
    "What are likely failure modes and mitigations?",
    "How would a beginner apply this in a weekend project?",
    "Please give a small template I can reuse.",
    "How do I measure progress over a month?",
]
# Optional third-turn user messages, used only when a dialogue is extended
# to three exchanges.
DEEPENERS: List[str] = [
    "Great. Any tips to make it resilient under constraints?",
    "How would you adapt this for a small team?",
    "What are ethical considerations I should keep in mind?",
    "What's an underrated practice here and why?",
]
def make_answer(topic: str, opener: str) -> str:
    """Build the first assistant reply: a five-step plan plus common pitfalls.

    Args:
        topic: Subject interpolated into the intro line and the first two steps.
        opener: The user's opening message. Accepted as part of the call
            signature; the generated text does not depend on it.

    Returns:
        A multi-line string made of an intro line, five "- " bullets, and a
        fixed "Common pitfalls" list.
    """
    steps = (
        f"Define your objective in {topic} (outcome, constraints, timeline).",
        f"Map key concepts and tools in {topic}; pick one stack and stick to it for 2-4 weeks.",
        "Practice with a small, scoped project; iterate with feedback.",
        "Track 1-3 metrics that reflect real progress; review weekly.",
        "Document what worked, what didn't, and the next experiment.",
    )
    intro = f"Here's a pragmatic path for {topic} based on your request:\n"
    plan = "\n".join("- " + step for step in steps)
    pitfalls = (
        "\n\nCommon pitfalls:\n"
        "- Starting too big; keep scope small.\n"
        "- Tool hopping; commit to one stack.\n"
        "- No feedback loop; schedule reviews.\n"
        "- Unclear metrics; define success upfront."
    )
    return intro + plan + pitfalls
def make_examples(topic: str) -> str:
    """Return three newline-separated example activities for ``topic``.

    The separator after each "Example N" label is an em dash (U+2014); it had
    been corrupted into mojibake, which ended up verbatim in the dataset.

    Args:
        topic: Subject interpolated into the first two examples.

    Returns:
        Three lines joined with ``"\\n"`` and no trailing newline.
    """
    examples: List[str] = [
        f"Example 1 — 90-minute sprint: Learn one core concept in {topic} and apply it to a toy task.",
        f"Example 2 — Weekend project: Build a tiny demo that proves a single capability in {topic}.",
        "Example 3 — Peer review: Share results, gather feedback, and improve one dimension.",
    ]
    return "\n".join(examples)
def make_template(topic: str) -> str:
    """Return a fill-in-the-blanks one-page plan template for ``topic``.

    The dash in the title line is an em dash (U+2014); it had been corrupted
    into mojibake, which ended up verbatim in the dataset.

    Args:
        topic: Subject interpolated into the title line.

    Returns:
        Seven lines of text, each terminated by ``"\\n"`` (including the last).
    """
    return (
        f"Template — One-page plan for {topic}:\n"
        "Goal: <1-2 sentences>\n"
        "Scope: <what's in / out>\n"
        "Resources: <3 links or docs>\n"
        "Milestones (weekly): <targets>\n"
        "Metrics: <quant/qual measures>\n"
        "Risks & mitigations: <top 3>\n"
    )
def rng_pick(r: random.Random, items: Sequence[str]) -> str:
    """Return one element of ``items`` chosen with the generator ``r``.

    Makes a single ``randrange`` call on ``r``, so the result is reproducible
    for a given generator state. An empty ``items`` makes ``randrange`` raise
    ``ValueError``.
    """
    count = len(items)
    index = r.randrange(0, count)
    return items[index]
def make_dialogue(r: random.Random) -> Dict[str, Any]:
    """Generate one synthetic multi-turn dialogue record.

    The dialogue always has two user/assistant exchanges; with probability
    0.6 a third exchange drawn from ``DEEPENERS`` is appended.

    The follow-up shown to the user is always the same ``u2`` the second
    reply was built from. Previously the two-exchange branch drew a fresh,
    unrelated follow-up, so the stored question and answer could mismatch.
    Dropping that extra draw means a given seed no longer reproduces the old
    output.

    Args:
        r: Random generator that drives every choice.

    Returns:
        A dict with ``"prompt"`` (the first user message), ``"messages"``
        (list of ``{"content", "role"}`` dicts alternating user/assistant),
        and ``"prompt_id"`` (SHA-256 hex digest of the UTF-8 encoded prompt).
    """
    topic = rng_pick(r, TOPICS)
    opener = rng_pick(r, USER_OPENERS).format(topic=topic)
    a1 = make_answer(topic, opener)

    # The second reply is selected by keywords in the follow-up text.
    u2 = rng_pick(r, FOLLOW_UPS)
    u2_lower = u2.lower()
    if "example" in u2_lower:
        a2 = make_examples(topic)
    elif "template" in u2_lower:
        a2 = make_template(topic)
    else:
        a2 = "Here is a step-by-step variant focusing on small wins first, then scope up.\n1) Clarify outcome.\n2) Pick one method.\n3) Build tiny demo.\n4) Review metrics.\n5) Iterate."

    messages: List[Dict[str, str]] = [
        {"content": opener, "role": "user"},
        {"content": a1, "role": "assistant"},
        {"content": u2, "role": "user"},
        {"content": a2, "role": "assistant"},
    ]

    # Extend roughly 60% of dialogues with a third exchange.
    if r.random() < 0.6:
        u3 = rng_pick(r, DEEPENERS)
        a3 = (
            "Resilience tips:\n- Use checklists and pre-mortems.\n- Automate a single weak link each week.\n- Keep a rollback plan.\n- Prefer boring, well-documented tools.\n\nEthics: stay transparent, reduce bias, and respect privacy."
        )
        messages.append({"content": u3, "role": "user"})
        messages.append({"content": a3, "role": "assistant"})

    prompt = messages[0]["content"]
    prompt_id = hashlib.sha256(prompt.encode("utf-8")).hexdigest()
    return {"prompt": prompt, "messages": messages, "prompt_id": prompt_id}
def generate(n: int, seed: int) -> List[Dict[str, Any]]:
    """Produce ``n`` dialogue records from a generator seeded with ``seed``.

    All records share one ``random.Random(seed)`` instance, so the whole list
    is reproducible for a given ``(n, seed)`` pair. A non-positive ``n``
    yields an empty list.
    """
    rng = random.Random(seed)
    dialogues: List[Dict[str, Any]] = []
    for _ in range(n):
        dialogues.append(make_dialogue(rng))
    return dialogues
def main(argv: List[str]) -> int:
    """Parse CLI arguments, generate the dataset, and write it as JSONL.

    Args:
        argv: Command-line arguments without the program name. Recognized
            options are ``--out`` (output path), ``--n`` (number of rows),
            and ``--seed`` (random seed).

    Returns:
        ``0`` once the file has been written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=str, default="sample_data/train_sft.jsonl")
    parser.add_argument("--n", type=int, default=500)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args(argv)

    data = generate(args.n, args.seed)

    out_path = args.out
    # A bare filename such as "train.jsonl" has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError; only create the parent
    # directory when the path actually has one.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_path, "w", encoding="utf-8") as f:
        for row in data:
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    print(f"Wrote {len(data)} rows to {out_path}")
    return 0
if __name__ == "__main__":
    # Run the CLI and use main()'s return value as the process exit status.
    exit_code = main(sys.argv[1:])
    raise SystemExit(exit_code)