|
|
|
""" |
|
Generate a synthetic UltraChat-style SFT dataset. |
|
|
|
Each JSONL line has: |
|
{ |
|
"prompt": "<first user message>", |
|
"messages": [{"content": "...", "role": "user"}, {"content": "...", "role": "assistant"}, ...], |
|
"prompt_id": "<sha256 hex of prompt>" |
|
} |
|
|
|
Usage: |
|
python scripts/generate_ultrachat_sft.py --out sample_data/train_sft.jsonl --n 500 |
|
|
|
Only uses Python standard library. |
|
""" |
|
from __future__ import annotations |
|
|
|
import argparse |
|
import hashlib |
|
import json |
|
import os |
|
import random |
|
import sys |
|
from typing import Any, Dict, List, Sequence |
|
|
|
# Subject areas a synthetic conversation can be about. One entry is drawn per
# dialogue and substituted into the opener and answer templates, so the order
# of this list affects which topic a given seed produces.
TOPICS: List[str] = [
    "travel planning", "cooking and recipes", "software engineering",
    "data science", "machine learning", "mathematics",
    "history", "literature", "productivity",
    "fitness and health", "economics", "marketing",
    "photography", "music theory", "language learning",
    "gardening", "home improvement", "career advice",
    "public speaking", "time management", "networking",
    "resume writing", "interview prep", "cloud computing",
    "kubernetes", "devops", "cybersecurity",
    "robotics", "electronics", "astronomy",
    "climate and environment", "education", "parenting",
    "pet care", "mental wellness", "philosophy",
    "ethics", "UX design", "UI design",
    "copywriting", "game design", "board games",
    "coffee brewing", "tea brewing", "urban planning",
    "transportation", "sports science", "statistics",
]
|
|
|
# Templates for the first user message of a dialogue. Each one contains a
# "{topic}" placeholder that is filled in with str.format(topic=...).
#
# The dash in the "2 hours" opener is written as the escape \u2014 (em dash)
# so the source stays pure ASCII; the literal character had previously been
# corrupted into a stray Greek beta by an encoding round-trip.
USER_OPENERS: List[str] = [
    "Could you help me with {topic}?",
    "What are the key steps to get started with {topic}?",
    "Give me a concise plan to improve at {topic} over 4 weeks.",
    "Explain a few common pitfalls in {topic} and how to avoid them.",
    "Draft a checklist for beginners in {topic}.",
    "Compare two approaches commonly used in {topic} and when to choose each.",
    "I have 2 hours to learn about {topic} today\u2014what should I do?",
    "What metrics matter most in {topic} and how do I track them?",
    "Summarize an actionable framework for {topic} with examples.",
]
|
|
|
# Second-turn user messages. make_dialogue inspects the chosen text for the
# substrings "example" and "template" to decide which canned reply to pair
# with it, so rewording an entry can change which reply is selected.
FOLLOW_UPS: List[str] = [
    "Nice. Can you add 3 concrete examples?",
    "Could you turn that into a step-by-step guide?",
    "What are likely failure modes and mitigations?",
    "How would a beginner apply this in a weekend project?",
    "Please give a small template I can reuse.",
    "How do I measure progress over a month?",
]
|
|
|
# Optional third-turn user messages, used only for the longer dialogues.
DEEPENERS: List[str] = [
    "Great. Any tips to make it resilient under constraints?",
    "How would you adapt this for a small team?",
    "What are ethical considerations I should keep in mind?",
    "What's an underrated practice here and why?",
]
|
|
|
def make_answer(topic: str, opener: str) -> str:
    """Compose the assistant's first reply for a dialogue about ``topic``.

    The reply is an intro line, five "- " bullet steps, and a fixed
    "Common pitfalls" section.

    Args:
        topic: Subject area interpolated into the intro and the first two steps.
        opener: The user's opening message. It is part of the signature but
            does not influence the generated text.

    Returns:
        The full multi-line answer, without a trailing newline.
    """
    steps = (
        f"Define your objective in {topic} (outcome, constraints, timeline).",
        f"Map key concepts and tools in {topic}; pick one stack and stick to it for 2-4 weeks.",
        "Practice with a small, scoped project; iterate with feedback.",
        "Track 1-3 metrics that reflect real progress; review weekly.",
        "Document what worked, what didn't, and the next experiment.",
    )
    intro = f"Here's a pragmatic path for {topic} based on your request:\n"
    step_block = "\n".join("- " + step for step in steps)
    pitfalls = (
        "\n\nCommon pitfalls:\n"
        "- Starting too big; keep scope small.\n"
        "- Tool hopping; commit to one stack.\n"
        "- No feedback loop; schedule reviews.\n"
        "- Unclear metrics; define success upfront."
    )
    return intro + step_block + pitfalls
|
|
|
|
|
def make_examples(topic: str) -> str:
    """Return three one-line example activities for ``topic``.

    The separator after each "Example N" label is an em dash, written as the
    escape \\u2014 so the source stays ASCII. The literal character had been
    corrupted into a stray Greek beta, which then leaked into the generated
    dataset.

    Args:
        topic: Subject area interpolated into the first two examples.

    Returns:
        The three examples joined with newlines, without a trailing newline.
    """
    examples: List[str] = [
        f"Example 1 \u2014 90-minute sprint: Learn one core concept in {topic} and apply it to a toy task.",
        f"Example 2 \u2014 Weekend project: Build a tiny demo that proves a single capability in {topic}.",
        # The third example is topic-independent, so it needs no f-string.
        "Example 3 \u2014 Peer review: Share results, gather feedback, and improve one dimension.",
    ]
    return "\n".join(examples)
|
|
|
|
|
def make_template(topic: str) -> str:
    """Return a reusable one-page plan template for ``topic``.

    The dash in the header is an em dash, written as the escape \\u2014 so the
    source stays ASCII. The literal character had been corrupted into a stray
    Greek beta.

    Args:
        topic: Subject area interpolated into the header line.

    Returns:
        A header plus six "Field: <placeholder>" lines, each terminated by a
        newline (the result ends with a trailing newline).
    """
    header = f"Template \u2014 One-page plan for {topic}:\n"
    # Only the header depends on the topic; the fields are plain literals.
    fields = (
        "Goal: <1-2 sentences>\n"
        "Scope: <what's in / out>\n"
        "Resources: <3 links or docs>\n"
        "Milestones (weekly): <targets>\n"
        "Metrics: <quant/qual measures>\n"
        "Risks & mitigations: <top 3>\n"
    )
    return header + fields
|
|
|
|
|
def rng_pick(r: random.Random, items: Sequence[str]) -> str:
    """Return one element of ``items`` chosen with the generator ``r``.

    Args:
        r: Random generator that supplies the index.
        items: Candidates to choose from.

    Returns:
        The element at an index drawn by ``r.randrange(0, len(items))``.
    """
    index = r.randrange(0, len(items))
    chosen = items[index]
    return chosen
|
|
|
|
|
def make_dialogue(r: random.Random) -> Dict[str, Any]:
    """Build one synthetic multi-turn conversation record.

    Every dialogue has an opener/answer pair and a follow-up/answer pair.
    With probability 0.6 a third pair is appended (a "deepener" question and
    a fixed resilience/ethics reply).

    The follow-up shown to the user is always the one the second answer was
    generated for. An earlier version drew a second, unrelated follow-up for
    the two-turn case, so the assistant could answer a question the user never
    asked. Dropping that draw means two-turn dialogues consume one fewer
    random value than before, so output for a given seed differs from the
    earlier version.

    Args:
        r: Random generator driving all choices. Draw order is: topic,
            opener template, follow-up, length coin flip, then (only for
            three-turn dialogues) the deepener.

    Returns:
        A dict with keys ``"prompt"`` (the first user message), ``"messages"``
        (a list of ``{"content", "role"}`` dicts alternating user/assistant),
        and ``"prompt_id"`` (SHA-256 hex digest of the UTF-8 encoded prompt).
    """
    topic = rng_pick(r, TOPICS)
    opener = rng_pick(r, USER_OPENERS).format(topic=topic)
    a1 = make_answer(topic, opener)

    # Choose the second answer from keywords in the follow-up text. Follow-ups
    # mentioning neither "example" nor "template" get the generic
    # step-by-step reply.
    u2 = rng_pick(r, FOLLOW_UPS)
    u2_lower = u2.lower()
    if "example" in u2_lower:
        a2 = make_examples(topic)
    elif "template" in u2_lower:
        a2 = make_template(topic)
    else:
        a2 = "Here is a step-by-step variant focusing on small wins first, then scope up.\n1) Clarify outcome.\n2) Pick one method.\n3) Build tiny demo.\n4) Review metrics.\n5) Iterate."

    # Both dialogue lengths share these four messages; u2 is reused so the
    # question always matches the answer a2.
    messages: List[Dict[str, str]] = [
        {"content": opener, "role": "user"},
        {"content": a1, "role": "assistant"},
        {"content": u2, "role": "user"},
        {"content": a2, "role": "assistant"},
    ]

    if r.random() < 0.6:
        u3 = rng_pick(r, DEEPENERS)
        a3 = (
            "Resilience tips:\n- Use checklists and pre-mortems.\n- Automate a single weak link each week.\n- Keep a rollback plan.\n- Prefer boring, well-documented tools.\n\nEthics: stay transparent, reduce bias, and respect privacy."
        )
        messages.append({"content": u3, "role": "user"})
        messages.append({"content": a3, "role": "assistant"})

    prompt = messages[0]["content"]
    prompt_id = hashlib.sha256(prompt.encode("utf-8")).hexdigest()
    return {"prompt": prompt, "messages": messages, "prompt_id": prompt_id}
|
|
|
|
|
def generate(n: int, seed: int) -> List[Dict[str, Any]]:
    """Produce ``n`` dialogue records from a generator seeded with ``seed``.

    Args:
        n: Number of records to build; zero or a negative value yields an
            empty list.
        seed: Seed for the single ``random.Random`` shared by all records.

    Returns:
        The records in generation order.
    """
    rng = random.Random(seed)
    rows: List[Dict[str, Any]] = []
    for _ in range(n):
        rows.append(make_dialogue(rng))
    return rows
|
|
|
|
|
def main(argv: List[str]) -> int:
    """Parse command-line options, generate the dataset, and write it as JSONL.

    Args:
        argv: Command-line arguments, excluding the program name.

    Returns:
        0 after the file has been written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--out",
        type=str,
        default="sample_data/train_sft.jsonl",
        help="path of the JSONL file to write",
    )
    parser.add_argument("--n", type=int, default=500, help="number of dialogues to generate")
    parser.add_argument("--seed", type=int, default=42, help="seed for the random generator")
    args = parser.parse_args(argv)

    data = generate(args.n, args.seed)
    out_path = args.out

    # A bare filename such as "train.jsonl" has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the path actually names one.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # ensure_ascii=False keeps non-ASCII characters readable in the output,
    # which is why the file is opened explicitly as UTF-8.
    with open(out_path, "w", encoding="utf-8") as f:
        for row in data:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    print(f"Wrote {len(data)} rows to {out_path}")
    return 0
|
|
|
|
|
# Script entry point: run main() on the arguments after the program name and
# use its return value as the process exit status.
if __name__ == "__main__":
    exit_code = main(sys.argv[1:])
    raise SystemExit(exit_code)
|
|