File size: 6,893 Bytes
7ecd130 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
#!/usr/bin/env python3
"""
Generate a synthetic UltraChat-style SFT dataset.
Each JSONL line has:
{
"prompt": "<first user message>",
"messages": [{"content": "...", "role": "user"}, {"content": "...", "role": "assistant"}, ...],
"prompt_id": "<sha256 hex of prompt>"
}
Usage:
python scripts/generate_ultrachat_sft.py --out sample_data/train_sft.jsonl --n 500
Only uses Python standard library.
"""
from __future__ import annotations
import argparse
import hashlib
import json
import os
import random
import sys
from typing import Any, Dict, List, Sequence
# Subject areas for the synthetic dialogues. make_dialogue() draws one entry
# per dialogue and substitutes it into the "{topic}" placeholder of the opener
# template and into the generated assistant replies.
TOPICS = [
"travel planning",
"cooking and recipes",
"software engineering",
"data science",
"machine learning",
"mathematics",
"history",
"literature",
"productivity",
"fitness and health",
"economics",
"marketing",
"photography",
"music theory",
"language learning",
"gardening",
"home improvement",
"career advice",
"public speaking",
"time management",
"networking",
"resume writing",
"interview prep",
"cloud computing",
"kubernetes",
"devops",
"cybersecurity",
"robotics",
"electronics",
"astronomy",
"climate and environment",
"education",
"parenting",
"pet care",
"mental wellness",
"philosophy",
"ethics",
"UX design",
"UI design",
"copywriting",
"game design",
"board games",
"coffee brewing",
"tea brewing",
"urban planning",
"transportation",
"sports science",
"statistics",
]
# Templates for the first user message. Each contains a "{topic}" placeholder
# that make_dialogue() fills via str.format(topic=...); the formatted opener
# also becomes the record's "prompt" field.
USER_OPENERS = [
"Could you help me with {topic}?",
"What are the key steps to get started with {topic}?",
"Give me a concise plan to improve at {topic} over 4 weeks.",
"Explain a few common pitfalls in {topic} and how to avoid them.",
"Draft a checklist for beginners in {topic}.",
"Compare two approaches commonly used in {topic} and when to choose each.",
"I have 2 hours to learn about {topic} today—what should I do?",
"What metrics matter most in {topic} and how do I track them?",
"Summarize an actionable framework for {topic} with examples.",
]
# Candidate second user turns. NOTE: make_dialogue() selects the assistant's
# reply by case-insensitive substring match on these strings ("example" ->
# make_examples, "template" -> make_template, anything else -> a fixed
# step-by-step reply), so rewording an entry can change which reply it gets.
FOLLOW_UPS = [
"Nice. Can you add 3 concrete examples?",
"Could you turn that into a step-by-step guide?",
"What are likely failure modes and mitigations?",
"How would a beginner apply this in a weekend project?",
"Please give a small template I can reuse.",
"How do I measure progress over a month?",
]
# Candidate third user turns. make_dialogue() appends one of these (followed
# by a fixed assistant reply) only when its random draw falls below 0.6.
DEEPENERS = [
"Great. Any tips to make it resilient under constraints?",
"How would you adapt this for a small team?",
"What are ethical considerations I should keep in mind?",
"What's an underrated practice here and why?",
]
def make_answer(topic: str, opener: str) -> str:
    """Build the assistant's first reply: a bulleted plan followed by pitfalls.

    Args:
        topic: Subject interpolated into the header and the first two bullets.
        opener: The user's opening message. Accepted for signature
            compatibility; it does not influence the generated text.

    Returns:
        A multi-line string: header line, five "- " bullets, a blank line,
        and a fixed "Common pitfalls" list (no trailing newline).
    """
    header = f"Here's a pragmatic path for {topic} based on your request:\n"
    steps = (
        f"Define your objective in {topic} (outcome, constraints, timeline).",
        f"Map key concepts and tools in {topic}; pick one stack and stick to it for 2-4 weeks.",
        "Practice with a small, scoped project; iterate with feedback.",
        "Track 1-3 metrics that reflect real progress; review weekly.",
        "Document what worked, what didn't, and the next experiment.",
    )
    pitfalls = (
        "\n\nCommon pitfalls:\n"
        "- Starting too big; keep scope small.\n"
        "- Tool hopping; commit to one stack.\n"
        "- No feedback loop; schedule reviews.\n"
        "- Unclear metrics; define success upfront."
    )
    plan = "\n".join("- " + step for step in steps)
    return header + plan + pitfalls
def make_examples(topic: str) -> str:
    """Return three numbered example activities for ``topic``, one per line.

    The first two lines mention ``topic``; the third is fixed text. The result
    has no trailing newline.
    """
    sprint = f"Example 1 — 90-minute sprint: Learn one core concept in {topic} and apply it to a toy task."
    weekend = f"Example 2 — Weekend project: Build a tiny demo that proves a single capability in {topic}."
    review = "Example 3 — Peer review: Share results, gather feedback, and improve one dimension."
    return f"{sprint}\n{weekend}\n{review}"
def make_template(topic: str) -> str:
    """Return a fill-in-the-blanks one-page plan template for ``topic``.

    Only the title line mentions ``topic``; the remaining field lines are
    fixed. Every line, including the last, ends with a newline.
    """
    rows = [
        f"Template — One-page plan for {topic}:",
        "Goal: <1-2 sentences>",
        "Scope: <what's in / out>",
        "Resources: <3 links or docs>",
        "Milestones (weekly): <targets>",
        "Metrics: <quant/qual measures>",
        "Risks & mitigations: <top 3>",
    ]
    return "".join(row + "\n" for row in rows)
def rng_pick(r: random.Random, items: Sequence[str]) -> str:
    """Return one element of ``items`` chosen with the generator ``r``.

    The index comes from a single ``r.randrange`` call over
    ``[0, len(items))``, so an empty ``items`` raises ``ValueError``.
    """
    index = r.randrange(len(items))
    return items[index]
def make_dialogue(r: random.Random) -> Dict[str, Any]:
    """Generate one synthetic multi-turn dialogue record.

    Draws a topic and an opener template, then builds a two-turn exchange
    (opener/answer, follow-up/answer). When a random draw falls below 0.6, a
    third exchange taken from ``DEEPENERS`` is appended.

    Args:
        r: Random generator that drives every choice; the same generator
            state yields the same dialogue.

    Returns:
        A dict with keys ``"prompt"`` (the first user message), ``"messages"``
        (list of ``{"content", "role"}`` dicts alternating user/assistant) and
        ``"prompt_id"`` (SHA-256 hex digest of the UTF-8 encoded prompt).
    """
    topic = rng_pick(r, TOPICS)
    opener = rng_pick(r, USER_OPENERS).format(topic=topic)
    a1 = make_answer(topic, opener)

    # The second assistant reply is keyed on the wording of the follow-up.
    u2 = rng_pick(r, FOLLOW_UPS)
    u2_lower = u2.lower()
    if "example" in u2_lower:
        a2 = make_examples(topic)
    elif "template" in u2_lower:
        a2 = make_template(topic)
    else:
        a2 = "Here is a step-by-step variant focusing on small wins first, then scope up.\n1) Clarify outcome.\n2) Pick one method.\n3) Build tiny demo.\n4) Review metrics.\n5) Iterate."

    # Always pair a2 with the follow-up it was generated for. Previously the
    # two-turn case drew a fresh follow-up here, so the reply could answer a
    # different question than the one shown. Dropping that extra draw means
    # output for a given seed differs from earlier versions of this script.
    messages: List[Dict[str, str]] = [
        {"content": opener, "role": "user"},
        {"content": a1, "role": "assistant"},
        {"content": u2, "role": "user"},
        {"content": a2, "role": "assistant"},
    ]

    if r.random() < 0.6:
        u3 = rng_pick(r, DEEPENERS)
        a3 = (
            "Resilience tips:\n- Use checklists and pre-mortems.\n- Automate a single weak link each week.\n- Keep a rollback plan.\n- Prefer boring, well-documented tools.\n\nEthics: stay transparent, reduce bias, and respect privacy."
        )
        messages.append({"content": u3, "role": "user"})
        messages.append({"content": a3, "role": "assistant"})

    prompt = messages[0]["content"]
    prompt_id = hashlib.sha256(prompt.encode("utf-8")).hexdigest()
    return {"prompt": prompt, "messages": messages, "prompt_id": prompt_id}
def generate(n: int, seed: int) -> List[Dict[str, Any]]:
    """Produce ``n`` dialogue records from a generator seeded with ``seed``.

    All records are drawn sequentially from one ``random.Random(seed)``
    instance, so the same ``(n, seed)`` pair reproduces the same list.
    """
    rng = random.Random(seed)
    dialogues: List[Dict[str, Any]] = []
    for _ in range(n):
        dialogues.append(make_dialogue(rng))
    return dialogues
def main(argv: List[str]) -> int:
    """Parse command-line options, generate the dataset and write it as JSONL.

    Args:
        argv: Command-line arguments without the program name. Recognised
            options are ``--out`` (output path), ``--n`` (row count) and
            ``--seed`` (random seed).

    Returns:
        ``0`` after the file has been written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=str, default="sample_data/train_sft.jsonl")
    parser.add_argument("--n", type=int, default=500)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args(argv)

    data = generate(args.n, args.seed)
    out_path = args.out

    # os.path.dirname() returns "" for a bare filename such as "out.jsonl",
    # and os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_path, "w", encoding="utf-8") as f:
        for row in data:
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
    print(f"Wrote {len(data)} rows to {out_path}")
    return 0
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
|