#!/usr/bin/env python3
"""
Generate a synthetic UltraChat-style SFT dataset.

Each JSONL line has:
{
  "prompt": "<first user message>",
  "messages": [{"content": "...", "role": "user"}, {"content": "...", "role": "assistant"}, ...],
  "prompt_id": "<sha256 hex of prompt>"
}

Usage:
  python scripts/generate_ultrachat_sft.py --out sample_data/train_sft.jsonl --n 500

Uses only the Python standard library.
"""
from __future__ import annotations

import argparse
import hashlib
import json
import os
import random
import sys
from typing import Any, Dict, List, Sequence

TOPICS = [
    "travel planning",
    "cooking and recipes",
    "software engineering",
    "data science",
    "machine learning",
    "mathematics",
    "history",
    "literature",
    "productivity",
    "fitness and health",
    "economics",
    "marketing",
    "photography",
    "music theory",
    "language learning",
    "gardening",
    "home improvement",
    "career advice",
    "public speaking",
    "time management",
    "networking",
    "resume writing",
    "interview prep",
    "cloud computing",
    "kubernetes",
    "devops",
    "cybersecurity",
    "robotics",
    "electronics",
    "astronomy",
    "climate and environment",
    "education",
    "parenting",
    "pet care",
    "mental wellness",
    "philosophy",
    "ethics",
    "UX design",
    "UI design",
    "copywriting",
    "game design",
    "board games",
    "coffee brewing",
    "tea brewing",
    "urban planning",
    "transportation",
    "sports science",
    "statistics",
]

USER_OPENERS = [
    "Could you help me with {topic}?",
    "What are the key steps to get started with {topic}?",
    "Give me a concise plan to improve at {topic} over 4 weeks.",
    "Explain a few common pitfalls in {topic} and how to avoid them.",
    "Draft a checklist for beginners in {topic}.",
    "Compare two approaches commonly used in {topic} and when to choose each.",
    "I have 2 hours to learn about {topic} today—what should I do?",
    "What metrics matter most in {topic} and how do I track them?",
    "Summarize an actionable framework for {topic} with examples.",
]

FOLLOW_UPS = [
    "Nice. Can you add 3 concrete examples?",
    "Could you turn that into a step-by-step guide?",
    "What are likely failure modes and mitigations?",
    "How would a beginner apply this in a weekend project?",
    "Please give a small template I can reuse.",
    "How do I measure progress over a month?",
]

DEEPENERS = [
    "Great. Any tips to make it resilient under constraints?",
    "How would you adapt this for a small team?",
    "What are ethical considerations I should keep in mind?",
    "What's an underrated practice here and why?",
]

def make_answer(topic: str) -> str:
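    """Return a bulleted starter plan for the topic followed by a list of common pitfalls."""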
    bullets: List[str] = [
        f"Define your objective in {topic} (outcome, constraints, timeline).",
        f"Map key concepts and tools in {topic}; pick one stack and stick to it for 2-4 weeks.",
        f"Practice with a small, scoped project; iterate with feedback.",
        f"Track 1-3 metrics that reflect real progress; review weekly.",
        f"Document what worked, what didn't, and the next experiment.",
    ]
    return (
        f"Here's a pragmatic path for {topic} based on your request:\n" +
        "\n".join([f"- {b}" for b in bullets]) +
        "\n\nCommon pitfalls:\n- Starting too big; keep scope small.\n- Tool hopping; commit to one stack.\n- No feedback loop; schedule reviews.\n- Unclear metrics; define success upfront."
    )


def make_examples(topic: str) -> str:
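    """Return three short, concrete example activities for the topic, one per line."""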
    ex: List[str] = [
        f"Example 1 — 90-minute sprint: Learn one core concept in {topic} and apply it to a toy task.",
        f"Example 2 — Weekend project: Build a tiny demo that proves a single capability in {topic}.",
        f"Example 3 — Peer review: Share results, gather feedback, and improve one dimension.",
    ]
    return "\n".join(ex)


def make_template(topic: str) -> str:
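    """Return a reusable one-page planning template filled in with the topic."""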
    return (
        f"Template — One-page plan for {topic}:\n"
        f"Goal: <1-2 sentences>\n"
        f"Scope: <what's in / out>\n"
        f"Resources: <3 links or docs>\n"
        f"Milestones (weekly): <targets>\n"
        f"Metrics: <quant/qual measures>\n"
        f"Risks & mitigations: <top 3>\n"
    )


def rng_pick(r: random.Random, items: Sequence[str]) -> str:
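    """Return a uniformly random item from items using the supplied Random instance."""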
    return items[r.randrange(0, len(items))]


def make_dialogue(r: random.Random) -> Dict[str, Any]:
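    """Assemble one synthetic dialogue: two exchanges, or three with 60% probability."""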
    topic = rng_pick(r, TOPICS)
    opener_tpl = rng_pick(r, USER_OPENERS)
    opener = opener_tpl.format(topic=topic)

    a1 = make_answer(topic)

    u2 = rng_pick(r, FOLLOW_UPS)
    a2 = make_examples(topic) if "example" in u2.lower() else (
        make_template(topic) if "template" in u2.lower() else
        "Here is a step-by-step variant focusing on small wins first, then scope up.\n1) Clarify outcome.\n2) Pick one method.\n3) Build tiny demo.\n4) Review metrics.\n5) Iterate."
    )

    if r.random() < 0.6:
        u3 = rng_pick(r, DEEPENERS)
        a3 = (
            "Resilience tips:\n- Use checklists and pre-mortems.\n- Automate a single weak link each week.\n- Keep a rollback plan.\n- Prefer boring, well-documented tools.\n\nEthics: stay transparent, reduce bias, and respect privacy."
        )
        messages: List[Dict[str, str]] = [
            {"content": opener, "role": "user"},
            {"content": a1, "role": "assistant"},
            {"content": u2, "role": "user"},
            {"content": a2, "role": "assistant"},
            {"content": u3, "role": "user"},
            {"content": a3, "role": "assistant"},
        ]
    else:
        messages = [
            {"content": opener, "role": "user"},
            {"content": a1, "role": "assistant"},
            {"content": rng_pick(r, FOLLOW_UPS), "role": "user"},
            {"content": a2, "role": "assistant"},
        ]

    prompt = messages[0]["content"]
    prompt_id = hashlib.sha256(prompt.encode("utf-8")).hexdigest()
    return {"prompt": prompt, "messages": messages, "prompt_id": prompt_id}


def generate(n: int, seed: int) -> List[Dict[str, Any]]:
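    """Create n dialogues from a seeded RNG, so output is reproducible for a given seed."""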
    r = random.Random(seed)
    return [make_dialogue(r) for _ in range(n)]


def main(argv: List[str]) -> int:
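    """Parse CLI arguments, generate the dataset, and write it to --out as JSONL."""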
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=str, default="sample_data/train_sft.jsonl")
    parser.add_argument("--n", type=int, default=500)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args(argv)

    data = generate(args.n, args.seed)
    out_path = args.out
    out_dir = os.path.dirname(out_path)
    if out_dir:  # --out may be a bare filename; os.makedirs("") would raise
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        for row in data:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    print(f"Wrote {len(data)} rows to {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))