ndc8 commited on
Commit
7ecd130
·
1 Parent(s): 91181f3

Add scripts for converting and generating UltraChat-style SFT dataset

Browse files

- Implement `convert_ultrachat_to_text.py` to convert UltraChat records into a single text format for training.
- Implement `generate_ultrachat_sft.py` to generate synthetic UltraChat-style SFT datasets with customizable parameters.

sample_data/test_sft.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
sample_data/test_sft_text.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
sample_data/train_sft.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
sample_data/train_sft_curated.jsonl ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"prompt":"Design a robust retry strategy for a REST client calling a flaky upstream API.","messages":[{"role":"user","content":"Design a robust retry strategy for a REST client calling a flaky upstream API."},{"role":"assistant","content":"Use bounded exponential backoff with jitter, a small max retry cap, and timeouts that fit your SLO. Retry only on idempotent requests and transient failures (5xx, timeouts, connection reset). Add circuit breaker to fail fast during sustained outages and surface metrics (latency, error rate, retry count). Include per-request deadlines to avoid unbounded tail latency and propagate correlation IDs for tracing."},{"role":"user","content":"What jitter mode should I choose?"},{"role":"assistant","content":"Prefer 'full jitter' or 'decorrelated jitter' to spread retry storms. Full jitter picks a random delay in [0, base*2^n] each attempt; it's simple and effective."}],"prompt_id":"cura0001"}
2
+ {"prompt":"I need a quick primer: SQL window functions for running totals and rankings.","messages":[{"role":"user","content":"I need a quick primer: SQL window functions for running totals and rankings."},{"role":"assistant","content":"Running totals: SUM(amount) OVER (PARTITION BY account ORDER BY ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW). Rankings: use ROW_NUMBER(), RANK(), DENSE_RANK() with ORDER BY inside OVER(). Combine PARTITION BY to reset per group. Avoid mixing ORDER BY in SELECT and window frames incorrectly; rely on explicit frames for deterministic results."},{"role":"user","content":"How do I do top-1 per group?"},{"role":"assistant","content":"Use ROW_NUMBER() OVER (PARTITION BY grp ORDER BY score DESC) AS rn then filter rn = 1 in an outer query. It's clearer and often faster than DISTINCT ON across engines."}],"prompt_id":"cura0002"}
3
+ {"prompt":"Explain the difference between blue/green and canary deployments.","messages":[{"role":"user","content":"Explain the difference between blue/green and canary deployments."},{"role":"assistant","content":"Blue/green runs two production environments (blue, green) and flips all traffic from old to new at once after validation. Canary gradually shifts a small percentage of traffic to the new version, monitoring metrics and increasing if healthy. Blue/green optimizes for fast rollback; canary optimizes for risk reduction and incremental validation under real load."},{"role":"user","content":"When would you favor blue/green?"},{"role":"assistant","content":"When you need instant rollback and can afford duplicate infrastructure, or when state/data migrations are handled atomically and validated before the flip."}],"prompt_id":"cura0003"}
4
+ {"prompt":"My Kafka consumer lags spike occasionally. What should I check first?","messages":[{"role":"user","content":"My Kafka consumer lags spike occasionally. What should I check first?"},{"role":"assistant","content":"Check: (1) processing time vs max.poll.interval.ms; (2) batch size and max.poll.records; (3) consumer group rebalances (unstable partitions, frequent membership changes); (4) broker/network throughput; (5) GC pauses. Add metrics (poll latency, process time, commit time) and consider increasing concurrency (more partitions or parallelism within a consumer) if single-threaded processing is the bottleneck."},{"role":"user","content":"Commits seem slow."},{"role":"assistant","content":"Use async commits batched per processed batch, ensure idempotent sinks, and avoid committing too frequently. If using transactions/EOS, ensure transaction timeouts and producer buffers aren’t constraining throughput."}],"prompt_id":"cura0004"}
5
+ {"prompt":"How do I structure a FastAPI project for maintainability?","messages":[{"role":"user","content":"How do I structure a FastAPI project for maintainability?"},{"role":"assistant","content":"Organize by domain: routers/ (API), schemas/ (Pydantic models), services/ (business logic), repositories/ (DB access), core/ (settings, security), and tests/. Keep dependencies pointing inward (API -> services -> repos). Use dependency injection (Depends) sparingly for explicitness, and centralize exception handlers and middlewares. Add pre-commit hooks (black, isort, ruff, mypy)."},{"role":"user","content":"Where should validation live?"},{"role":"assistant","content":"Put request shape/type validation in Pydantic models, cross-field business rules in services. Keep DB constraints for integrity as a final guard."}],"prompt_id":"cura0005"}
6
+ {"prompt":"We need a feature flag system with gradual rollout and stickiness.","messages":[{"role":"user","content":"We need a feature flag system with gradual rollout and stickiness."},{"role":"assistant","content":"Use server-evaluated flags with a rules engine: target by attributes (country, app_version), percentage rollouts with consistent hashing on a stable key (user_id) for stickiness, and environment separation. Provide a kill switch, audit logs, and SDK caches with TTL. Evaluate flags at request time for dynamic behavior, but cache decisions for performance when safe."},{"role":"user","content":"How do I avoid config debt?"},{"role":"assistant","content":"Set TTLs on flags, require owner, add CI checks to block stale flags beyond a certain age, and run periodic cleanup sprints."}],"prompt_id":"cura0006"}
7
+ {"prompt":"Design an S3 + Lambda image thumbnailer with idempotency and a DLQ.","messages":[{"role":"user","content":"Design an S3 + Lambda image thumbnailer with idempotency and a DLQ."},{"role":"assistant","content":"S3 ObjectCreated -> Lambda validates content-type, generates thumbnails, stores under thumbnails/{key}. Use ETag + key as idempotency key in DynamoDB to avoid duplicates. Configure Lambda retries with exponential backoff and an SQS DLQ for poison images. Emit metrics/logs. Use memory tuned for sharp/imagelib performance, and consider Provisioned Concurrency for cold start-sensitive paths."},{"role":"user","content":"How to prevent reprocessing on overwrite?"},{"role":"assistant","content":"Use object versioning and process only the latest version; compare stored ETag in your idempotency record to skip already-processed versions."}],"prompt_id":"cura0007"}
8
+ {"prompt":"Make a small, readable Git commit message template for a team.","messages":[{"role":"user","content":"Make a small, readable Git commit message template for a team."},{"role":"assistant","content":"Template:\n\n<type>(<scope>): <short summary>\n\nWhy:\n- <reason/impact>\n\nWhat:\n- <key changes>\n\nNotes:\n- <breaking changes/migrations>\n\nTypes: feat, fix, chore, docs, refactor, test, perf, ci. Keep subject <= 72 chars; use imperative mood."},{"role":"user","content":"Do we need ticket IDs?"},{"role":"assistant","content":"If your tracker is the source of truth, include the ID (e.g., PROJ-123) in the subject or a footer, but keep the summary readable."}],"prompt_id":"cura0008"}
9
+ {"prompt":"Best practices for paginating a write-heavy Postgres list endpoint.","messages":[{"role":"user","content":"Best practices for paginating a write-heavy Postgres list endpoint."},{"role":"assistant","content":"Use cursor pagination by (created_at, id) with a composite index matching the order. Avoid OFFSET. For consistent results under concurrent writes, use WHERE (created_at, id) < (:created_at, :id) for DESC pages. Limit payload size, select necessary columns, and consider keyset pagination helpers. Document forward/backward navigation and next/prev cursors."},{"role":"user","content":"How to support filtering too?"},{"role":"assistant","content":"Include filter fields in the index or accept slightly less efficient filtered scans. For highly selective filters, build dedicated indexes. Keep cursor encoding including relevant filter context to avoid mismatches when resuming."}],"prompt_id":"cura0009"}
10
+ {"prompt":"We’re seeing OOMKilled in Kubernetes—how to set resource requests/limits?","messages":[{"role":"user","content":"We’re seeing OOMKilled in Kubernetes—how to set resource requests/limits?"},{"role":"assistant","content":"Profile the app to find steady-state and peak memory. Set requests to typical steady-state plus headroom; set limits high enough to avoid false kills but low enough to protect the node. Use VPA recommendations as a baseline, enable memory QoS (if available), and add liveness/readiness probes that account for GC or warm-up phases. Monitor container_memory_working_set_bytes and eviction signals."},{"role":"user","content":"What about CPU throttling?"},{"role":"assistant","content":"Avoid setting CPU limits unless needed; prefer requests that allow bursts. If you must set limits, ensure they match performance SLAs and test under load to detect throttling-induced latency."}],"prompt_id":"cura0010"}
11
+ {"prompt":"How can I reduce LLM hallucinations in a production Q&A bot?","messages":[{"role":"user","content":"How can I reduce LLM hallucinations in a production Q&A bot?"},{"role":"assistant","content":"Use retrieval-augmented generation (RAG) with high-quality embeddings and chunking; prompt the model to answer strictly from retrieved context and cite sources. Add a post-generation verifier for facts and a fallback of 'I don't know' when confidence is low. Constrain with structured prompts, enable content filters, and log+review flagged responses for continual improvement."},{"role":"user","content":"Any retrieval tips?"},{"role":"assistant","content":"Use domain-tuned embeddings, overlap chunks, and hybrid search (BM25 + vector). Deduplicate and normalize docs; prefer fresh, authoritative sources."}],"prompt_id":"cura0011"}
12
+ {"prompt":"What’s the difference between idempotency keys and deduplication at-least-once systems?","messages":[{"role":"user","content":"What’s the difference between idempotency keys and deduplication at-least-once systems?"},{"role":"assistant","content":"Idempotency keys ensure repeating the same request produces the same effect (client or server enforces one-time processing). Deduplication removes duplicate messages at the broker/consumer level. Use both: idempotency for side-effect safety at the app boundary and dedupe for transport-level resilience. Store idempotency records with TTL and conflict checks."},{"role":"user","content":"Where to store keys?"},{"role":"assistant","content":"In a durable, transactional store close to the operation (DB table or cache with write-through). Index by key and created_at for expiry sweeps."}],"prompt_id":"cura0012"}
13
+ {"prompt":"How do I do structured logging well in a Node.js service?","messages":[{"role":"user","content":"How do I do structured logging well in a Node.js service?"},{"role":"assistant","content":"Use a JSON logger (pino, bunyan) with a consistent schema: level, msg, timestamp, service, trace_id, span_id, user_id (if relevant). Avoid logging PII; use redaction. Log at INFO for key events, WARN for recoverable anomalies, ERROR for failures; DEBUG locally. Include context in logs, not in message strings. Forward to a central system with retention and indexing policies."},{"role":"user","content":"Any perf concerns?"},{"role":"assistant","content":"Use async logging and avoid heavy formatting. Benchmark; pino with transports disabled in-process is fast. Offload shipping to a sidecar/agent where possible."}],"prompt_id":"cura0013"}
14
+ {"prompt":"Guidance for writing maintainable Terraform modules.","messages":[{"role":"user","content":"Guidance for writing maintainable Terraform modules."},{"role":"assistant","content":"Keep inputs minimal and typed with validation. Provide sane defaults and outputs. Use versioned modules with semantic versioning and a CHANGELOG. Document variables and examples. Avoid deep module nesting; prefer composition in stacks. Use 'for_each' and 'depends_on' judiciously. Add linting (tflint), formatting, and basic tests (terraform validate, kitchen/terratest)."},{"role":"user","content":"How to handle breaking changes?"},{"role":"assistant","content":"Release a major version, provide migration notes, and support deprecation windows where possible. Consider a compatibility module temporarily if needed."}],"prompt_id":"cura0014"}
15
+ {"prompt":"How to design an analytics event schema that’s future-proof?","messages":[{"role":"user","content":"How to design an analytics event schema that’s future-proof?"},{"role":"assistant","content":"Define a core envelope (event_name, user_id, timestamp, context) and a 'properties' object for event-specific fields. Use stable, documented names and types. Add schema versioning and a registry. Enforce validation at ingestion (DDL, JSON Schema). Support nullable/optional fields for evolution, avoid breaking renames. Include privacy metadata (PII flags) and data retention policies."},{"role":"user","content":"How to evolve safely?"},{"role":"assistant","content":"Use additive changes by default; deprecate, then remove after a window. Provide backfills or derived fields to keep downstreams working."}],"prompt_id":"cura0015"}
16
+ {"prompt":"I need a resilient webhook consumer design for Stripe events.","messages":[{"role":"user","content":"I need a resilient webhook consumer design for Stripe events."},{"role":"assistant","content":"Verify signatures on receipt, enqueue to a durable queue, process with idempotency keyed by event.id, and wrap side effects in transactions. Retry with backoff; send poison messages to a DLQ. Expose metrics (throughput, failures) and dashboards. Keep handlers idempotent and avoid long-running logic in the HTTP handler."},{"role":"user","content":"How many retries?"},{"role":"assistant","content":"Use a small bounded number (e.g., 5) with exponential backoff; beyond that, send to DLQ and alert humans."}],"prompt_id":"cura0016"}
17
+ {"prompt":"C# async/await common pitfalls in ASP.NET Core.","messages":[{"role":"user","content":"C# async/await common pitfalls in ASP.NET Core."},{"role":"assistant","content":"Avoid async void except for event handlers; prefer Task-returning methods. Don’t block on async (.Result/.Wait()) to prevent deadlocks. Propagate cancellation tokens down the stack. Pool HttpClient via IHttpClientFactory. Beware context capture; use ConfigureAwait(false) in libraries. Measure thread pool starvation; use async all the way."},{"role":"user","content":"How to time-bound external calls?"},{"role":"assistant","content":"Use CancellationToken with per-call timeouts (HttpClient policies) and fallback logic using Polly (retry, circuit breaker)."}],"prompt_id":"cura0017"}
18
+ {"prompt":"What’s a practical approach to rate limiting in an API gateway?","messages":[{"role":"user","content":"What’s a practical approach to rate limiting in an API gateway?"},{"role":"assistant","content":"Token bucket or leaky bucket with distributed counters stored in Redis/Memcached. Support per-API-key limits, burst capacity, and sliding windows. Return standard headers (X-RateLimit-*) and a 429 with retry-after. Add shadow mode first to calibrate limits before enforcing. Consider global vs per-route quotas and safe-list internal traffic."},{"role":"user","content":"How to prevent Redis hot keys?"},{"role":"assistant","content":"Shard keys (hash prefix), add jitter to expirations, and consider hierarchical limits (per-user + per-IP) to spread load."}],"prompt_id":"cura0018"}
19
+ {"prompt":"We need a privacy-friendly analytics setup for a SaaS app.","messages":[{"role":"user","content":"We need a privacy-friendly analytics setup for a SaaS app."},{"role":"assistant","content":"Collect minimal data, avoid PII by default, and hash identifiers where possible. Use first-party storage, respect DNT/consent, and sample aggressively. Aggregate metrics on the server, not at the client. Provide data deletion APIs and retention limits. Document what you collect and why. Prefer self-hosted or privacy-forward tools when feasible."},{"role":"user","content":"How to handle EU users?"},{"role":"assistant","content":"Collect explicit consent for non-essential tracking, store data in-region, and sign DPAs with processors. Offer opt-out and implement subject access/deletion requests."}],"prompt_id":"cura0019"}
20
+ {"prompt":"Give me a clean GitHub Actions CI for a Python package with tests and cache.","messages":[{"role":"user","content":"Give me a clean GitHub Actions CI for a Python package with tests and cache."},{"role":"assistant","content":"Matrix Python versions, set up caching for pip, run lint (ruff/flake8), type check (mypy), and tests with coverage. Upload coverage to a service if needed. Use actions/setup-python, actions/cache keyed by hashFiles of requirements*.txt/lockfile. Fail fast on lint/type errors to save time."},{"role":"user","content":"What about wheels?"},{"role":"assistant","content":"Add a release workflow: build sdist and wheels (cibuildwheel for native), sign if required, and publish to PyPI on a tag push."}],"prompt_id":"cura0020"}
21
+ {"prompt":"How do I make a secure Docker image for Node.js?","messages":[{"role":"user","content":"How do I make a secure Docker image for Node.js?"},{"role":"assistant","content":"Use a slim base (distroless/alpine with care), create a non-root user, drop caps, and set NODE_ENV=production. Copy only needed files; leverage .dockerignore. Run npm ci with a locked lockfile. Scan images (Trivy) and update frequently. Expose minimal ports, set HEALTHCHECK, and avoid shelling out. Sign images (cosign) and enforce provenance (SLSA)."},{"role":"user","content":"Anything about openssl?"},{"role":"assistant","content":"Pin CA bundles, keep OpenSSL up to date via base image updates, and avoid bundling unused crypto libraries to reduce surface area."}],"prompt_id":"cura0021"}
22
+ {"prompt":"What’s a good structure for a blameless incident postmortem?","messages":[{"role":"user","content":"What’s a good structure for a blameless incident postmortem?"},{"role":"assistant","content":"Include: Summary (impact, severity, detection), Timeline, Root Causes (technical + organizational), Mitigations, Follow-ups (owners, due dates), What went well/poorly, and Attachments (graphs, logs). Focus on systemic fixes, not blame. Publish within 3–5 business days and track action items to closure."},{"role":"user","content":"How to keep it concise?"},{"role":"assistant","content":"Cap the summary to 5–7 bullets, use a time-ordered table for the timeline, and move details to an appendix. Clarity beats completeness."}],"prompt_id":"cura0022"}
23
+ {"prompt":"I need a practical guide to designing idempotent REST endpoints.","messages":[{"role":"user","content":"I need a practical guide to designing idempotent REST endpoints."},{"role":"assistant","content":"Use PUT for create-or-replace with client-provided IDs, PATCH for partial updates designed to be idempotent (replace-or-upsert fields). Support an Idempotency-Key header to dedupe POSTs for non-idempotent operations. Store request hash and response status/body for the key TTL. Make side effects conditional on idempotency record existence."},{"role":"user","content":"What about retries from mobile?"},{"role":"assistant","content":"Assume retries happen; require clients to send stable keys and handle timeouts gracefully. Document error codes for duplicate requests (e.g., 409 with existing resource ID)."}],"prompt_id":"cura0023"}
24
+ {"prompt":"How do I choose chunk size and overlap for RAG document ingestion?","messages":[{"role":"user","content":"How do I choose chunk size and overlap for RAG document ingestion?"},{"role":"assistant","content":"Start with 500–1,000 tokens per chunk with ~50–100 token overlap. Tune based on domain: longer for narrative/prose, shorter for code/specs. Ensure chunk boundaries align with semantics (headings, sections). Evaluate retrieval recall on a validation set and adjust; consider hierarchical retrieval (section -> subsection) for large docs."},{"role":"user","content":"Should I store metadata?"},{"role":"assistant","content":"Yes—source, title, section, timestamp, and permissions. Metadata filtering is critical to precise retrieval and access control."}],"prompt_id":"cura0024"}
25
+ {"prompt":"What are effective patterns for feature toggles in mobile apps?","messages":[{"role":"user","content":"What are effective patterns for feature toggles in mobile apps?"},{"role":"assistant","content":"Use a remote-config/flag service with cached evaluations and sticky bucketing. Define defaults in the app, fetch on startup with a timeout, and refresh periodically. Namespace flags per environment. Log evaluations for analysis. Decouple flags from code via typed accessors/enums, and remove stale flags as part of release hygiene."},{"role":"user","content":"Offline behavior?"},{"role":"assistant","content":"Keep last-known-good config and safe defaults. Defer risky features behind server-verified flags so they don't activate without a fetch."}],"prompt_id":"cura0025"}
sample_data/train_sft_curated_text.jsonl ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"text": "<start_of_turn>user\nDesign a robust retry strategy for a REST client calling a flaky upstream API.<end_of_turn>\n<start_of_turn>model\nUse bounded exponential backoff with jitter, a small max retry cap, and timeouts that fit your SLO. Retry only on idempotent requests and transient failures (5xx, timeouts, connection reset). Add circuit breaker to fail fast during sustained outages and surface metrics (latency, error rate, retry count). Include per-request deadlines to avoid unbounded tail latency and propagate correlation IDs for tracing.<end_of_turn>\n<start_of_turn>user\nWhat jitter mode should I choose?<end_of_turn>\n<start_of_turn>model\nPrefer 'full jitter' or 'decorrelated jitter' to spread retry storms. Full jitter picks a random delay in [0, base*2^n] each attempt; it's simple and effective.<end_of_turn>\n"}
2
+ {"text": "<start_of_turn>user\nI need a quick primer: SQL window functions for running totals and rankings.<end_of_turn>\n<start_of_turn>model\nRunning totals: SUM(amount) OVER (PARTITION BY account ORDER BY ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW). Rankings: use ROW_NUMBER(), RANK(), DENSE_RANK() with ORDER BY inside OVER(). Combine PARTITION BY to reset per group. Avoid mixing ORDER BY in SELECT and window frames incorrectly; rely on explicit frames for deterministic results.<end_of_turn>\n<start_of_turn>user\nHow do I do top-1 per group?<end_of_turn>\n<start_of_turn>model\nUse ROW_NUMBER() OVER (PARTITION BY grp ORDER BY score DESC) AS rn then filter rn = 1 in an outer query. It's clearer and often faster than DISTINCT ON across engines.<end_of_turn>\n"}
3
+ {"text": "<start_of_turn>user\nExplain the difference between blue/green and canary deployments.<end_of_turn>\n<start_of_turn>model\nBlue/green runs two production environments (blue, green) and flips all traffic from old to new at once after validation. Canary gradually shifts a small percentage of traffic to the new version, monitoring metrics and increasing if healthy. Blue/green optimizes for fast rollback; canary optimizes for risk reduction and incremental validation under real load.<end_of_turn>\n<start_of_turn>user\nWhen would you favor blue/green?<end_of_turn>\n<start_of_turn>model\nWhen you need instant rollback and can afford duplicate infrastructure, or when state/data migrations are handled atomically and validated before the flip.<end_of_turn>\n"}
4
+ {"text": "<start_of_turn>user\nMy Kafka consumer lags spike occasionally. What should I check first?<end_of_turn>\n<start_of_turn>model\nCheck: (1) processing time vs max.poll.interval.ms; (2) batch size and max.poll.records; (3) consumer group rebalances (unstable partitions, frequent membership changes); (4) broker/network throughput; (5) GC pauses. Add metrics (poll latency, process time, commit time) and consider increasing concurrency (more partitions or parallelism within a consumer) if single-threaded processing is the bottleneck.<end_of_turn>\n<start_of_turn>user\nCommits seem slow.<end_of_turn>\n<start_of_turn>model\nUse async commits batched per processed batch, ensure idempotent sinks, and avoid committing too frequently. If using transactions/EOS, ensure transaction timeouts and producer buffers aren’t constraining throughput.<end_of_turn>\n"}
5
+ {"text": "<start_of_turn>user\nHow do I structure a FastAPI project for maintainability?<end_of_turn>\n<start_of_turn>model\nOrganize by domain: routers/ (API), schemas/ (Pydantic models), services/ (business logic), repositories/ (DB access), core/ (settings, security), and tests/. Keep dependencies pointing inward (API -> services -> repos). Use dependency injection (Depends) sparingly for explicitness, and centralize exception handlers and middlewares. Add pre-commit hooks (black, isort, ruff, mypy).<end_of_turn>\n<start_of_turn>user\nWhere should validation live?<end_of_turn>\n<start_of_turn>model\nPut request shape/type validation in Pydantic models, cross-field business rules in services. Keep DB constraints for integrity as a final guard.<end_of_turn>\n"}
6
+ {"text": "<start_of_turn>user\nWe need a feature flag system with gradual rollout and stickiness.<end_of_turn>\n<start_of_turn>model\nUse server-evaluated flags with a rules engine: target by attributes (country, app_version), percentage rollouts with consistent hashing on a stable key (user_id) for stickiness, and environment separation. Provide a kill switch, audit logs, and SDK caches with TTL. Evaluate flags at request time for dynamic behavior, but cache decisions for performance when safe.<end_of_turn>\n<start_of_turn>user\nHow do I avoid config debt?<end_of_turn>\n<start_of_turn>model\nSet TTLs on flags, require owner, add CI checks to block stale flags beyond a certain age, and run periodic cleanup sprints.<end_of_turn>\n"}
7
+ {"text": "<start_of_turn>user\nDesign an S3 + Lambda image thumbnailer with idempotency and a DLQ.<end_of_turn>\n<start_of_turn>model\nS3 ObjectCreated -> Lambda validates content-type, generates thumbnails, stores under thumbnails/{key}. Use ETag + key as idempotency key in DynamoDB to avoid duplicates. Configure Lambda retries with exponential backoff and an SQS DLQ for poison images. Emit metrics/logs. Use memory tuned for sharp/imagelib performance, and consider Provisioned Concurrency for cold start-sensitive paths.<end_of_turn>\n<start_of_turn>user\nHow to prevent reprocessing on overwrite?<end_of_turn>\n<start_of_turn>model\nUse object versioning and process only the latest version; compare stored ETag in your idempotency record to skip already-processed versions.<end_of_turn>\n"}
8
+ {"text": "<start_of_turn>user\nMake a small, readable Git commit message template for a team.<end_of_turn>\n<start_of_turn>model\nTemplate:\n\n<type>(<scope>): <short summary>\n\nWhy:\n- <reason/impact>\n\nWhat:\n- <key changes>\n\nNotes:\n- <breaking changes/migrations>\n\nTypes: feat, fix, chore, docs, refactor, test, perf, ci. Keep subject <= 72 chars; use imperative mood.<end_of_turn>\n<start_of_turn>user\nDo we need ticket IDs?<end_of_turn>\n<start_of_turn>model\nIf your tracker is the source of truth, include the ID (e.g., PROJ-123) in the subject or a footer, but keep the summary readable.<end_of_turn>\n"}
9
+ {"text": "<start_of_turn>user\nBest practices for paginating a write-heavy Postgres list endpoint.<end_of_turn>\n<start_of_turn>model\nUse cursor pagination by (created_at, id) with a composite index matching the order. Avoid OFFSET. For consistent results under concurrent writes, use WHERE (created_at, id) < (:created_at, :id) for DESC pages. Limit payload size, select necessary columns, and consider keyset pagination helpers. Document forward/backward navigation and next/prev cursors.<end_of_turn>\n<start_of_turn>user\nHow to support filtering too?<end_of_turn>\n<start_of_turn>model\nInclude filter fields in the index or accept slightly less efficient filtered scans. For highly selective filters, build dedicated indexes. Keep cursor encoding including relevant filter context to avoid mismatches when resuming.<end_of_turn>\n"}
10
+ {"text": "<start_of_turn>user\nWe’re seeing OOMKilled in Kubernetes—how to set resource requests/limits?<end_of_turn>\n<start_of_turn>model\nProfile the app to find steady-state and peak memory. Set requests to typical steady-state plus headroom; set limits high enough to avoid false kills but low enough to protect the node. Use VPA recommendations as a baseline, enable memory QoS (if available), and add liveness/readiness probes that account for GC or warm-up phases. Monitor container_memory_working_set_bytes and eviction signals.<end_of_turn>\n<start_of_turn>user\nWhat about CPU throttling?<end_of_turn>\n<start_of_turn>model\nAvoid setting CPU limits unless needed; prefer requests that allow bursts. If you must set limits, ensure they match performance SLAs and test under load to detect throttling-induced latency.<end_of_turn>\n"}
11
+ {"text": "<start_of_turn>user\nHow can I reduce LLM hallucinations in a production Q&A bot?<end_of_turn>\n<start_of_turn>model\nUse retrieval-augmented generation (RAG) with high-quality embeddings and chunking; prompt the model to answer strictly from retrieved context and cite sources. Add a post-generation verifier for facts and a fallback of 'I don't know' when confidence is low. Constrain with structured prompts, enable content filters, and log+review flagged responses for continual improvement.<end_of_turn>\n<start_of_turn>user\nAny retrieval tips?<end_of_turn>\n<start_of_turn>model\nUse domain-tuned embeddings, overlap chunks, and hybrid search (BM25 + vector). Deduplicate and normalize docs; prefer fresh, authoritative sources.<end_of_turn>\n"}
12
+ {"text": "<start_of_turn>user\nWhat’s the difference between idempotency keys and deduplication at-least-once systems?<end_of_turn>\n<start_of_turn>model\nIdempotency keys ensure repeating the same request produces the same effect (client or server enforces one-time processing). Deduplication removes duplicate messages at the broker/consumer level. Use both: idempotency for side-effect safety at the app boundary and dedupe for transport-level resilience. Store idempotency records with TTL and conflict checks.<end_of_turn>\n<start_of_turn>user\nWhere to store keys?<end_of_turn>\n<start_of_turn>model\nIn a durable, transactional store close to the operation (DB table or cache with write-through). Index by key and created_at for expiry sweeps.<end_of_turn>\n"}
13
+ {"text": "<start_of_turn>user\nHow do I do structured logging well in a Node.js service?<end_of_turn>\n<start_of_turn>model\nUse a JSON logger (pino, bunyan) with a consistent schema: level, msg, timestamp, service, trace_id, span_id, user_id (if relevant). Avoid logging PII; use redaction. Log at INFO for key events, WARN for recoverable anomalies, ERROR for failures; DEBUG locally. Include context in logs, not in message strings. Forward to a central system with retention and indexing policies.<end_of_turn>\n<start_of_turn>user\nAny perf concerns?<end_of_turn>\n<start_of_turn>model\nUse async logging and avoid heavy formatting. Benchmark; pino with transports disabled in-process is fast. Offload shipping to a sidecar/agent where possible.<end_of_turn>\n"}
14
+ {"text": "<start_of_turn>user\nGuidance for writing maintainable Terraform modules.<end_of_turn>\n<start_of_turn>model\nKeep inputs minimal and typed with validation. Provide sane defaults and outputs. Use versioned modules with semantic versioning and a CHANGELOG. Document variables and examples. Avoid deep module nesting; prefer composition in stacks. Use 'for_each' and 'depends_on' judiciously. Add linting (tflint), formatting, and basic tests (terraform validate, kitchen/terratest).<end_of_turn>\n<start_of_turn>user\nHow to handle breaking changes?<end_of_turn>\n<start_of_turn>model\nRelease a major version, provide migration notes, and support deprecation windows where possible. Consider a compatibility module temporarily if needed.<end_of_turn>\n"}
15
+ {"text": "<start_of_turn>user\nHow to design an analytics event schema that’s future-proof?<end_of_turn>\n<start_of_turn>model\nDefine a core envelope (event_name, user_id, timestamp, context) and a 'properties' object for event-specific fields. Use stable, documented names and types. Add schema versioning and a registry. Enforce validation at ingestion (DDL, JSON Schema). Support nullable/optional fields for evolution, avoid breaking renames. Include privacy metadata (PII flags) and data retention policies.<end_of_turn>\n<start_of_turn>user\nHow to evolve safely?<end_of_turn>\n<start_of_turn>model\nUse additive changes by default; deprecate, then remove after a window. Provide backfills or derived fields to keep downstreams working.<end_of_turn>\n"}
16
+ {"text": "<start_of_turn>user\nI need a resilient webhook consumer design for Stripe events.<end_of_turn>\n<start_of_turn>model\nVerify signatures on receipt, enqueue to a durable queue, process with idempotency keyed by event.id, and wrap side effects in transactions. Retry with backoff; send poison messages to a DLQ. Expose metrics (throughput, failures) and dashboards. Keep handlers idempotent and avoid long-running logic in the HTTP handler.<end_of_turn>\n<start_of_turn>user\nHow many retries?<end_of_turn>\n<start_of_turn>model\nUse a small bounded number (e.g., 5) with exponential backoff; beyond that, send to DLQ and alert humans.<end_of_turn>\n"}
17
+ {"text": "<start_of_turn>user\nC# async/await common pitfalls in ASP.NET Core.<end_of_turn>\n<start_of_turn>model\nAvoid async void except for event handlers; prefer Task-returning methods. Don’t block on async (.Result/.Wait()) to prevent deadlocks. Propagate cancellation tokens down the stack. Pool HttpClient via IHttpClientFactory. Beware context capture; use ConfigureAwait(false) in libraries. Measure thread pool starvation; use async all the way.<end_of_turn>\n<start_of_turn>user\nHow to time-bound external calls?<end_of_turn>\n<start_of_turn>model\nUse CancellationToken with per-call timeouts (HttpClient policies) and fallback logic using Polly (retry, circuit breaker).<end_of_turn>\n"}
18
+ {"text": "<start_of_turn>user\nWhat’s a practical approach to rate limiting in an API gateway?<end_of_turn>\n<start_of_turn>model\nToken bucket or leaky bucket with distributed counters stored in Redis/Memcached. Support per-API-key limits, burst capacity, and sliding windows. Return standard headers (X-RateLimit-*) and a 429 with retry-after. Add shadow mode first to calibrate limits before enforcing. Consider global vs per-route quotas and safe-list internal traffic.<end_of_turn>\n<start_of_turn>user\nHow to prevent Redis hot keys?<end_of_turn>\n<start_of_turn>model\nShard keys (hash prefix), add jitter to expirations, and consider hierarchical limits (per-user + per-IP) to spread load.<end_of_turn>\n"}
19
+ {"text": "<start_of_turn>user\nWe need a privacy-friendly analytics setup for a SaaS app.<end_of_turn>\n<start_of_turn>model\nCollect minimal data, avoid PII by default, and hash identifiers where possible. Use first-party storage, respect DNT/consent, and sample aggressively. Aggregate metrics on the server, not at the client. Provide data deletion APIs and retention limits. Document what you collect and why. Prefer self-hosted or privacy-forward tools when feasible.<end_of_turn>\n<start_of_turn>user\nHow to handle EU users?<end_of_turn>\n<start_of_turn>model\nCollect explicit consent for non-essential tracking, store data in-region, and sign DPAs with processors. Offer opt-out and implement subject access/deletion requests.<end_of_turn>\n"}
20
+ {"text": "<start_of_turn>user\nGive me a clean GitHub Actions CI for a Python package with tests and cache.<end_of_turn>\n<start_of_turn>model\nMatrix Python versions, set up caching for pip, run lint (ruff/flake8), type check (mypy), and tests with coverage. Upload coverage to a service if needed. Use actions/setup-python, actions/cache keyed by hashFiles of requirements*.txt/lockfile. Fail fast on lint/type errors to save time.<end_of_turn>\n<start_of_turn>user\nWhat about wheels?<end_of_turn>\n<start_of_turn>model\nAdd a release workflow: build sdist and wheels (cibuildwheel for native), sign if required, and publish to PyPI on a tag push.<end_of_turn>\n"}
21
+ {"text": "<start_of_turn>user\nHow do I make a secure Docker image for Node.js?<end_of_turn>\n<start_of_turn>model\nUse a slim base (distroless/alpine with care), create a non-root user, drop caps, and set NODE_ENV=production. Copy only needed files; leverage .dockerignore. Run npm ci with a locked lockfile. Scan images (Trivy) and update frequently. Expose minimal ports, set HEALTHCHECK, and avoid shelling out. Sign images (cosign) and enforce provenance (SLSA).<end_of_turn>\n<start_of_turn>user\nAnything about openssl?<end_of_turn>\n<start_of_turn>model\nPin CA bundles, keep OpenSSL up to date via base image updates, and avoid bundling unused crypto libraries to reduce surface area.<end_of_turn>\n"}
22
+ {"text": "<start_of_turn>user\nWhat’s a good structure for a blameless incident postmortem?<end_of_turn>\n<start_of_turn>model\nInclude: Summary (impact, severity, detection), Timeline, Root Causes (technical + organizational), Mitigations, Follow-ups (owners, due dates), What went well/poorly, and Attachments (graphs, logs). Focus on systemic fixes, not blame. Publish within 3–5 business days and track action items to closure.<end_of_turn>\n<start_of_turn>user\nHow to keep it concise?<end_of_turn>\n<start_of_turn>model\nCap the summary to 5–7 bullets, use a time-ordered table for the timeline, and move details to an appendix. Clarity beats completeness.<end_of_turn>\n"}
23
+ {"text": "<start_of_turn>user\nI need a practical guide to designing idempotent REST endpoints.<end_of_turn>\n<start_of_turn>model\nUse PUT for create-or-replace with client-provided IDs, PATCH for partial updates designed to be idempotent (replace-or-upsert fields). Support an Idempotency-Key header to dedupe POSTs for non-idempotent operations. Store request hash and response status/body for the key TTL. Make side effects conditional on idempotency record existence.<end_of_turn>\n<start_of_turn>user\nWhat about retries from mobile?<end_of_turn>\n<start_of_turn>model\nAssume retries happen; require clients to send stable keys and handle timeouts gracefully. Document error codes for duplicate requests (e.g., 409 with existing resource ID).<end_of_turn>\n"}
24
+ {"text": "<start_of_turn>user\nHow do I choose chunk size and overlap for RAG document ingestion?<end_of_turn>\n<start_of_turn>model\nStart with 500–1,000 tokens per chunk with ~50–100 token overlap. Tune based on domain: longer for narrative/prose, shorter for code/specs. Ensure chunk boundaries align with semantics (headings, sections). Evaluate retrieval recall on a validation set and adjust; consider hierarchical retrieval (section -> subsection) for large docs.<end_of_turn>\n<start_of_turn>user\nShould I store metadata?<end_of_turn>\n<start_of_turn>model\nYes—source, title, section, timestamp, and permissions. Metadata filtering is critical to precise retrieval and access control.<end_of_turn>\n"}
25
+ {"text": "<start_of_turn>user\nWhat are effective patterns for feature toggles in mobile apps?<end_of_turn>\n<start_of_turn>model\nUse a remote-config/flag service with cached evaluations and sticky bucketing. Define defaults in the app, fetch on startup with a timeout, and refresh periodically. Namespace flags per environment. Log evaluations for analysis. Decouple flags from code via typed accessors/enums, and remove stale flags as part of release hygiene.<end_of_turn>\n<start_of_turn>user\nOffline behavior?<end_of_turn>\n<start_of_turn>model\nKeep last-known-good config and safe defaults. Defer risky features behind server-verified flags so they don't activate without a fetch.<end_of_turn>\n"}
sample_data/train_sft_text.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
scripts/convert_ultrachat_to_text.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert UltraChat-style records (prompt, messages[], prompt_id) into a single
4
+ "text" field using the chat formatting expected by training/train_gemma_unsloth.py:
5
+
6
+ <start_of_turn>user\n...<end_of_turn>\n<start_of_turn>model\n...<end_of_turn>\n(repeat)
7
+
8
+ Usage:
9
+ python scripts/convert_ultrachat_to_text.py \
10
+ --in sample_data/train_sft.jsonl \
11
+ --out sample_data/train_sft_text.jsonl
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import json
17
+ import os
18
+ from typing import List, TypedDict, Any, Dict, cast
19
+
20
+ ROLE_MAP = {
21
+ "user": "user",
22
+ "assistant": "model",
23
+ }
24
+
25
+
26
+ class Msg(TypedDict):
27
+ content: str
28
+ role: str
29
+
30
+
31
+ def to_chat_text(messages: List[Msg]) -> str:
32
+ parts: List[str] = []
33
+ for m in messages:
34
+ role = ROLE_MAP.get(m.get("role", "user"), "user")
35
+ content = (m.get("content", "") or "").rstrip()
36
+ parts.append(f"<start_of_turn>{role}\n{content}<end_of_turn>")
37
+ return "\n".join(parts) + "\n"
38
+
39
+
40
+ def convert(in_path: str, out_path: str) -> int:
41
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
42
+ n_in = 0
43
+ n_out = 0
44
+ with open(in_path, "r", encoding="utf-8") as fin, open(out_path, "w", encoding="utf-8") as fout:
45
+ for line in fin:
46
+ if not line.strip():
47
+ continue
48
+ n_in += 1
49
+ obj: Any = json.loads(line)
50
+ raw: Any = obj.get("messages")
51
+ if not isinstance(raw, list) or not raw:
52
+ continue
53
+ all_dicts = True
54
+ for x_any in cast(List[Any], raw):
55
+ if not isinstance(x_any, dict):
56
+ all_dicts = False
57
+ break
58
+ if not all_dicts:
59
+ continue
60
+ raw_list: List[Dict[str, Any]] = cast(List[Dict[str, Any]], raw)
61
+ messages: List[Msg] = []
62
+ for item in raw_list:
63
+ content_any = item.get("content")
64
+ role_any = item.get("role")
65
+ if not isinstance(content_any, str) or not isinstance(role_any, str):
66
+ messages = []
67
+ break
68
+ messages.append({"content": content_any, "role": role_any})
69
+ if not messages:
70
+ continue
71
+ text = to_chat_text(messages)
72
+ fout.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")
73
+ n_out += 1
74
+ print(f"Converted {n_out}/{n_in} records -> {out_path}")
75
+ return 0
76
+
77
+
78
+ def main() -> int:
79
+ ap = argparse.ArgumentParser()
80
+ ap.add_argument("--in", dest="in_path", required=True)
81
+ ap.add_argument("--out", dest="out_path", required=True)
82
+ args = ap.parse_args()
83
+ return convert(args.in_path, args.out_path)
84
+
85
+
86
+ if __name__ == "__main__":
87
+ raise SystemExit(main())
scripts/generate_ultrachat_sft.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate a synthetic UltraChat-style SFT dataset.
4
+
5
+ Each JSONL line has:
6
+ {
7
+ "prompt": "<first user message>",
8
+ "messages": [{"content": "...", "role": "user"}, {"content": "...", "role": "assistant"}, ...],
9
+ "prompt_id": "<sha256 hex of prompt>"
10
+ }
11
+
12
+ Usage:
13
+ python scripts/generate_ultrachat_sft.py --out sample_data/train_sft.jsonl --n 500
14
+
15
+ Only uses Python standard library.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import hashlib
21
+ import json
22
+ import os
23
+ import random
24
+ import sys
25
+ from typing import Any, Dict, List, Sequence
26
+
27
+ TOPICS = [
28
+ "travel planning",
29
+ "cooking and recipes",
30
+ "software engineering",
31
+ "data science",
32
+ "machine learning",
33
+ "mathematics",
34
+ "history",
35
+ "literature",
36
+ "productivity",
37
+ "fitness and health",
38
+ "economics",
39
+ "marketing",
40
+ "photography",
41
+ "music theory",
42
+ "language learning",
43
+ "gardening",
44
+ "home improvement",
45
+ "career advice",
46
+ "public speaking",
47
+ "time management",
48
+ "networking",
49
+ "resume writing",
50
+ "interview prep",
51
+ "cloud computing",
52
+ "kubernetes",
53
+ "devops",
54
+ "cybersecurity",
55
+ "robotics",
56
+ "electronics",
57
+ "astronomy",
58
+ "climate and environment",
59
+ "education",
60
+ "parenting",
61
+ "pet care",
62
+ "mental wellness",
63
+ "philosophy",
64
+ "ethics",
65
+ "UX design",
66
+ "UI design",
67
+ "copywriting",
68
+ "game design",
69
+ "board games",
70
+ "coffee brewing",
71
+ "tea brewing",
72
+ "urban planning",
73
+ "transportation",
74
+ "sports science",
75
+ "statistics",
76
+ ]
77
+
78
+ USER_OPENERS = [
79
+ "Could you help me with {topic}?",
80
+ "What are the key steps to get started with {topic}?",
81
+ "Give me a concise plan to improve at {topic} over 4 weeks.",
82
+ "Explain a few common pitfalls in {topic} and how to avoid them.",
83
+ "Draft a checklist for beginners in {topic}.",
84
+ "Compare two approaches commonly used in {topic} and when to choose each.",
85
+ "I have 2 hours to learn about {topic} today—what should I do?",
86
+ "What metrics matter most in {topic} and how do I track them?",
87
+ "Summarize an actionable framework for {topic} with examples.",
88
+ ]
89
+
90
+ FOLLOW_UPS = [
91
+ "Nice. Can you add 3 concrete examples?",
92
+ "Could you turn that into a step-by-step guide?",
93
+ "What are likely failure modes and mitigations?",
94
+ "How would a beginner apply this in a weekend project?",
95
+ "Please give a small template I can reuse.",
96
+ "How do I measure progress over a month?",
97
+ ]
98
+
99
+ DEEPENERS = [
100
+ "Great. Any tips to make it resilient under constraints?",
101
+ "How would you adapt this for a small team?",
102
+ "What are ethical considerations I should keep in mind?",
103
+ "What's an underrated practice here and why?",
104
+ ]
105
+
106
+ def make_answer(topic: str, opener: str) -> str:
107
+ bullets: List[str] = [
108
+ f"Define your objective in {topic} (outcome, constraints, timeline).",
109
+ f"Map key concepts and tools in {topic}; pick one stack and stick to it for 2-4 weeks.",
110
+ f"Practice with a small, scoped project; iterate with feedback.",
111
+ f"Track 1-3 metrics that reflect real progress; review weekly.",
112
+ f"Document what worked, what didn't, and the next experiment.",
113
+ ]
114
+ return (
115
+ f"Here's a pragmatic path for {topic} based on your request:\n" +
116
+ "\n".join([f"- {b}" for b in bullets]) +
117
+ "\n\nCommon pitfalls:\n- Starting too big; keep scope small.\n- Tool hopping; commit to one stack.\n- No feedback loop; schedule reviews.\n- Unclear metrics; define success upfront."
118
+ )
119
+
120
+
121
+ def make_examples(topic: str) -> str:
122
+ ex: List[str] = [
123
+ f"Example 1 — 90-minute sprint: Learn one core concept in {topic} and apply it to a toy task.",
124
+ f"Example 2 — Weekend project: Build a tiny demo that proves a single capability in {topic}.",
125
+ f"Example 3 — Peer review: Share results, gather feedback, and improve one dimension.",
126
+ ]
127
+ return "\n".join(ex)
128
+
129
+
130
+ def make_template(topic: str) -> str:
131
+ return (
132
+ f"Template — One-page plan for {topic}:\n"
133
+ f"Goal: <1-2 sentences>\n"
134
+ f"Scope: <what's in / out>\n"
135
+ f"Resources: <3 links or docs>\n"
136
+ f"Milestones (weekly): <targets>\n"
137
+ f"Metrics: <quant/qual measures>\n"
138
+ f"Risks & mitigations: <top 3>\n"
139
+ )
140
+
141
+
142
+ def rng_pick(r: random.Random, items: Sequence[str]) -> str:
143
+ return items[r.randrange(0, len(items))]
144
+
145
+
146
+ def make_dialogue(r: random.Random) -> Dict[str, Any]:
147
+ topic = rng_pick(r, TOPICS)
148
+ opener_tpl = rng_pick(r, USER_OPENERS)
149
+ opener = opener_tpl.format(topic=topic)
150
+
151
+ a1 = make_answer(topic, opener)
152
+
153
+ u2 = rng_pick(r, FOLLOW_UPS)
154
+ a2 = make_examples(topic) if "example" in u2.lower() else (
155
+ make_template(topic) if "template" in u2.lower() else
156
+ "Here is a step-by-step variant focusing on small wins first, then scope up.\n1) Clarify outcome.\n2) Pick one method.\n3) Build tiny demo.\n4) Review metrics.\n5) Iterate."
157
+ )
158
+
159
+ if r.random() < 0.6:
160
+ u3 = rng_pick(r, DEEPENERS)
161
+ a3 = (
162
+ "Resilience tips:\n- Use checklists and pre-mortems.\n- Automate a single weak link each week.\n- Keep a rollback plan.\n- Prefer boring, well-documented tools.\n\nEthics: stay transparent, reduce bias, and respect privacy."
163
+ )
164
+ messages: List[Dict[str, str]] = [
165
+ {"content": opener, "role": "user"},
166
+ {"content": a1, "role": "assistant"},
167
+ {"content": u2, "role": "user"},
168
+ {"content": a2, "role": "assistant"},
169
+ {"content": u3, "role": "user"},
170
+ {"content": a3, "role": "assistant"},
171
+ ]
172
+ else:
173
+ messages = [
174
+ {"content": opener, "role": "user"},
175
+ {"content": a1, "role": "assistant"},
176
+ {"content": rng_pick(r, FOLLOW_UPS), "role": "user"},
177
+ {"content": a2, "role": "assistant"},
178
+ ]
179
+
180
+ prompt = messages[0]["content"]
181
+ prompt_id = hashlib.sha256(prompt.encode("utf-8")).hexdigest()
182
+ return {"prompt": prompt, "messages": messages, "prompt_id": prompt_id}
183
+
184
+
185
+ def generate(n: int, seed: int) -> List[Dict[str, Any]]:
186
+ r = random.Random(seed)
187
+ return [make_dialogue(r) for _ in range(n)]
188
+
189
+
190
+ def main(argv: List[str]) -> int:
191
+ parser = argparse.ArgumentParser()
192
+ parser.add_argument("--out", type=str, default="sample_data/train_sft.jsonl")
193
+ parser.add_argument("--n", type=int, default=500)
194
+ parser.add_argument("--seed", type=int, default=42)
195
+ args = parser.parse_args(argv)
196
+
197
+ data = generate(args.n, args.seed)
198
+ out_path = args.out
199
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
200
+ with open(out_path, "w", encoding="utf-8") as f:
201
+ for row in data:
202
+ f.write(json.dumps(row, ensure_ascii=False) + "\n")
203
+
204
+ print(f"Wrote {len(data)} rows to {out_path}")
205
+ return 0
206
+
207
+
208
+ if __name__ == "__main__":
209
+ raise SystemExit(main(sys.argv[1:]))