Commit 5301c48 · init commit · 0 parent(s)
This view is limited to 50 files because it contains too many changes.
- .dockerignore +118 -0
- .env.template +17 -0
- .github/dependabot.yml +6 -0
- .github/workflows/lint-and-test.yaml +60 -0
- .github/workflows/publish_pypi.yaml +39 -0
- .github/workflows/publish_testpypi.yaml +107 -0
- .gitignore +190 -0
- .gitmodules +6 -0
- .pre-commit-config.yaml +25 -0
- Dockerfile +79 -0
- LICENSE +202 -0
- Makefile +28 -0
- README.HuggingFace.md +177 -0
- README.md +193 -0
- docs_mintlify +1 -0
- examples/__init__.py +0 -0
- examples/data_factory.ipynb +681 -0
- examples/data_factory_release_check.ipynb +494 -0
- examples/embedding_usage_example.py +202 -0
- examples/structured_llm.ipynb +470 -0
- examples/usecases/math_data_gen.ipynb +0 -0
- internal +1 -0
- mcp_hackathon/README.md +119 -0
- mcp_hackathon/data_gen_server/.gitignore +1 -0
- mcp_hackathon/data_gen_server/.python-version +1 -0
- mcp_hackathon/data_gen_server/data_gen_server.py +68 -0
- mcp_hackathon/data_gen_server/model_gen.py +73 -0
- mcp_hackathon/data_gen_server/model_probe.py +65 -0
- nginx.conf +112 -0
- poetry.lock +0 -0
- prebuilt_template/README.md +61 -0
- prebuilt_template/function_calling/README.md +23 -0
- prebuilt_template/function_calling/sample_run.ipynb +425 -0
- prebuilt_template/generate_by_topic/README.md +102 -0
- prebuilt_template/generate_by_topic/sample_run.ipynb +438 -0
- pyproject.toml +132 -0
- pytest.ini +7 -0
- readme-web.md +23 -0
- scripts/hug_push.sh +1 -0
- scripts/rag.py +155 -0
- src/starfish/__init__.py +18 -0
- src/starfish/common/env_loader.py +52 -0
- src/starfish/common/exceptions.py +325 -0
- src/starfish/common/logger.py +104 -0
- src/starfish/components/__init__.py +3 -0
- src/starfish/components/prepare_topic.py +275 -0
- src/starfish/data_factory/config.py +6 -0
- src/starfish/data_factory/constants.py +75 -0
- src/starfish/data_factory/event_loop.py +35 -0
- src/starfish/data_factory/factory.py +112 -0
.dockerignore
ADDED
@@ -0,0 +1,118 @@
# Git and version control
.git
.gitignore
.gitattributes

# Development files
.env*
!.env.example
.vscode/
.idea/
*.swp
*.swo
*~

# OS files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
.venv/
env/
venv/
ENV/
env.bak/
venv.bak/

# Node.js
web/node_modules/
web/npm-debug.log*
web/yarn-debug.log*
web/yarn-error.log*
web/.pnpm-debug.log*
web/.next/
web/out/
web/dist/
web/build/
web/.vercel

# Documentation
*.md
!README.md
docs/
docs_mintlify/
vibe_coding/

# Tests
tests/
*.test.js
*.test.ts
*.test.tsx
.coverage
htmlcov/
pytest.ini
.pytest_cache/

# Data and outputs
data_factory_output/
db/
*.db
*.sqlite

# Jupyter notebooks
*.ipynb
.ipynb_checkpoints/

# Temporary files
*.tmp
*.temp
.cache/

# Logs
*.log
web/api/logs/

# Docker files (except the main ones)
.dockerignore*
Dockerfile.*
docker-compose*.yml

# Development and internal files
internal/
examples/
mcp_hackathon/
prebuilt_template/
scripts/
htmlcov/

# Poetry (we copy these explicitly)
# poetry.lock - we need this
# pyproject.toml - we need this
.env.template
ADDED
@@ -0,0 +1,17 @@
# Starfish Environment Variables
# Copy this file to .env and customize for your local environment
# DO NOT commit the .env file to version control

# Environment type (DEV, STAGING, PROD)
ENV=DEV

# API Keys (replace with your own)
OPENAI_API_KEY=your_openai_api_key_here
ANTHROPIC_API_KEY=your_anthropic_api_key_here
HUGGING_FACE_HUB_TOKEN=your_huggingface_token_here
TELEMETRY_ENABLED=true

# Logging
LOG_LEVEL=INFO
# STARFISH_LOCAL_STORAGE_DIR=
JINA_AI_API_KEY=jina_api_key
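
These are plain environment variables, so any standard loader can pick them up once the template has been copied to `.env`. Below is a minimal sketch of reading them in Python, assuming the third-party `python-dotenv` package; the project ships its own loader in `src/starfish/common/env_loader.py`, whose exact API is not shown in this commit.

```python
# Minimal sketch: load values copied from .env.template into .env.
# Assumes the `python-dotenv` package is installed; the project's own
# env_loader module may behave differently.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

env_type = os.getenv("ENV", "DEV")
openai_api_key = os.getenv("OPENAI_API_KEY")
telemetry_enabled = os.getenv("TELEMETRY_ENABLED", "true").lower() == "true"
log_level = os.getenv("LOG_LEVEL", "INFO")

if openai_api_key is None:
    raise RuntimeError("OPENAI_API_KEY is not set; copy .env.template to .env first")
```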
.github/dependabot.yml
ADDED
@@ -0,0 +1,6 @@
version: 2
updates:
  - package-ecosystem: "pip"
    directory: "/" # root of the repo
    schedule:
      interval: "weekly"
.github/workflows/lint-and-test.yaml
ADDED
@@ -0,0 +1,60 @@
name: Starfish testing workflow

on:
  # push:
  #   branches:
  #     - main
  #     - dev
  pull_request:
    branches:
      - main
      - dev
      - '!f/pypi_release'

jobs:
  test-integration:
    if: github.event.pull_request.head.ref != 'f/pypi_release'
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.11'

      - name: Load cached Poetry installation
        uses: actions/cache@v3
        with:
          path: ~/.local
          key: poetry-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}

      - name: Load cached venv
        uses: actions/cache@v3
        with:
          path: .venv
          key: venv-${{ runner.os }}-python-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}

      - name: Set Locale
        run: |
          sudo locale-gen "en_US.UTF-8"
          export LC_ALL=en_US.UTF-8
          export LANG=en_US.UTF-8
          export TELEMETRY_ENABLED=false

      - name: Install dependencies
        run: |
          pip install poetry
          poetry install --with dev

      # - name: Run ruff
      #   run: |
      #     poetry run ruff check . --output-format=github
      #     poetry run ruff format . --check

      # --cov-report=html
      - name: Run tests with coverage
        run: |
          poetry run pytest --cov='src' --cov-fail-under=20 tests/
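
The job above gates pull requests on `poetry run pytest --cov='src' --cov-fail-under=20 tests/`. As an illustration of what runs under that gate, here is a minimal, hypothetical test file; the file name and assertions are assumptions for illustration, not part of this commit. The imported names come from the README's quick-start examples.

```python
# tests/test_imports.py — hypothetical smoke test, not part of this commit.


def test_public_api_importable():
    # Public names documented in the README quick start.
    from starfish import StructuredLLM, data_factory

    assert callable(data_factory)
    assert StructuredLLM is not None
```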
.github/workflows/publish_pypi.yaml
ADDED
@@ -0,0 +1,39 @@
name: Publish to PyPI

on:
  push:
    tags:
      - 'v*'
    # branches:
    #   - 'main'

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Verify tag is on main branch
        run: |
          TAG_NAME=${GITHUB_REF#refs/tags/}
          COMMIT=$(git rev-parse $TAG_NAME)
          if ! git branch --contains $COMMIT | grep -qw main; then
            echo "::error::Tag $TAG_NAME must be created from main branch"
            exit 1
          fi
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build twine
      - name: Build and publish
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          python -m build
          twine upload dist/*
.github/workflows/publish_testpypi.yaml
ADDED
@@ -0,0 +1,107 @@
name: Publish to Test PyPI

on:
  push:
    tags:
      - 'test-v*'
    branches:
      - 'f/pypi_release'

jobs:
  deploy_testpypi:
    #if: true
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0 # Required for full commit history check
      - name: Verify tag is on dev branch
        run: |
          TAG_NAME=${GITHUB_REF#refs/tags/}
          COMMIT=$(git rev-parse $TAG_NAME)
          if ! git branch --contains $COMMIT | grep -qw dev; then
            echo "::error::Tag $TAG_NAME must be created from dev branch"
            exit 1
          fi
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build twine
      - name: Build and publish
        env:
          #TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
          #ACTIONS_STEP_DEBUG: true
        run: |
          # echo "TWINE_PASSWORD first 5 chars: ${TWINE_PASSWORD:0:184}"
          # echo "TWINE_PASSWORD length: ${#TWINE_PASSWORD}"
          python -m build
          twine upload --verbose --repository-url https://test.pypi.org/legacy/ dist/*

  test-colab:
    needs: deploy_testpypi
    runs-on: ubuntu-latest
    # a Public "Colab-like" Image
    container:
      image: jupyter/minimal-notebook:latest
      options: --user root # Run as root to avoid permission issues
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v3
        with:
          sparse-checkout: |
            tests/*
            examples/data_factory_release_check.ipynb
          sparse-checkout-cone-mode: false
      - name: Update system packages
        run: |
          apt-get update
          apt-get install -y libssl3 # Removed sudo since we're running as root
      - name: Print Python and Jupyter versions
        run: |
          python --version
          pip list | grep -E 'jupyter|ipykernel|nbconvert|notebook'
      # Authenticate to GCP
      # - name: Authenticate to GCP
      #   uses: google-github-actions/auth@v1
      #   with:
      #     credentials_json: ${{ secrets.GCP_SA_KEY }}

      # # Configure Docker to use GCR credentials
      # - name: Configure Docker for GCR
      #   uses: google-github-actions/docker-auth@v1

      # # Now you can pull the image
      # - name: Use Colab base image
      #   run: docker pull gcr.io/colab-images/base:latest

      # --no-prompt --no-input \ suppress the output
      - name: Run Colab-style tests
        run: |
          if ! jupyter nbconvert --execute --to notebook --inplace \
            --ExecutePreprocessor.kernel_name=python3 \
            --ExecutePreprocessor.timeout=120 \
            --no-prompt --no-input \
            --stdout \
            examples/data_factory_release_check.ipynb; then
            echo "::error::Notebook execution failed"
            exit 1
          fi
          echo "Notebook executed successfully. Summary:" && \
          jupyter nbconvert --to markdown --stdout \
            examples/data_factory_release_check.ipynb | \
            grep -E '^#|^##' || true

      # Add tag deletion step
      - name: Delete triggering tag after successful test
        if: startsWith(github.ref, 'refs/tags/test-v')
        run: |
          gh api -X DELETE /repos/$GITHUB_REPOSITORY/git/refs/tags/${GITHUB_REF#refs/tags/}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.gitignore
ADDED
@@ -0,0 +1,190 @@
# Adhoc stuff
web/node_modules/
web/.next/
web/public/
web/dist/
web/build/
web/out/
web/coverage/
web/logs/
web/.local/
web/.env

.serena/
docs/
/vibe_coding/response.md
/dev/
todo
.local/
.vscode/
db/
.ruff_cache/
data_factory_output/
examples/test_jupyter.ipynb
# *.ipynb
# .ipynb_checkpoints
.cursor

.DS_Store
*/.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.gitmodules
ADDED
@@ -0,0 +1,6 @@
[submodule "internal"]
	path = internal
	url = https://github.com/starfishdata/starfish_internal.git
[submodule "docs_mintlify"]
	path = docs_mintlify
	url = https://github.com/starfishdata/docs.git
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,25 @@
repos:
  # - repo: local
  #   hooks:
  #     - id: pytest
  #       name: Run pytest
  #       entry: poetry run pytest tests/
  #       language: system
  #       types: [python]
  #       pass_filenames: false
  #       always_run: true

  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.8.6
    hooks:
      # Run the linter.
      # - id: ruff
      #   #args: [ --fix ]
      #   types: [python]
      # Run the formatter.
      - id: ruff-format
        #args: [ --fix ]
        #run even when no Python files are staged
        #always_run: true
        types: [python]
Dockerfile
ADDED
@@ -0,0 +1,79 @@
# Multi-stage build for combined frontend + backend
FROM node:18-alpine AS frontend-builder

WORKDIR /app

# Copy package files
COPY web/package*.json ./

# Install dependencies
RUN npm ci

# Copy frontend code and build
COPY web/ ./

# Clean up unnecessary files
RUN rm -rf api/ || true
RUN rm -rf storage/ || true
RUN rm -rf .git/ || true
RUN rm -rf .next/ || true
RUN rm -rf .local/ || true

# Build frontend
RUN npm run build

# Backend stage
FROM python:3.11-slim

# Install system dependencies
RUN apt-get update && apt-get install -y \
    nginx \
    supervisor \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install Node.js for combined container
RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
    apt-get install -y nodejs

WORKDIR /app

# Copy pyproject.toml and poetry.lock
COPY pyproject.toml poetry.lock ./

# Install Poetry and basic dependencies (skip heavy ML packages for testing)
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir poetry \
    && poetry config virtualenvs.create false \
    && poetry install --only=main --no-root || pip install fastapi uvicorn python-dotenv pydantic

# Copy starfish source code and README (needed by backend)
COPY src/ ./src/
COPY README.md ./

# Copy built frontend from previous stage
COPY --from=frontend-builder /app/.next ./web/.next
COPY --from=frontend-builder /app/public ./web/public
COPY --from=frontend-builder /app/package.json ./web/package.json
#COPY --from=frontend-builder /app/node_modules ./web/node_modules

# Copy backend API code
COPY web/api/ ./web/api/

# Copy configuration files
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
COPY nginx.conf /etc/nginx/nginx.conf

# Create necessary directories and set permissions
RUN mkdir -p /var/log/supervisor /var/log/nginx /var/run \
    && chmod +x /app/src/ || true

# Expose port 7860 (required for Hugging Face Spaces)
EXPOSE 7860

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Start supervisor which manages both nginx and the applications
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
LICENSE
ADDED
@@ -0,0 +1,202 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2025 Starfish AI Inc.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
Makefile
ADDED
@@ -0,0 +1,28 @@
lint:
	@echo "Running Linter (Ruff)..."
	poetry run isort tests/ src/ examples --check-only || poetry run isort tests/ src/ examples
	poetry run ruff check src examples --fix --unsafe-fixes --exit-zero
	poetry run ruff format src examples --check || poetry run ruff format src examples
docstring:
	ruff check --select D src/starfish/data_factory
test:
	poetry run pytest tests/

install: install-extras

#poetry install --extras "code_execution vllm" --with dev
# Install with specific extras
#make install EXTRAS="pdf"
# Install all extras
#make install EXTRAS="all"
# Install without extras (default)
#make install
install-extras:
	@echo "Installing dependencies with extras: $(EXTRAS)"
	poetry install $(if $(EXTRAS),--extras "$(EXTRAS)",) --with dev

start-client_claude:
	python src/starfish/data_mcp/client_claude.py src/starfish/data_mcp/server.py

start-client_openai:
	python src/starfish/data_mcp/client_openai.py
README.HuggingFace.md
ADDED
@@ -0,0 +1,177 @@
# Hugging Face Spaces Deployment

This guide explains how to deploy your combined FastAPI backend and Next.js frontend to Hugging Face Spaces.

> ✅ **Build Status**: Docker build is working successfully with resolved path alias issues!

## Overview

The `Dockerfile.huggingface` creates a single container that runs:
- **FastAPI backend** on port 8002
- **Next.js frontend** on port 3000
- **Nginx reverse proxy** on port 7860 (required by Hugging Face Spaces)
- **Supervisor** to manage all processes

## Files for Hugging Face Spaces

1. **`Dockerfile`** - Combined Dockerfile for both services (multi-stage build)
2. **`nginx.conf`** - Nginx configuration for routing
3. **`supervisord.conf`** - Process manager configuration
4. **`.dockerignore`** - Optimized to exclude only necessary files
5. **`next.config.js`** - Enhanced with webpack path alias configuration
6. **`tsconfig.json`** - Updated with explicit path mappings

## Deployment Steps

### 1. Prepare Your Repository

Your repository is already configured with the correct `Dockerfile` for Hugging Face Spaces deployment.

### 2. Set Environment Variables in Hugging Face Spaces

In your Hugging Face Space settings, add these secrets:
- `GOOGLE_API_KEY` - Your Google API key
- `OPENAI_API_KEY` - Your OpenAI API key

### 3. Configure Your Space

- **Space Type**: Docker
- **Visibility**: Public or Private (your choice)
- **Hardware**: CPU Basic (or upgrade if needed)

### 4. Update API URLs in Frontend

Make sure your frontend points to the correct API endpoints:
```typescript
// In your frontend code, use relative URLs:
const API_BASE_URL = "/api" // This goes to Next.js API routes in src/app/api/

// Next.js API routes will then proxy to FastAPI using:
// SERVER_BASE_URL=http://localhost:8002 (set in Dockerfile)
```

### 5. Deploy

1. Push your code to the Hugging Face Space repository
2. The space will automatically build and deploy

## How It Works

### Architecture
```
External Request :7860
        ↓
   Nginx Proxy
        ↓
Next.js :3000 (handles ALL routes)
        ↓
/api/* → src/app/api/ routes
        ↓
proxy.ts uses SERVER_BASE_URL
        ↓
FastAPI Backend :8002
```

### Port Mapping
- **7860** - Main port (required by Hugging Face Spaces)
- **3000** - Next.js frontend (internal) - handles all routing
- **8002** - FastAPI backend (internal) - accessed via Next.js proxy

### URL Routing
- `/` - Next.js frontend (all routes handled by Next.js)
- `/api/*` - Next.js API routes (in `src/app/api/`) that proxy to the FastAPI backend
- `/backend-docs` - Direct FastAPI documentation (for debugging)
- `/backend-openapi.json` - Direct FastAPI OpenAPI schema (for debugging)

### Process Management
Supervisor manages three processes:
1. **backend** - FastAPI server (port 8002)
2. **frontend** - Next.js server (port 3000) - handles all routing and proxying
3. **nginx** - Reverse proxy (port 7860) - routes all traffic to Next.js

## Troubleshooting

### Common Issues

1. **Build fails with "Module not found: Can't resolve '@/lib/utils'"**
   - **FIXED**: This was caused by `lib/` being excluded in `.dockerignore`
   - The issue has been resolved by removing the `lib/` exclusion pattern

2. **Build fails during npm install**
   - Check that all package.json dependencies are valid
   - Ensure Node.js version compatibility

3. **FastAPI fails to start**
   - Check environment variables are set
   - Verify the starfish package is properly configured
   - Check logs in the Space's logs tab

4. **Frontend can't reach backend**
   - Ensure API calls use relative URLs (`/api/...`)
   - Check that `SERVER_BASE_URL=http://localhost:8002` is set in the Dockerfile
   - Verify Next.js API routes in `src/app/api/` are proxying correctly
   - For direct FastAPI access, use `/backend-docs` instead of `/docs`

5. **Space shows "Application starting" indefinitely**
   - Check supervisor logs for errors
   - Verify all services are starting properly

### Viewing Logs

In your Hugging Face Space:
1. Go to the "Logs" tab
2. Look for errors from supervisor, nginx, backend, or frontend
3. Logs are also written to `/var/log/` in the container

### Local Testing

Test the Hugging Face build locally:
```bash
# Build the image
docker build -t starfishai-web .

# Run with environment variables
docker run -p 7860:7860 -p 3000:3000 -p 8002:8002 \
  -e GOOGLE_API_KEY=your_key \
  -e OPENAI_API_KEY=your_key \
  starfishai-web
```

Then visit:
- http://localhost:7860 - Main application
- http://localhost:7860/backend-docs - Direct FastAPI documentation
- http://localhost:7860/backend-openapi.json - Direct FastAPI schema

## Recent Fixes & Improvements

### Path Alias Resolution Fixed
- **Issue**: Build was failing with `Module not found: Can't resolve '@/lib/utils'`
- **Root Cause**: The `.dockerignore` file was excluding the `lib/` directory
- **Solution**: Removed `lib/` from `.dockerignore` and enhanced path configuration
- **Files Updated**:
  - `.dockerignore` - Removed generic `lib/` exclusion
  - `next.config.js` - Added explicit webpack path aliases
  - `tsconfig.json` - Enhanced path mappings

### Docker Build Optimization
- **Multi-stage build** for optimal image size
- **Specific Python exclusions** in `.dockerignore` (e.g., `api/__pycache__/` instead of all `__pycache__/`)
- **Enhanced file copying strategy** during build

## Performance Tips

1. **Use CPU Basic** for development, upgrade for production
2. **Optimize Docker image** by removing unnecessary files
3. **Use caching** for build dependencies
4. **Monitor resource usage** in the Space dashboard

## Security Notes

- Never commit API keys to your repository
- Use Hugging Face Spaces secrets for sensitive environment variables
- Consider making your Space private if it contains sensitive data
- Regularly update dependencies for security patches

docker run -d -p 7860:7860 --name starfish-app -v $(pwd)/nginx.conf:/etc/nginx/nginx.conf -v $(pwd)/supervisord.conf:/etc/supervisor/conf.d/supervisord.conf starfish-app

docker build -t starfish-app .
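
For local verification, the endpoints listed under "Local Testing" can also be probed from a small script once the container is running. The sketch below assumes only the Python standard library and the URLs documented above; the `/health` path comes from the Dockerfile's HEALTHCHECK and is not otherwise described in this guide.

```python
# Hypothetical smoke test for a locally running container (not part of this commit).
# Probes the endpoints documented in README.HuggingFace.md using only the stdlib.
from urllib.error import URLError
from urllib.request import urlopen

ENDPOINTS = [
    "http://localhost:7860/health",                # health check used by the Dockerfile
    "http://localhost:7860/",                      # Next.js frontend via nginx
    "http://localhost:7860/backend-docs",          # direct FastAPI docs
    "http://localhost:7860/backend-openapi.json",  # direct FastAPI schema
]

for url in ENDPOINTS:
    try:
        with urlopen(url, timeout=10) as resp:
            print(f"{url} -> HTTP {resp.status}")
    except URLError as exc:
        print(f"{url} -> FAILED ({exc})")
```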
README.md
ADDED
@@ -0,0 +1,193 @@
---
title: Starfish - Synthetic Data Generation
emoji: 🌟
colorFrom: pink
colorTo: blue
sdk: docker
sdk_version: "4.36.0"
---

<p align="center">
  <img src="https://github.com/user-attachments/assets/744c666a-bb5c-418b-aab4-162072c0b8c8" alt="Starfish Logo" width="200"/>
</p>
<h1 align="center">Starfish</h1>
<h3 align="center" style="font-size: 20px; margin-bottom: 4px">Synthetic Data Generation Made Easy</h3>
</br>

<div align="center">

[](https://github.com/starfishdata/starfish) [](https://x.com/starfishdata) [](https://huggingface.co/starfishdata) [](https://discord.gg/qWKmeUtb)
<br>
[](https://starfishdata.ai/)
[](https://deepwiki.com/starfishdata/starfish/1-overview)
</div>

## Overview

Starfish is a Python library that helps you build synthetic data your way. We adapt to your workflow—not the other way around. By combining structured LLM outputs with efficient parallel processing, Starfish lets you define exactly how your data should look and scale seamlessly from experiments to production.

⭐ Star us on GitHub if you find this project useful!

Key Features:
- **Structured Outputs**: First-class support for structured data through JSON schemas or Pydantic models.
- **Model Flexibility**: Use any LLM provider—local models, OpenAI, Anthropic, or your own implementation via LiteLLM.
- **Dynamic Prompts**: Dynamic prompts with built-in Jinja2 templates.
- **Easy Scaling**: Transform any function to run in parallel across thousands of inputs with a single decorator.
- **Resilient Pipeline**: Automatic retries, error handling, and job resumption—pause and continue your data generation anytime.
- **Complete Control**: Share state across your pipeline, extend functionality with custom hooks.

**Official Website**: [starfishdata.ai](https://starfishdata.ai/) - We offer both self-service and managed solutions. Visit our website to explore our services or contact us for more options!

## Installation

```bash
pip install starfish-core
```

### Optional Dependencies

Starfish supports optional dependencies for specific file parsers. Install only what you need:

```bash
# Install specific parsers
pip install "starfish-core[pdf]"      # PDF support
pip install "starfish-core[docx]"     # Word document support
pip install "starfish-core[ppt]"      # PowerPoint support
pip install "starfish-core[excel]"    # Excel support
pip install "starfish-core[youtube]"  # YouTube support

# Install all parser dependencies
pip install "starfish-core[all]"
```

## Configuration

Starfish uses environment variables for configuration. We provide a `.env.template` file to help you get started quickly:

```bash
# Copy the template to .env
cp .env.template .env

# Edit with your API keys and configuration
nano .env  # or use your preferred editor
```

The template includes settings for API keys, model configurations, and other runtime parameters.

## Quick Start

### Structured LLM - Type-Safe Outputs from Any Model

```python
# 1. Define structured outputs with schema
from starfish import StructuredLLM
from pydantic import BaseModel

# Option A: Use Pydantic for type safety
class QnASchema(BaseModel):
    question: str
    answer: str

# Option B: Or use simple JSON schema
json_schema = [
    {'name': 'question', 'type': 'str'},
    {'name': 'answer', 'type': 'str'},
]

# 2. Create a structured LLM with your preferred output format
qna_llm = StructuredLLM(
    model_name="openai/gpt-4o-mini",
    prompt="Generate facts about {{city}}",
    output_schema=QnASchema  # or json_schema
)

# 3. Get structured responses
response = await qna_llm.run(city="San Francisco")

# Access typed data
print(response.data)
# [{'question': 'What is the iconic symbol of San Francisco?',
#   'answer': 'The Golden Gate Bridge is the iconic symbol of San Francisco, completed in 1937.'}]

# Access raw API response for complete flexibility
print(response.raw)  # Full API object with function calls, reasoning tokens, etc.
```

### Data Factory - Scale Any Workflow with One Decorator

```python
# Turn any function into a scalable data pipeline
from starfish import data_factory

# Works with any function - simple or complex workflows
@data_factory(max_concurrency=50)
async def parallel_qna_llm(city):
    # This could be any arbitrary complex workflow:
    # - Pre-processing
    # - Multiple LLM calls
    # - Post-processing
    # - Error handling
    response = await qna_llm.run(city=city)
    return response.data

# Process 100 cities with 50 concurrent workers - finishes in seconds
cities = ["San Francisco", "New York", "Tokyo", "Paris", "London"] * 20
results = parallel_qna_llm.run(city=cities)

# Dry run to test the workflow and data
results = parallel_qna_llm.dry_run(city=cities)

# Resume a job, picking up from where it left off
results = parallel_qna_llm.resume()
```

### Examples

Check out our example notebooks for detailed walkthroughs:
- [Structured LLM Examples](examples/structured_llm.ipynb)
- [Data Factory Examples](examples/data_factory.ipynb)

## Documentation

Comprehensive documentation is on the way!

## Contributing

We'd love your help making Starfish better! Whether you're fixing bugs, adding features, or improving documentation, your contributions are welcome.

1. Fork the repository
2. Create your feature branch (`git checkout -b feature/amazing-feature`)
3. Commit your changes (`git commit -m 'Add some amazing feature'`)
4. Push to the branch (`git push origin feature/amazing-feature`)
5. Open a Pull Request

Contribution guidelines coming soon!

## License

This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.

## Contact

If you have any questions or feedback, feel free to reach out to us at [[email protected]](mailto:[email protected]).

Want to discuss your use case directly? [Schedule a meeting with our team](https://calendly.com/d/crsb-ckq-fv2/chat-with-starfishdata-team).

## Telemetry

Starfish collects minimal and anonymous telemetry data to help improve the library. Participation is optional and you can opt out by setting `TELEMETRY_ENABLED=false` in your environment variables.

## Citation

If you use Starfish in your research, please consider citing us!

```
@software{starfish,
  author = {Wendao, John, Ayush},
  title = {{Starfish: A Tool for Synthetic Data Generation}},
  year = {2025},
  url = {https://github.com/starfishdata/starfish},
}
```
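
The quick-start snippets earlier in this README use notebook-style top-level `await`. As a complement, here is a minimal sketch of the same flow as a standalone script; it reuses only the names documented above (`StructuredLLM`, `data_factory`, `.run`, `.data`) and assumes `starfish-core` is installed and `OPENAI_API_KEY` is set.

```python
# Minimal end-to-end sketch based on the README quick start (an illustration,
# not part of this commit); API names are taken from the examples above.
from starfish import StructuredLLM, data_factory

qna_llm = StructuredLLM(
    model_name="openai/gpt-4o-mini",
    prompt="Generate facts about {{city}}",
    output_schema=[
        {"name": "question", "type": "str"},
        {"name": "answer", "type": "str"},
    ],
)


@data_factory(max_concurrency=10)
async def parallel_qna_llm(city):
    # One LLM call per input city; any pre/post-processing could go here.
    response = await qna_llm.run(city=city)
    return response.data


if __name__ == "__main__":
    # data_factory-wrapped functions are invoked synchronously, as in the README.
    results = parallel_qna_llm.run(city=["San Francisco", "Tokyo", "Paris"])
    print(results)
```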
docs_mintlify
ADDED
@@ -0,0 +1 @@
Subproject commit 6ad0ad5eda1fc3637fde8d0da24f0d3fd4263453
examples/__init__.py
ADDED
File without changes
examples/data_factory.ipynb
ADDED
@@ -0,0 +1,681 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"# Google Colab Version: [Open this notebook in Google Colab](https://colab.research.google.com/github/starfishdata/starfish/blob/main/examples/data_factory.ipynb)"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "markdown",
|
12 |
+
"metadata": {},
|
13 |
+
"source": [
|
14 |
+
"#### Dependencies "
|
15 |
+
]
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"cell_type": "code",
|
19 |
+
"execution_count": 11,
|
20 |
+
"metadata": {},
|
21 |
+
"outputs": [
|
22 |
+
{
|
23 |
+
"name": "stdout",
|
24 |
+
"output_type": "stream",
|
25 |
+
"text": [
|
26 |
+
"Requirement already satisfied: starfish-core in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (0.1.0)\n",
|
27 |
+
"Requirement already satisfied: aiofiles<25.0.0,>=24.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (24.1.0)\n",
|
28 |
+
"Requirement already satisfied: aiosqlite<0.22.0,>=0.21.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.21.0)\n",
|
29 |
+
"Requirement already satisfied: cachetools<6.0.0,>=5.5.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (5.5.2)\n",
|
30 |
+
"Requirement already satisfied: litellm<2.0.0,>=1.65.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (1.65.1)\n",
|
31 |
+
"Requirement already satisfied: loguru<0.8.0,>=0.7.3 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.7.3)\n",
|
32 |
+
"Requirement already satisfied: ollama<0.5.0,>=0.4.7 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.4.7)\n",
|
33 |
+
"Requirement already satisfied: platformdirs<5.0.0,>=4.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (4.3.7)\n",
|
34 |
+
"Requirement already satisfied: psutil<8.0.0,>=7.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (7.0.0)\n",
|
35 |
+
"Requirement already satisfied: python-dotenv<2.0.0,>=1.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (1.1.0)\n",
|
36 |
+
"Requirement already satisfied: typing-extensions<5.0.0,>=4.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (4.13.0)\n",
|
37 |
+
"Requirement already satisfied: aiohttp in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.11.16)\n",
|
38 |
+
"Requirement already satisfied: click in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.1.8)\n",
|
39 |
+
"Requirement already satisfied: httpx>=0.23.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.28.1)\n",
|
40 |
+
"Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.6.1)\n",
|
41 |
+
"Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.1.6)\n",
|
42 |
+
"Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (4.23.0)\n",
|
43 |
+
"Requirement already satisfied: openai>=1.68.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (1.70.0)\n",
|
44 |
+
"Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (2.11.1)\n",
|
45 |
+
"Requirement already satisfied: tiktoken>=0.7.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n",
|
46 |
+
"Requirement already satisfied: tokenizers in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.21.1)\n",
|
47 |
+
"Requirement already satisfied: anyio in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (4.9.0)\n",
|
48 |
+
"Requirement already satisfied: certifi in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (2025.1.31)\n",
|
49 |
+
"Requirement already satisfied: httpcore==1.* in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (1.0.7)\n",
|
50 |
+
"Requirement already satisfied: idna in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.10)\n",
|
51 |
+
"Requirement already satisfied: h11<0.15,>=0.13 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.14.0)\n",
|
52 |
+
"Requirement already satisfied: zipp>=3.20 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from importlib-metadata>=6.8.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.21.0)\n",
|
53 |
+
"Requirement already satisfied: MarkupSafe>=2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jinja2<4.0.0,>=3.1.2->litellm<2.0.0,>=1.65.1->starfish-core) (3.0.2)\n",
|
54 |
+
"Requirement already satisfied: attrs>=22.2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (25.3.0)\n",
|
55 |
+
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (2024.10.1)\n",
|
56 |
+
"Requirement already satisfied: referencing>=0.28.4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.36.2)\n",
|
57 |
+
"Requirement already satisfied: rpds-py>=0.7.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.24.0)\n",
|
58 |
+
"Requirement already satisfied: distro<2,>=1.7.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (1.9.0)\n",
|
59 |
+
"Requirement already satisfied: jiter<1,>=0.4.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n",
|
60 |
+
"Requirement already satisfied: sniffio in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (1.3.1)\n",
|
61 |
+
"Requirement already satisfied: tqdm>4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (4.67.1)\n",
|
62 |
+
"Requirement already satisfied: annotated-types>=0.6.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.7.0)\n",
|
63 |
+
"Requirement already satisfied: pydantic-core==2.33.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.33.0)\n",
|
64 |
+
"Requirement already satisfied: typing-inspection>=0.4.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.4.0)\n",
|
65 |
+
"Requirement already satisfied: regex>=2022.1.18 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2024.11.6)\n",
|
66 |
+
"Requirement already satisfied: requests>=2.26.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.32.3)\n",
|
67 |
+
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (2.6.1)\n",
|
68 |
+
"Requirement already satisfied: aiosignal>=1.1.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.3.2)\n",
|
69 |
+
"Requirement already satisfied: frozenlist>=1.1.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.5.0)\n",
|
70 |
+
"Requirement already satisfied: multidict<7.0,>=4.5 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (6.3.1)\n",
|
71 |
+
"Requirement already satisfied: propcache>=0.2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (0.3.1)\n",
|
72 |
+
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.18.3)\n",
|
73 |
+
"Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (0.30.1)\n",
|
74 |
+
"Requirement already satisfied: filelock in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (3.18.0)\n",
|
75 |
+
"Requirement already satisfied: fsspec>=2023.5.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (2025.3.2)\n",
|
76 |
+
"Requirement already satisfied: packaging>=20.9 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (24.2)\n",
|
77 |
+
"Requirement already satisfied: pyyaml>=5.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (6.0.2)\n",
|
78 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.4.1)\n",
|
79 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.3.0)\n",
|
80 |
+
"Note: you may need to restart the kernel to use updated packages.\n"
|
81 |
+
]
|
82 |
+
}
|
83 |
+
],
|
84 |
+
"source": [
|
85 |
+
"%pip install starfish-core"
|
86 |
+
]
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"cell_type": "code",
|
90 |
+
"execution_count": 1,
|
91 |
+
"metadata": {},
|
92 |
+
"outputs": [],
|
93 |
+
"source": [
|
94 |
+
"## Fix for Jupyter Notebook only — do NOT use in production\n",
|
95 |
+
"## Enables async code execution in notebooks, but may cause issues with sync/async issues\n",
|
96 |
+
"## For production, please run in standard .py files without this workaround\n",
|
97 |
+
"## See: https://github.com/erdewit/nest_asyncio for more details\n",
|
98 |
+
"import nest_asyncio\n",
|
99 |
+
"nest_asyncio.apply()\n",
|
100 |
+
"\n",
|
101 |
+
"from starfish import StructuredLLM, data_factory\n",
|
102 |
+
"from starfish.llm.utils import merge_structured_outputs\n",
|
103 |
+
"\n",
|
104 |
+
"from starfish.common.env_loader import load_env_file ## Load environment variables from .env file\n",
|
105 |
+
"load_env_file()"
|
106 |
+
]
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"cell_type": "code",
|
110 |
+
"execution_count": 2,
|
111 |
+
"metadata": {},
|
112 |
+
"outputs": [],
|
113 |
+
"source": [
|
114 |
+
"# setup your openai api key if not already set\n",
|
115 |
+
"# import os\n",
|
116 |
+
"# os.environ[\"OPENAI_API_KEY\"] = \"your_key_here\"\n",
|
117 |
+
"\n",
|
118 |
+
"# If you dont have any API key, please navigate to local model section"
|
119 |
+
]
|
120 |
+
},
|
121 |
+
{
|
122 |
+
"cell_type": "code",
|
123 |
+
"execution_count": 3,
|
124 |
+
"metadata": {},
|
125 |
+
"outputs": [],
|
126 |
+
"source": [
|
127 |
+
"## Helper function mock llm call\n",
|
128 |
+
"# When developing data pipelines with LLMs, making thousands of real API calls\n",
|
129 |
+
"# can be expensive. Using mock LLM calls lets you test your pipeline's reliability,\n",
|
130 |
+
"# failure handling, and recovery without spending money on API calls.\n",
|
131 |
+
"from starfish.data_factory.utils.mock import mock_llm_call"
|
132 |
+
]
|
133 |
+
},
|
134 |
+
{
|
135 |
+
"cell_type": "markdown",
|
136 |
+
"metadata": {},
|
137 |
+
"source": [
|
138 |
+
"#### 1. Your First Data Factory: Simple Scaling\n",
|
139 |
+
"\n",
|
140 |
+
"The @data_factory decorator transforms any async function into a scalable data processing pipeline.\n",
|
141 |
+
"It handles:\n",
|
142 |
+
"- Parallel execution \n",
|
143 |
+
"- Automatic batching\n",
|
144 |
+
"- Error handling & retries\n",
|
145 |
+
"- Progress tracking\n",
|
146 |
+
"\n",
|
147 |
+
"Let's start with a single LLM call and then show how easy it is to scale it.\n"
|
148 |
+
]
|
149 |
+
},
|
150 |
+
{
|
151 |
+
"cell_type": "code",
|
152 |
+
"execution_count": 4,
|
153 |
+
"metadata": {},
|
154 |
+
"outputs": [
|
155 |
+
{
|
156 |
+
"data": {
|
157 |
+
"text/plain": [
|
158 |
+
"[{'fact': 'New Yorkers consume around 1,000,000 slices of pizza every day, which means if you laid them all in a line, they would stretch from the Statue of Liberty to the Eiffel Tower... and back!'}]"
|
159 |
+
]
|
160 |
+
},
|
161 |
+
"execution_count": 4,
|
162 |
+
"metadata": {},
|
163 |
+
"output_type": "execute_result"
|
164 |
+
}
|
165 |
+
],
|
166 |
+
"source": [
|
167 |
+
"# First, create a StructuredLLM instance for generating facts about cities\n",
|
168 |
+
"json_llm = StructuredLLM(\n",
|
169 |
+
" model_name = \"openai/gpt-4o-mini\",\n",
|
170 |
+
" prompt = \"Funny facts about city {{city_name}}.\",\n",
|
171 |
+
" output_schema = [{'name': 'fact', 'type': 'str'}],\n",
|
172 |
+
" model_kwargs = {\"temperature\": 0.7},\n",
|
173 |
+
")\n",
|
174 |
+
"\n",
|
175 |
+
"json_llm_response = await json_llm.run(city_name='New York')\n",
|
176 |
+
"json_llm_response.data"
|
177 |
+
]
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"cell_type": "code",
|
181 |
+
"execution_count": 5,
|
182 |
+
"metadata": {},
|
183 |
+
"outputs": [
|
184 |
+
{
|
185 |
+
"name": "stdout",
|
186 |
+
"output_type": "stream",
|
187 |
+
"text": [
|
188 |
+
"\u001b[32m2025-04-25 10:16:32\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 8c926411-63e7-4dc6-98c9-861c3489fb8b\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
189 |
+
"\u001b[32m2025-04-25 10:16:32\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
190 |
+
"Processing New York at 2025-04-25 10:16:32.524033\n",
|
191 |
+
"Processing London at 2025-04-25 10:16:32.524286\n",
|
192 |
+
"Processing Tokyo at 2025-04-25 10:16:32.524979\n",
|
193 |
+
"Processing Paris at 2025-04-25 10:16:32.525535\n",
|
194 |
+
"Processing Sydney at 2025-04-25 10:16:32.526729\n",
|
195 |
+
"\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
|
196 |
+
"\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
|
197 |
+
]
|
198 |
+
},
|
199 |
+
{
|
200 |
+
"data": {
|
201 |
+
"text/plain": [
|
202 |
+
"[{'fact': \"In Tokyo, there's a train station called 'Shinjuku' that handles more passengers each day than the entire population of the United States!\"},\n",
|
203 |
+
" {'fact': \"London has a 'secret' underground city known as the 'London Stone', which is said to have magical powers, making it one of the city's most famous and quirky legends!\"},\n",
|
204 |
+
" {'fact': 'In Paris, you can legally marry a dead person! This quirky law allows for posthumous marriages, as long as you can prove that the deceased had intended to marry you before their untimely demise.'},\n",
|
205 |
+
" {'fact': 'In New York City, there are more than 25,000 licensed taxis, but only about 1,200 of them are actually yellow. The rest are a rainbow of colors, including pink, blue, and even animal print!'},\n",
|
206 |
+
" {'fact': 'Sydney has a beach where you can surf, swim, and even watch a film – all in one day! Just don’t forget your sunscreen and popcorn!'}]"
|
207 |
+
]
|
208 |
+
},
|
209 |
+
"execution_count": 5,
|
210 |
+
"metadata": {},
|
211 |
+
"output_type": "execute_result"
|
212 |
+
}
|
213 |
+
],
|
214 |
+
"source": [
|
215 |
+
"# Now, scale to multiple cities using data_factory\n",
|
216 |
+
"# Just add the @data_factory decorator to process many cities in parallel\n",
|
217 |
+
"\n",
|
218 |
+
"from datetime import datetime\n",
|
219 |
+
"@data_factory(max_concurrency=10)\n",
|
220 |
+
"async def process_json_llm(city_name: str):\n",
|
221 |
+
" ## Adding a print statement to indicate the start of the processing\n",
|
222 |
+
" print(f\"Processing {city_name} at {datetime.now()}\")\n",
|
223 |
+
" json_llm_response = await json_llm.run(city_name=city_name)\n",
|
224 |
+
" return json_llm_response.data\n",
|
225 |
+
"\n",
|
226 |
+
"# This is all it takes to scale from one city to many cities!\n",
|
227 |
+
"process_json_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"])"
|
228 |
+
]
|
229 |
+
},
|
230 |
+
{
|
231 |
+
"cell_type": "markdown",
|
232 |
+
"metadata": {},
|
233 |
+
"source": [
|
234 |
+
"#### 2. Works with any aysnc function\n",
|
235 |
+
"\n",
|
236 |
+
"Data Factory works with any async function, not just LLM calls, you can build complex pipelines involving multiple LLMs, data processing, etc.\n",
|
237 |
+
"\n",
|
238 |
+
"Here is example of two chained structured llm"
|
239 |
+
]
|
240 |
+
},
|
241 |
+
{
|
242 |
+
"cell_type": "code",
|
243 |
+
"execution_count": 6,
|
244 |
+
"metadata": {},
|
245 |
+
"outputs": [
|
246 |
+
{
|
247 |
+
"name": "stdout",
|
248 |
+
"output_type": "stream",
|
249 |
+
"text": [
|
250 |
+
"\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 466fca03-85a2-46de-b135-629cd76738f7\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
251 |
+
"\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
252 |
+
"\u001b[32m2025-04-25 10:16:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
253 |
+
"\u001b[32m2025-04-25 10:16:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
254 |
+
"\u001b[32m2025-04-25 10:16:43\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 2/3\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 2\u001b[0m (\u001b[32mCompleted: 2\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
255 |
+
"\u001b[32m2025-04-25 10:16:44\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
|
256 |
+
"\u001b[32m2025-04-25 10:16:44\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 3/3\u001b[0m | \u001b[33mAttempted: 3\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
|
257 |
+
]
|
258 |
+
}
|
259 |
+
],
|
260 |
+
"source": [
|
261 |
+
"# Example of a more complex function that chains multiple LLM calls\n",
|
262 |
+
"# This was grabbed from structured llm examples \n",
|
263 |
+
"\n",
|
264 |
+
"@data_factory(max_concurrency=10)\n",
|
265 |
+
"async def complex_process_cities(topic: str):\n",
|
266 |
+
" ## topic → generator_llm → rating_llm → merged results\n",
|
267 |
+
" # First LLM to generate question/answer pairs\n",
|
268 |
+
" generator_llm = StructuredLLM(\n",
|
269 |
+
" model_name=\"openai/gpt-4o-mini\",\n",
|
270 |
+
" prompt=\"Generate question/answer pairs about {{topic}}.\",\n",
|
271 |
+
" output_schema=[\n",
|
272 |
+
" {\"name\": \"question\", \"type\": \"str\"},\n",
|
273 |
+
" {\"name\": \"answer\", \"type\": \"str\"}\n",
|
274 |
+
" ],\n",
|
275 |
+
" )\n",
|
276 |
+
"\n",
|
277 |
+
" # Second LLM to rate the generated pairs\n",
|
278 |
+
" rater_llm = StructuredLLM(\n",
|
279 |
+
" model_name=\"openai/gpt-4o-mini\",\n",
|
280 |
+
" prompt='''Rate the following Q&A pairs based on accuracy and clarity (1-10).\n",
|
281 |
+
" Pairs: {{generated_pairs}}''',\n",
|
282 |
+
" output_schema=[\n",
|
283 |
+
" {\"name\": \"accuracy_rating\", \"type\": \"int\"},\n",
|
284 |
+
" {\"name\": \"clarity_rating\", \"type\": \"int\"}\n",
|
285 |
+
" ],\n",
|
286 |
+
" model_kwargs={\"temperature\": 0.5}\n",
|
287 |
+
")\n",
|
288 |
+
"\n",
|
289 |
+
" generation_response = await generator_llm.run(topic=topic, num_records=5)\n",
|
290 |
+
" rating_response = await rater_llm.run(generated_pairs=generation_response.data)\n",
|
291 |
+
" \n",
|
292 |
+
" # Merge the results\n",
|
293 |
+
" return merge_structured_outputs(generation_response.data, rating_response.data)\n",
|
294 |
+
"\n",
|
295 |
+
"\n",
|
296 |
+
"### To save on token here we only use 3 topics as example\n",
|
297 |
+
"complex_process_cities_data = complex_process_cities.run(topic=['Science', 'History', 'Technology'])"
|
298 |
+
]
|
299 |
+
},
|
300 |
+
{
|
301 |
+
"cell_type": "code",
|
302 |
+
"execution_count": 7,
|
303 |
+
"metadata": {},
|
304 |
+
"outputs": [
|
305 |
+
{
|
306 |
+
"name": "stdout",
|
307 |
+
"output_type": "stream",
|
308 |
+
"text": [
|
309 |
+
"15\n",
|
310 |
+
"[{'question': 'What is the primary function of a CPU in a computer?', 'answer': 'The CPU, or Central Processing Unit, is responsible for executing instructions and processing data in a computer system.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What does IoT stand for and what is its significance?', 'answer': 'IoT stands for Internet of Things, which refers to the interconnection of everyday devices to the internet, allowing them to send and receive data, thereby enhancing efficiency and convenience.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the difference between RAM and ROM?', 'answer': 'RAM (Random Access Memory) is volatile memory that temporarily stores data and applications currently in use, while ROM (Read-Only Memory) is non-volatile memory that permanently stores firmware and system software.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is cloud computing?', 'answer': 'Cloud computing is the delivery of computing services over the internet, enabling users to access and store data and applications on remote servers rather than on local computers.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What are the benefits of using artificial intelligence in business?', 'answer': 'Artificial intelligence can enhance efficiency, improve decision-making, personalize customer experiences, automate repetitive tasks, and generate insights from data analytics in business operations.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the chemical formula for water?', 'answer': 'The chemical formula for water is H2O.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the process by which plants make their own food?', 'answer': 'The process by which plants make their own food is called photosynthesis.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the speed of light in a vacuum?', 'answer': 'The speed of light in a vacuum is approximately 299,792 kilometers per second (or about 186,282 miles per second).', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the smallest unit of life?', 'answer': 'The smallest unit of life is the cell.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What gas do living organisms need for respiration?', 'answer': 'Living organisms need oxygen for respiration.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What was the primary cause of World War I?', 'answer': 'The primary cause of World War I was the complex system of alliances, militarism, imperialism, and nationalism, which escalated tensions following the assassination of Archduke Franz Ferdinand of Austria in 1914.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'Who was the first President of the United States?', 'answer': 'George Washington was the first President of the United States, serving from April 30, 1789, to March 4, 1797.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What year did the Berlin Wall fall?', 'answer': 'The Berlin Wall fell on November 9, 1989, symbolizing the end of the Cold War and the division between East and West Germany.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'Which ancient civilization is known for creating the first known writing system?', 'answer': 'The Sumerians, who inhabited ancient Mesopotamia around 3500 BCE, are known for creating the first known writing system called cuneiform.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 
'What was the significance of the Magna Carta?', 'answer': 'The Magna Carta, signed in 1215, was significant because it limited the power of the monarchy and established the principle that everyone, including the king, was subject to the law.', 'accuracy_rating': 10, 'clarity_rating': 10}]\n"
|
311 |
+
]
|
312 |
+
}
|
313 |
+
],
|
314 |
+
"source": [
|
315 |
+
"### Each topic has 5 question/answer pairs so 3 topics has 15 pairs!\n",
|
316 |
+
"print(len(complex_process_cities_data))\n",
|
317 |
+
"print(complex_process_cities_data)"
|
318 |
+
]
|
319 |
+
},
|
320 |
+
{
|
321 |
+
"cell_type": "markdown",
|
322 |
+
"metadata": {},
|
323 |
+
"source": [
|
324 |
+
"#### 3. Working with Different Input Formats\n",
|
325 |
+
"\n",
|
326 |
+
"\n",
|
327 |
+
"Data Factory is flexible with how you provide inputs. Let's demonstrate different ways to pass parameters to data_factory functions.\n",
|
328 |
+
"\n",
|
329 |
+
"'data' is a reserved keyword expecting list(dict) or tuple(dict) - this design make it super easy to pass large data and support HuggingFace and Pandas dataframe very easily"
|
330 |
+
]
|
331 |
+
},
|
332 |
+
{
|
333 |
+
"cell_type": "code",
|
334 |
+
"execution_count": 8,
|
335 |
+
"metadata": {},
|
336 |
+
"outputs": [
|
337 |
+
{
|
338 |
+
"data": {
|
339 |
+
"text/plain": [
|
340 |
+
"[{'answer': 'New York_5'}, {'answer': 'New York_2'}, {'answer': 'New York_3'}]"
|
341 |
+
]
|
342 |
+
},
|
343 |
+
"execution_count": 8,
|
344 |
+
"metadata": {},
|
345 |
+
"output_type": "execute_result"
|
346 |
+
}
|
347 |
+
],
|
348 |
+
"source": [
|
349 |
+
"## We will be using mock llm call for rest of example to save on token\n",
|
350 |
+
"## Mock LLM call is a function that simulates an LLM API call with random delays (controlled by sleep_time) and occasional failures (controlled by fail_rate)\n",
|
351 |
+
"await mock_llm_call(city_name=\"New York\", num_records_per_city=3)"
|
352 |
+
]
|
353 |
+
},
|
354 |
+
{
|
355 |
+
"cell_type": "code",
|
356 |
+
"execution_count": 9,
|
357 |
+
"metadata": {},
|
358 |
+
"outputs": [],
|
359 |
+
"source": [
|
360 |
+
"@data_factory(max_concurrency=100)\n",
|
361 |
+
"async def input_format_mock_llm(city_name: str, num_records_per_city: int):\n",
|
362 |
+
" return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.01)"
|
363 |
+
]
|
364 |
+
},
|
365 |
+
{
|
366 |
+
"cell_type": "code",
|
367 |
+
"execution_count": 10,
|
368 |
+
"metadata": {},
|
369 |
+
"outputs": [
|
370 |
+
{
|
371 |
+
"name": "stdout",
|
372 |
+
"output_type": "stream",
|
373 |
+
"text": [
|
374 |
+
"\u001b[32m2025-04-25 10:16:49\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 05c84608-fec3-4010-8876-e59eed12bb6a\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
375 |
+
"\u001b[32m2025-04-25 10:16:49\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
376 |
+
"\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
|
377 |
+
"\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
|
378 |
+
]
|
379 |
+
}
|
380 |
+
],
|
381 |
+
"source": [
|
382 |
+
"# Format 1: Multiple lists that get zipped together\n",
|
383 |
+
"input_format_data1 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=[2, 1, 1, 1, 1])"
|
384 |
+
]
|
385 |
+
},
|
386 |
+
{
|
387 |
+
"cell_type": "code",
|
388 |
+
"execution_count": 11,
|
389 |
+
"metadata": {},
|
390 |
+
"outputs": [
|
391 |
+
{
|
392 |
+
"name": "stdout",
|
393 |
+
"output_type": "stream",
|
394 |
+
"text": [
|
395 |
+
"\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: fedb98e5-c408-4bc8-9479-6087f4a298b7\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
396 |
+
"\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
397 |
+
"\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
|
398 |
+
"\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
|
399 |
+
]
|
400 |
+
}
|
401 |
+
],
|
402 |
+
"source": [
|
403 |
+
"# Format 2: List + single value (single value gets broadcasted)\n",
|
404 |
+
"input_format_data2 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=1)"
|
405 |
+
]
|
406 |
+
},
|
407 |
+
{
|
408 |
+
"cell_type": "code",
|
409 |
+
"execution_count": 12,
|
410 |
+
"metadata": {},
|
411 |
+
"outputs": [
|
412 |
+
{
|
413 |
+
"name": "stdout",
|
414 |
+
"output_type": "stream",
|
415 |
+
"text": [
|
416 |
+
"\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 2f5cb7cc-83c9-4b7e-9ebb-386cd66bdd42\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
417 |
+
"\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
418 |
+
"\u001b[32m2025-04-25 10:16:52\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
|
419 |
+
"\u001b[32m2025-04-25 10:16:52\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
|
420 |
+
]
|
421 |
+
}
|
422 |
+
],
|
423 |
+
"source": [
|
424 |
+
"# Format 3: Special 'data' parameter\n",
|
425 |
+
"# 'data' is a reserved keyword expecting list(dict) or tuple(dict)\n",
|
426 |
+
"# Makes integration with various data sources easier\n",
|
427 |
+
"input_format_data3 = input_format_mock_llm.run(data=[{\"city_name\": \"New York\", \"num_records_per_city\": 2}, {\"city_name\": \"London\", \"num_records_per_city\": 1}, {\"city_name\": \"Tokyo\", \"num_records_per_city\": 1}, {\"city_name\": \"Paris\", \"num_records_per_city\": 1}, {\"city_name\": \"Sydney\", \"num_records_per_city\": 1}])"
|
428 |
+
]
|
429 |
+
},
|
430 |
+
{
|
431 |
+
"cell_type": "markdown",
|
432 |
+
"metadata": {},
|
433 |
+
"source": [
|
434 |
+
"#### 4. Resilient error retry\n",
|
435 |
+
"Data Factory automatically handles errors and retries, making your pipelines robust.\n",
|
436 |
+
"\n",
|
437 |
+
"Let's demonstrate with a high failure rate example."
|
438 |
+
]
|
439 |
+
},
|
440 |
+
{
|
441 |
+
"cell_type": "code",
|
442 |
+
"execution_count": 13,
|
443 |
+
"metadata": {},
|
444 |
+
"outputs": [
|
445 |
+
{
|
446 |
+
"name": "stdout",
|
447 |
+
"output_type": "stream",
|
448 |
+
"text": [
|
449 |
+
"\u001b[32m2025-04-25 10:16:56\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 38c50ab6-f24b-4cba-a2c5-070130ab420e\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
450 |
+
"\u001b[32m2025-04-25 10:16:56\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/25\u001b[0m | \u001b[33mRunning: 25\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
451 |
+
"\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 16/25\u001b[0m | \u001b[33mRunning: 9\u001b[0m | \u001b[36mAttempted: 16\u001b[0m (\u001b[32mCompleted: 16\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
452 |
+
"\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Tokyo\u001b[0m\n",
|
453 |
+
"\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: London\u001b[0m\n",
|
454 |
+
"\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
|
455 |
+
"\u001b[32m2025-04-25 10:17:02\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 23/25\u001b[0m | \u001b[33mRunning: 2\u001b[0m | \u001b[36mAttempted: 26\u001b[0m (\u001b[32mCompleted: 23\u001b[0m, \u001b[31mFailed: 3\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
456 |
+
"\u001b[32m2025-04-25 10:17:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
|
457 |
+
"\u001b[32m2025-04-25 10:17:05\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
|
458 |
+
"\u001b[32m2025-04-25 10:17:05\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 25/25\u001b[0m | \u001b[33mAttempted: 29\u001b[0m (Failed: 4, Filtered: 0, Duplicate: 0)\u001b[0m\n",
|
459 |
+
"\n",
|
460 |
+
"Successfully completed 25 out of 25 tasks\n",
|
461 |
+
"Data Factory automatically handled the failures and continued processing\n",
|
462 |
+
"The results only include successful tasks\n"
|
463 |
+
]
|
464 |
+
}
|
465 |
+
],
|
466 |
+
"source": [
|
467 |
+
"@data_factory(max_concurrency=100)\n",
|
468 |
+
"async def high_error_rate_mock_llm(city_name: str, num_records_per_city: int):\n",
|
469 |
+
" return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3) # Hardcode to 30% chance of failure\n",
|
470 |
+
"\n",
|
471 |
+
"# Process all cities - some will fail, but data_factory keeps going\n",
|
472 |
+
"cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 5 # 25 cities\n",
|
473 |
+
"high_error_rate_mock_lllm_data = high_error_rate_mock_llm.run(city_name=cities, num_records_per_city=1)\n",
|
474 |
+
"\n",
|
475 |
+
"print(f\"\\nSuccessfully completed {len(high_error_rate_mock_lllm_data)} out of {len(cities)} tasks\")\n",
|
476 |
+
"print(\"Data Factory automatically handled the failures and continued processing\")\n",
|
477 |
+
"print(\"The results only include successful tasks\")"
|
478 |
+
]
|
479 |
+
},
|
480 |
+
{
|
481 |
+
"cell_type": "markdown",
|
482 |
+
"metadata": {},
|
483 |
+
"source": [
|
484 |
+
"#### 5. Resume\n",
|
485 |
+
"\n",
|
486 |
+
"This is essential for long-running jobs with thousands of tasks.\n",
|
487 |
+
"\n",
|
488 |
+
"If a job is interrupted, you can pick up where you left off using one of two resume methods:\n",
|
489 |
+
"\n",
|
490 |
+
"\n",
|
491 |
+
"1. **Same Session Resume**: If you're still in the same session where the job was interrupted, simply call - Same instance with .resume()\n",
|
492 |
+
"\n",
|
493 |
+
"2. **Cross-Session Resume**: If you've closed your notebook or lost your session, you can resume using the job ID:\n",
|
494 |
+
" ```python\n",
|
495 |
+
" from starfish import DataFactory\n",
|
496 |
+
" # Resume using the master job ID from a previous run\n",
|
497 |
+
" data_factory = DataFactory.resume_from_checkpoint(job_id=\"your_job_id\")\n",
|
498 |
+
" ```\n",
|
499 |
+
"\n",
|
500 |
+
"The key difference:\n",
|
501 |
+
"- `resume()` uses the same DataFactory instance you defined\n",
|
502 |
+
"- `resume_from_checkpoint()` reconstructs your DataFactory from persistent storage where tasks and progress are saved\n",
|
503 |
+
"\n",
|
504 |
+
"> **Note**: Google Colab users may experience issues with `resume_from_checkpoint()` due to how Colab works\n",
|
505 |
+
"\n",
|
506 |
+
"We're simulating an interruption here. In a real scenario, this might happen if your notebook errors out, is manually interrupted with a keyboard command, encounters API rate limits, or experiences any other issues that halt execution."
|
507 |
+
]
|
508 |
+
},
|
509 |
+
{
|
510 |
+
"cell_type": "code",
|
511 |
+
"execution_count": 14,
|
512 |
+
"metadata": {},
|
513 |
+
"outputs": [
|
514 |
+
{
|
515 |
+
"name": "stdout",
|
516 |
+
"output_type": "stream",
|
517 |
+
"text": [
|
518 |
+
"\u001b[32m2025-04-25 10:17:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: b2a400b3-32e7-45ee-b8e8-c2bc7afe9f11\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
519 |
+
"\u001b[32m2025-04-25 10:17:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
520 |
+
"\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 17/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 17\u001b[0m (\u001b[32mCompleted: 17\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
521 |
+
"\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
|
522 |
+
"\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError occurred: KeyboardInterrupt\u001b[0m\n",
|
523 |
+
"\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\u001b[0m\n",
|
524 |
+
"\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 20/100\u001b[0m | \u001b[33mAttempted: 20\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
|
525 |
+
]
|
526 |
+
}
|
527 |
+
],
|
528 |
+
"source": [
|
529 |
+
"@data_factory(max_concurrency=10)\n",
|
530 |
+
"async def re_run_mock_llm(city_name: str, num_records_per_city: int):\n",
|
531 |
+
" return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n",
|
532 |
+
"\n",
|
533 |
+
"cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 20 # 100 cities\n",
|
534 |
+
"re_run_mock_llm_data_1 = re_run_mock_llm.run(city_name=cities, num_records_per_city=1)"
|
535 |
+
]
|
536 |
+
},
|
537 |
+
{
|
538 |
+
"cell_type": "code",
|
539 |
+
"execution_count": 15,
|
540 |
+
"metadata": {},
|
541 |
+
"outputs": [
|
542 |
+
{
|
543 |
+
"name": "stdout",
|
544 |
+
"output_type": "stream",
|
545 |
+
"text": [
|
546 |
+
"When a job is interrupted, you'll see a message like:\n",
|
547 |
+
"[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\n",
|
548 |
+
"\n",
|
549 |
+
"To resume an interrupted job, simply call:\n",
|
550 |
+
"interrupted_job_mock_llm.resume()\n",
|
551 |
+
"\n",
|
552 |
+
"For this example we have 20/100 data generated and not finished yet!\n"
|
553 |
+
]
|
554 |
+
}
|
555 |
+
],
|
556 |
+
"source": [
|
557 |
+
"print(\"When a job is interrupted, you'll see a message like:\")\n",
|
558 |
+
"print(\"[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\")\n",
|
559 |
+
"\n",
|
560 |
+
"print(\"\\nTo resume an interrupted job, simply call:\")\n",
|
561 |
+
"print(\"interrupted_job_mock_llm.resume()\")\n",
|
562 |
+
"print('')\n",
|
563 |
+
"print(f\"For this example we have {len(re_run_mock_llm_data_1)}/{len(cities)} data generated and not finished yet!\")"
|
564 |
+
]
|
565 |
+
},
|
566 |
+
{
|
567 |
+
"cell_type": "code",
|
568 |
+
"execution_count": 17,
|
569 |
+
"metadata": {},
|
570 |
+
"outputs": [
|
571 |
+
{
|
572 |
+
"name": "stdout",
|
573 |
+
"output_type": "stream",
|
574 |
+
"text": [
|
575 |
+
"\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB RESUME START]\u001b[0m \u001b[33mPICKING UP FROM WHERE THE JOB WAS LEFT OFF...\u001b[0m\n",
|
576 |
+
"\u001b[0m\n",
|
577 |
+
"\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[RESUME PROGRESS] STATUS AT THE TIME OF RESUME:\u001b[0m \u001b[32mCompleted: 20 / 100\u001b[0m | \u001b[31mFailed: 0\u001b[0m | \u001b[31mDuplicate: 0\u001b[0m | \u001b[33mFiltered: 0\u001b[0m\u001b[0m\n",
|
578 |
+
"\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 20/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 20\u001b[0m (\u001b[32mCompleted: 20\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
579 |
+
"\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 32/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 32\u001b[0m (\u001b[32mCompleted: 32\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
580 |
+
"\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n",
|
581 |
+
"\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n",
|
582 |
+
"\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n",
|
583 |
+
"\u001b[32m2025-04-25 10:18:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 56/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 59\u001b[0m (\u001b[32mCompleted: 56\u001b[0m, \u001b[31mFailed: 3\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
584 |
+
"\u001b[32m2025-04-25 10:18:08\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n",
|
585 |
+
"\u001b[32m2025-04-25 10:18:08\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: London\u001b[0m\n",
|
586 |
+
"\u001b[32m2025-04-25 10:18:09\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 69/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 74\u001b[0m (\u001b[32mCompleted: 69\u001b[0m, \u001b[31mFailed: 5\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
587 |
+
"\u001b[32m2025-04-25 10:18:09\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
|
588 |
+
"\u001b[32m2025-04-25 10:18:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 89/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 95\u001b[0m (\u001b[32mCompleted: 89\u001b[0m, \u001b[31mFailed: 6\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
589 |
+
"\u001b[32m2025-04-25 10:18:12\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n",
|
590 |
+
"\u001b[32m2025-04-25 10:18:13\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
|
591 |
+
"\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
|
592 |
+
"\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
|
593 |
+
"\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 100/100\u001b[0m | \u001b[33mAttempted: 109\u001b[0m (Failed: 9, Filtered: 0, Duplicate: 0)\u001b[0m\n"
|
594 |
+
]
|
595 |
+
}
|
596 |
+
],
|
597 |
+
"source": [
|
598 |
+
"## Lets keep continue the rest of run by resume_from_checkpoint \n",
|
599 |
+
"re_run_mock_llm_data_2 = re_run_mock_llm.resume()"
|
600 |
+
]
|
601 |
+
},
|
602 |
+
{
|
603 |
+
"cell_type": "code",
|
604 |
+
"execution_count": 18,
|
605 |
+
"metadata": {},
|
606 |
+
"outputs": [
|
607 |
+
{
|
608 |
+
"name": "stdout",
|
609 |
+
"output_type": "stream",
|
610 |
+
"text": [
|
611 |
+
"Now we still able to finished with what is left!! 100 data generated!\n"
|
612 |
+
]
|
613 |
+
}
|
614 |
+
],
|
615 |
+
"source": [
|
616 |
+
"print(f\"Now we still able to finished with what is left!! {len(re_run_mock_llm_data_2)} data generated!\")"
|
617 |
+
]
|
618 |
+
},
|
619 |
+
{
|
620 |
+
"cell_type": "markdown",
|
621 |
+
"metadata": {},
|
622 |
+
"source": [
|
623 |
+
"#### 6. Dry run\n",
|
624 |
+
"Before running a large job, you can do a \"dry run\" to test your pipeline. This only processes a single item and doesn't save state to the database."
|
625 |
+
]
|
626 |
+
},
|
627 |
+
{
|
628 |
+
"cell_type": "code",
|
629 |
+
"execution_count": 19,
|
630 |
+
"metadata": {},
|
631 |
+
"outputs": [
|
632 |
+
{
|
633 |
+
"name": "stdout",
|
634 |
+
"output_type": "stream",
|
635 |
+
"text": [
|
636 |
+
"\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: None\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
637 |
+
"\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
|
638 |
+
"\u001b[32m2025-04-25 10:18:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
|
639 |
+
"\u001b[32m2025-04-25 10:18:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/0\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
|
640 |
+
]
|
641 |
+
}
|
642 |
+
],
|
643 |
+
"source": [
|
644 |
+
"@data_factory(max_concurrency=10)\n",
|
645 |
+
"async def dry_run_mock_llm(city_name: str, num_records_per_city: int):\n",
|
646 |
+
" return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n",
|
647 |
+
"\n",
|
648 |
+
"dry_run_mock_llm_data = dry_run_mock_llm.dry_run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"]*20, num_records_per_city=1)"
|
649 |
+
]
|
650 |
+
},
|
651 |
+
{
|
652 |
+
"cell_type": "markdown",
|
653 |
+
"metadata": {},
|
654 |
+
"source": [
|
655 |
+
"#### 8. Advanced Usage\n",
|
656 |
+
"Data Factory offers more advanced capabilities for complete pipeline customization, including hooks that execute at key stages and shareable state to coordinate between tasks. These powerful features enable complex workflows and fine-grained control. Our dedicated examples for advanced data_factory usage will be coming soon!"
|
657 |
+
]
|
658 |
+
}
|
659 |
+
],
|
660 |
+
"metadata": {
|
661 |
+
"kernelspec": {
|
662 |
+
"display_name": ".venv",
|
663 |
+
"language": "python",
|
664 |
+
"name": "python3"
|
665 |
+
},
|
666 |
+
"language_info": {
|
667 |
+
"codemirror_mode": {
|
668 |
+
"name": "ipython",
|
669 |
+
"version": 3
|
670 |
+
},
|
671 |
+
"file_extension": ".py",
|
672 |
+
"mimetype": "text/x-python",
|
673 |
+
"name": "python",
|
674 |
+
"nbconvert_exporter": "python",
|
675 |
+
"pygments_lexer": "ipython3",
|
676 |
+
"version": "3.11.4"
|
677 |
+
}
|
678 |
+
},
|
679 |
+
"nbformat": 4,
|
680 |
+
"nbformat_minor": 2
|
681 |
+
}
|
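The notebook above needs the `nest_asyncio` workaround only because it runs inside Jupyter; its own comments recommend plain `.py` files for production. A minimal standalone-script sketch under that assumption (the function name `generate_city_data` is illustrative; the imports and the synchronous `.run()` call mirror the notebook cells above):

```python
# Standalone script sketch: no nest_asyncio needed outside Jupyter.
from starfish import data_factory
from starfish.data_factory.utils.mock import mock_llm_call  # mock helper used in the notebook


@data_factory(max_concurrency=10)
async def generate_city_data(city_name: str, num_records_per_city: int):
    # Swap mock_llm_call for a real StructuredLLM pipeline when ready.
    return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city)


if __name__ == "__main__":
    results = generate_city_data.run(
        city_name=["New York", "London", "Tokyo"],
        num_records_per_city=1,
    )
    print(f"Generated {len(results)} records")
```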
examples/data_factory_release_check.ipynb
ADDED
@@ -0,0 +1,494 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"# Google Colab Version: [Open this notebook in Google Colab](https://colab.research.google.com/github/starfishdata/starfish/blob/main/examples/data_factory.ipynb)"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "markdown",
|
12 |
+
"metadata": {},
|
13 |
+
"source": [
|
14 |
+
"#### Dependencies "
|
15 |
+
]
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"cell_type": "code",
|
19 |
+
"execution_count": 23,
|
20 |
+
"metadata": {},
|
21 |
+
"outputs": [
|
22 |
+
{
|
23 |
+
"name": "stdout",
|
24 |
+
"output_type": "stream",
|
25 |
+
"text": [
|
26 |
+
"Looking in indexes: https://test.pypi.org/simple/, https://pypi.org/simple\n",
|
27 |
+
"Requirement already satisfied: starfish-core in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (0.1.2)\n",
|
28 |
+
"Requirement already satisfied: aiofiles<25.0.0,>=24.1.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (24.1.0)\n",
|
29 |
+
"Requirement already satisfied: aiosqlite<0.22.0,>=0.21.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (0.21.0)\n",
|
30 |
+
"Requirement already satisfied: cachetools<6.0.0,>=5.5.2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (5.5.2)\n",
|
31 |
+
"Requirement already satisfied: cloudpickle<3.0.0,>=2.2.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (2.2.1)\n",
|
32 |
+
"Requirement already satisfied: cryptography>=44.0.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (44.0.3)\n",
|
33 |
+
"Requirement already satisfied: docstring_parser<0.17.0,>=0.16.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (0.16)\n",
|
34 |
+
"Requirement already satisfied: litellm<2.0.0,>=1.65.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (1.69.3)\n",
|
35 |
+
"Requirement already satisfied: loguru<0.8.0,>=0.7.3 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (0.7.3)\n",
|
36 |
+
"Requirement already satisfied: mcp<2.0.0,>=1.8.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (1.9.0)\n",
|
37 |
+
"Requirement already satisfied: nest_asyncio<2.0.0,>=1.6.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (1.6.0)\n",
|
38 |
+
"Requirement already satisfied: ollama<0.5.0,>=0.4.7 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (0.4.8)\n",
|
39 |
+
"Requirement already satisfied: posthog<4.0.0,>=3.11.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (3.25.0)\n",
|
40 |
+
"Requirement already satisfied: psutil<8.0.0,>=7.0.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (7.0.0)\n",
|
41 |
+
"Requirement already satisfied: python-dotenv<2.0.0,>=1.1.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (1.1.0)\n",
|
42 |
+
"Requirement already satisfied: typing-extensions<5.0.0,>=4.0.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (4.13.2)\n",
|
43 |
+
"Requirement already satisfied: cffi>=1.12 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from cryptography>=44.0.1->starfish-core) (1.17.1)\n",
|
44 |
+
"Requirement already satisfied: aiohttp in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.11.18)\n",
|
45 |
+
"Requirement already satisfied: click in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.2.0)\n",
|
46 |
+
"Requirement already satisfied: httpx>=0.23.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.28.1)\n",
|
47 |
+
"Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.7.0)\n",
|
48 |
+
"Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.1.6)\n",
|
49 |
+
"Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (4.23.0)\n",
|
50 |
+
"Requirement already satisfied: openai<1.76.0,>=1.68.2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (1.75.0)\n",
|
51 |
+
"Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (2.11.4)\n",
|
52 |
+
"Requirement already satisfied: tiktoken>=0.7.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n",
|
53 |
+
"Requirement already satisfied: tokenizers in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.21.1)\n",
|
54 |
+
"Requirement already satisfied: anyio>=4.5 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (4.9.0)\n",
|
55 |
+
"Requirement already satisfied: httpx-sse>=0.4 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (0.4.0)\n",
|
56 |
+
"Requirement already satisfied: pydantic-settings>=2.5.2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (2.9.1)\n",
|
57 |
+
"Requirement already satisfied: python-multipart>=0.0.9 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (0.0.20)\n",
|
58 |
+
"Requirement already satisfied: sse-starlette>=1.6.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (2.3.5)\n",
|
59 |
+
"Requirement already satisfied: starlette>=0.27 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (0.46.2)\n",
|
60 |
+
"Requirement already satisfied: uvicorn>=0.23.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (0.34.2)\n",
|
61 |
+
"Requirement already satisfied: requests<3.0,>=2.7 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from posthog<4.0.0,>=3.11.0->starfish-core) (2.32.3)\n",
|
62 |
+
"Requirement already satisfied: six>=1.5 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from posthog<4.0.0,>=3.11.0->starfish-core) (1.17.0)\n",
|
63 |
+
"Requirement already satisfied: monotonic>=1.5 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from posthog<4.0.0,>=3.11.0->starfish-core) (1.6)\n",
|
64 |
+
"Requirement already satisfied: backoff>=1.10.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from posthog<4.0.0,>=3.11.0->starfish-core) (2.2.1)\n",
|
65 |
+
"Requirement already satisfied: python-dateutil>2.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from posthog<4.0.0,>=3.11.0->starfish-core) (2.9.0.post0)\n",
|
66 |
+
"Requirement already satisfied: distro>=1.5.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from posthog<4.0.0,>=3.11.0->starfish-core) (1.9.0)\n",
|
67 |
+
"Requirement already satisfied: idna>=2.8 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from anyio>=4.5->mcp<2.0.0,>=1.8.1->starfish-core) (3.10)\n",
|
68 |
+
"Requirement already satisfied: sniffio>=1.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from anyio>=4.5->mcp<2.0.0,>=1.8.1->starfish-core) (1.3.1)\n",
|
69 |
+
"Requirement already satisfied: pycparser in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from cffi>=1.12->cryptography>=44.0.1->starfish-core) (2.22)\n",
|
70 |
+
"Requirement already satisfied: certifi in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (2025.4.26)\n",
|
71 |
+
"Requirement already satisfied: httpcore==1.* in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (1.0.9)\n",
|
72 |
+
"Requirement already satisfied: h11>=0.16 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.16.0)\n",
|
73 |
+
"Requirement already satisfied: zipp>=3.20 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from importlib-metadata>=6.8.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.21.0)\n",
|
74 |
+
"Requirement already satisfied: MarkupSafe>=2.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from jinja2<4.0.0,>=3.1.2->litellm<2.0.0,>=1.65.1->starfish-core) (3.0.2)\n",
|
75 |
+
"Requirement already satisfied: attrs>=22.2.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (25.3.0)\n",
|
76 |
+
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (2025.4.1)\n",
|
77 |
+
"Requirement already satisfied: referencing>=0.28.4 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.36.2)\n",
|
78 |
+
"Requirement already satisfied: rpds-py>=0.7.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.25.0)\n",
|
79 |
+
"Requirement already satisfied: jiter<1,>=0.4.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from openai<1.76.0,>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n",
|
80 |
+
"Requirement already satisfied: tqdm>4 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from openai<1.76.0,>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (4.67.1)\n",
|
81 |
+
"Requirement already satisfied: annotated-types>=0.6.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.7.0)\n",
|
82 |
+
"Requirement already satisfied: pydantic-core==2.33.2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.33.2)\n",
|
83 |
+
"Requirement already satisfied: typing-inspection>=0.4.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.4.0)\n",
|
84 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from requests<3.0,>=2.7->posthog<4.0.0,>=3.11.0->starfish-core) (3.4.2)\n",
|
85 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from requests<3.0,>=2.7->posthog<4.0.0,>=3.11.0->starfish-core) (2.4.0)\n",
|
86 |
+
"Requirement already satisfied: regex>=2022.1.18 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2024.11.6)\n",
|
87 |
+
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (2.6.1)\n",
|
88 |
+
"Requirement already satisfied: aiosignal>=1.1.2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.3.2)\n",
|
89 |
+
"Requirement already satisfied: frozenlist>=1.1.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.6.0)\n",
|
90 |
+
"Requirement already satisfied: multidict<7.0,>=4.5 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (6.4.3)\n",
|
91 |
+
"Requirement already satisfied: propcache>=0.2.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (0.3.1)\n",
|
92 |
+
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.20.0)\n",
|
93 |
+
"Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (0.31.2)\n",
|
94 |
+
"Requirement already satisfied: filelock in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (3.18.0)\n",
|
95 |
+
"Requirement already satisfied: fsspec>=2023.5.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (2025.3.2)\n",
|
96 |
+
"Requirement already satisfied: packaging>=20.9 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (25.0)\n",
|
97 |
+
"Requirement already satisfied: pyyaml>=5.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (6.0.2)\n",
|
98 |
+
"\n",
|
99 |
+
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n",
|
100 |
+
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
|
101 |
+
"Note: you may need to restart the kernel to use updated packages.\n"
|
102 |
+
]
|
103 |
+
}
|
104 |
+
],
|
105 |
+
"source": [
|
106 |
+
"#%pip install starfish-core\n",
|
107 |
+
"%pip install --index-url https://test.pypi.org/simple/ \\\n",
|
108 |
+
" --extra-index-url https://pypi.org/simple \\\n",
|
109 |
+
" starfish-core"
|
110 |
+
]
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"cell_type": "code",
|
114 |
+
"execution_count": 24,
|
115 |
+
"metadata": {},
|
116 |
+
"outputs": [
|
117 |
+
{
|
118 |
+
"name": "stdout",
|
119 |
+
"output_type": "stream",
|
120 |
+
"text": [
|
121 |
+
"\u001b[32m2025-05-23 22:50:10\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mFailed to load environment variables from /Users/john/Documents/projects/aa/python/starfish/starfish/.env\u001b[0m\n"
|
122 |
+
]
|
123 |
+
}
|
124 |
+
],
|
125 |
+
"source": [
|
126 |
+
"## Fix for Jupyter Notebook only — do NOT use in production\n",
|
127 |
+
"## Enables async code execution in notebooks, but may cause issues with sync/async issues\n",
|
128 |
+
"## For production, please run in standard .py files without this workaround\n",
|
129 |
+
"## See: https://github.com/erdewit/nest_asyncio for more details\n",
|
130 |
+
"import nest_asyncio\n",
|
131 |
+
"nest_asyncio.apply()\n",
|
132 |
+
"\n",
|
133 |
+
"from starfish import StructuredLLM, data_factory\n",
|
134 |
+
"from starfish.llm.utils import merge_structured_outputs\n",
|
135 |
+
"\n",
|
136 |
+
"from starfish.common.env_loader import load_env_file ## Load environment variables from .env file\n",
|
137 |
+
"load_env_file()"
|
138 |
+
]
|
139 |
+
},
|
140 |
+
{
|
141 |
+
"cell_type": "code",
|
142 |
+
"execution_count": 25,
|
143 |
+
"metadata": {},
|
144 |
+
"outputs": [],
|
145 |
+
"source": [
|
146 |
+
"## Helper function mock llm call\n",
|
147 |
+
"# When developing data pipelines with LLMs, making thousands of real API calls\n",
|
148 |
+
"# can be expensive. Using mock LLM calls lets you test your pipeline's reliability,\n",
|
149 |
+
"# failure handling, and recovery without spending money on API calls.\n",
|
150 |
+
"from starfish.data_factory.utils.mock import mock_llm_call"
|
151 |
+
]
|
152 |
+
},
|
153 |
+
{
|
154 |
+
"cell_type": "markdown",
|
155 |
+
"metadata": {},
|
156 |
+
"source": [
|
157 |
+
"#### 3. Working with Different Input Formats\n",
|
158 |
+
"\n",
|
159 |
+
"\n",
|
160 |
+
"Data Factory is flexible with how you provide inputs. Let's demonstrate different ways to pass parameters to data_factory functions.\n",
|
161 |
+
"\n",
|
162 |
+
"'data' is a reserved keyword expecting list(dict) or tuple(dict) - this design make it super easy to pass large data and support HuggingFace and Pandas dataframe very easily"
|
163 |
+
]
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"cell_type": "code",
|
167 |
+
"execution_count": 26,
|
168 |
+
"metadata": {},
|
169 |
+
"outputs": [
|
170 |
+
{
|
171 |
+
"data": {
|
172 |
+
"text/plain": [
|
173 |
+
"[{'answer': 'New York_3'}, {'answer': 'New York_1'}, {'answer': 'New York_5'}]"
|
174 |
+
]
|
175 |
+
},
|
176 |
+
"execution_count": 26,
|
177 |
+
"metadata": {},
|
178 |
+
"output_type": "execute_result"
|
179 |
+
}
|
180 |
+
],
|
181 |
+
"source": [
|
182 |
+
"## We will be using mock llm call for rest of example to save on token\n",
|
183 |
+
"## Mock LLM call is a function that simulates an LLM API call with random delays (controlled by sleep_time) and occasional failures (controlled by fail_rate)\n",
|
184 |
+
"await mock_llm_call(city_name=\"New York\", num_records_per_city=3)"
|
185 |
+
]
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"cell_type": "code",
|
189 |
+
"execution_count": 27,
|
190 |
+
"metadata": {},
|
191 |
+
"outputs": [],
|
192 |
+
"source": [
|
193 |
+
"@data_factory(max_concurrency=100)\n",
|
194 |
+
"async def input_format_mock_llm(city_name: str, num_records_per_city: int):\n",
|
195 |
+
" return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.01)"
|
196 |
+
]
|
197 |
+
},
|
198 |
+
{
|
199 |
+
"cell_type": "code",
|
200 |
+
"execution_count": 28,
|
201 |
+
"metadata": {},
|
202 |
+
"outputs": [
|
203 |
+
{
|
204 |
+
"name": "stdout",
|
205 |
+
"output_type": "stream",
|
206 |
+
"text": [
|
207 |
+
"\u001b[32m2025-05-23 22:50:10\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 4da82fc7-4112-4e05-b58c-53cf470747ad\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
208 |
+
"\u001b[32m2025-05-23 22:50:10\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
209 |
+
"\u001b[32m2025-05-23 22:50:11\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n"
|
210 |
+
]
|
211 |
+
}
|
212 |
+
],
|
213 |
+
"source": [
|
214 |
+
"# Format 1: Multiple lists that get zipped together\n",
|
215 |
+
"input_format_data1 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=[2, 1, 1, 1, 1])"
|
216 |
+
]
|
217 |
+
},
|
218 |
+
{
|
219 |
+
"cell_type": "code",
|
220 |
+
"execution_count": 29,
|
221 |
+
"metadata": {},
|
222 |
+
"outputs": [
|
223 |
+
{
|
224 |
+
"name": "stdout",
|
225 |
+
"output_type": "stream",
|
226 |
+
"text": [
|
227 |
+
"\u001b[32m2025-05-23 22:50:11\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 73973449-6069-485e-ac8c-b1b3a6b3f1a4\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
228 |
+
"\u001b[32m2025-05-23 22:50:11\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
229 |
+
"\u001b[32m2025-05-23 22:50:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n"
|
230 |
+
]
|
231 |
+
}
|
232 |
+
],
|
233 |
+
"source": [
|
234 |
+
"# Format 2: List + single value (single value gets broadcasted)\n",
|
235 |
+
"input_format_data2 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=1)"
|
236 |
+
]
|
237 |
+
},
|
238 |
+
{
|
239 |
+
"cell_type": "code",
|
240 |
+
"execution_count": 30,
|
241 |
+
"metadata": {},
|
242 |
+
"outputs": [
|
243 |
+
{
|
244 |
+
"name": "stdout",
|
245 |
+
"output_type": "stream",
|
246 |
+
"text": [
|
247 |
+
"\u001b[32m2025-05-23 22:50:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: aa9954f9-fc18-4b42-959e-fb2a897987c7\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
248 |
+
"\u001b[32m2025-05-23 22:50:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
249 |
+
"\u001b[32m2025-05-23 22:50:13\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n"
|
250 |
+
]
|
251 |
+
}
|
252 |
+
],
|
253 |
+
"source": [
|
254 |
+
"# Format 3: Special 'data' parameter\n",
|
255 |
+
"# 'data' is a reserved keyword expecting list(dict) or tuple(dict)\n",
|
256 |
+
"# Makes integration with various data sources easier\n",
|
257 |
+
"input_format_data3 = input_format_mock_llm.run(data=[{\"city_name\": \"New York\", \"num_records_per_city\": 2}, {\"city_name\": \"London\", \"num_records_per_city\": 1}, {\"city_name\": \"Tokyo\", \"num_records_per_city\": 1}, {\"city_name\": \"Paris\", \"num_records_per_city\": 1}, {\"city_name\": \"Sydney\", \"num_records_per_city\": 1}])"
|
258 |
+
]
|
259 |
+
},
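Because `data` just wants a list of dicts, hooking up a pandas DataFrame (or a HuggingFace dataset converted to pandas) is a one-liner. A minimal sketch, reusing the `input_format_mock_llm` factory defined above; the DataFrame itself is made up for illustration, and `to_dict(orient="records")` is plain pandas rather than a Starfish API:

```python
# Format 4 (illustrative): a pandas DataFrame fed through the reserved 'data' keyword.
# Column names must match the decorated function's parameters.
import pandas as pd

df = pd.DataFrame(
    {"city_name": ["New York", "London", "Tokyo"], "num_records_per_city": [2, 1, 1]}
)

records = df.to_dict(orient="records")  # -> list[dict], the shape 'data' expects
input_format_data4 = input_format_mock_llm.run(data=records)
```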
|
260 |
+
{
|
261 |
+
"cell_type": "markdown",
|
262 |
+
"metadata": {},
|
263 |
+
"source": [
|
264 |
+
"#### 4. Resilient error retry\n",
|
265 |
+
"Data Factory automatically handles errors and retries, making your pipelines robust.\n",
|
266 |
+
"\n",
|
267 |
+
"Let's demonstrate with a high failure rate example."
|
268 |
+
]
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"cell_type": "code",
|
272 |
+
"execution_count": 31,
|
273 |
+
"metadata": {},
|
274 |
+
"outputs": [
|
275 |
+
{
|
276 |
+
"name": "stdout",
|
277 |
+
"output_type": "stream",
|
278 |
+
"text": [
|
279 |
+
"\u001b[32m2025-05-23 22:50:13\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 730b766d-3c23-419a-a3dd-271d683818b1\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
280 |
+
"\u001b[32m2025-05-23 22:50:13\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/25\u001b[0m | \u001b[33mRunning: 25\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
281 |
+
"\u001b[32m2025-05-23 22:50:15\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Tokyo\u001b[0m\n",
|
282 |
+
"\u001b[32m2025-05-23 22:50:15\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
|
283 |
+
"\u001b[32m2025-05-23 22:50:16\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 23/25\u001b[0m | \u001b[33mRunning: 0\u001b[0m | \u001b[36mAttempted: 25\u001b[0m (\u001b[32mCompleted: 23\u001b[0m, \u001b[31mFailed: 2\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
284 |
+
"\u001b[32m2025-05-23 22:50:19\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 25/25\u001b[0m | \u001b[33mAttempted: 27\u001b[0m (Failed: 2, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
|
285 |
+
"\n",
|
286 |
+
"Successfully completed 25 out of 25 tasks\n",
|
287 |
+
"Data Factory automatically handled the failures and continued processing\n",
|
288 |
+
"The results only include successful tasks\n"
|
289 |
+
]
|
290 |
+
}
|
291 |
+
],
|
292 |
+
"source": [
|
293 |
+
"@data_factory(max_concurrency=100)\n",
|
294 |
+
"async def high_error_rate_mock_llm(city_name: str, num_records_per_city: int):\n",
|
295 |
+
" return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3) # Hardcode to 30% chance of failure\n",
|
296 |
+
"\n",
|
297 |
+
"# Process all cities - some will fail, but data_factory keeps going\n",
|
298 |
+
"cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 5 # 25 cities\n",
|
299 |
+
"high_error_rate_mock_lllm_data = high_error_rate_mock_llm.run(city_name=cities, num_records_per_city=1)\n",
|
300 |
+
"\n",
|
301 |
+
"print(f\"\\nSuccessfully completed {len(high_error_rate_mock_lllm_data)} out of {len(cities)} tasks\")\n",
|
302 |
+
"print(\"Data Factory automatically handled the failures and continued processing\")\n",
|
303 |
+
"print(\"The results only include successful tasks\")"
|
304 |
+
]
|
305 |
+
},
|
306 |
+
{
|
307 |
+
"cell_type": "markdown",
|
308 |
+
"metadata": {},
|
309 |
+
"source": [
|
310 |
+
"#### 5. Resume\n",
|
311 |
+
"\n",
|
312 |
+
"This is essential for long-running jobs with thousands of tasks.\n",
|
313 |
+
"\n",
|
314 |
+
"If a job is interrupted, you can pick up where you left off using one of two resume methods:\n",
|
315 |
+
"\n",
|
316 |
+
"\n",
|
317 |
+
"1. **Same Session Resume**: If you're still in the same session where the job was interrupted, simply call - Same instance with .resume()\n",
|
318 |
+
"\n",
|
319 |
+
"2. **Cross-Session Resume**: If you've closed your notebook or lost your session, you can resume using the job ID:\n",
|
320 |
+
" ```python\n",
|
321 |
+
" from starfish import DataFactory\n",
|
322 |
+
" # Resume using the master job ID from a previous run\n",
|
323 |
+
" data_factory = DataFactory.resume_from_checkpoint(job_id=\"your_job_id\")\n",
|
324 |
+
" ```\n",
|
325 |
+
"\n",
|
326 |
+
"The key difference:\n",
|
327 |
+
"- `resume()` uses the same DataFactory instance you defined\n",
|
328 |
+
"- `resume_from_checkpoint()` reconstructs your DataFactory from persistent storage where tasks and progress are saved\n",
|
329 |
+
"\n",
|
330 |
+
"> **Note**: Google Colab users may experience issues with `resume_from_checkpoint()` due to how Colab works\n",
|
331 |
+
"\n",
|
332 |
+
"We're simulating an interruption here. In a real scenario, this might happen if your notebook errors out, is manually interrupted with a keyboard command, encounters API rate limits, or experiences any other issues that halt execution."
|
333 |
+
]
|
334 |
+
},
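Putting the two resume paths side by side, as a minimal sketch (the decorated function `re_run_mock_llm` is defined in the next cell, and the job id below is the one printed in this notebook's logs, so yours will differ):

```python
# Same-session resume: the decorated instance still holds the job state.
remaining = re_run_mock_llm.resume()

# Cross-session resume: rebuild the job from persistent storage via its master job ID.
from starfish import DataFactory
remaining = DataFactory.resume_from_checkpoint(job_id="6829de29-0b83-4a64-835b-cc79cbad5e3a")
```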
|
335 |
+
{
|
336 |
+
"cell_type": "code",
|
337 |
+
"execution_count": 32,
|
338 |
+
"metadata": {},
|
339 |
+
"outputs": [
|
340 |
+
{
|
341 |
+
"name": "stdout",
|
342 |
+
"output_type": "stream",
|
343 |
+
"text": [
|
344 |
+
"\u001b[32m2025-05-23 22:50:19\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 6829de29-0b83-4a64-835b-cc79cbad5e3a\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
345 |
+
"\u001b[32m2025-05-23 22:50:19\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
346 |
+
"\u001b[32m2025-05-23 22:50:21\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n",
|
347 |
+
"\u001b[32m2025-05-23 22:50:21\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n",
|
348 |
+
"\u001b[32m2025-05-23 22:50:21\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
|
349 |
+
"\u001b[32m2025-05-23 22:50:21\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mconsecutive_not_completed: in 3 times, stopping this job; please adjust factory config and input data then resume_from_checkpoint(6829de29-0b83-4a64-835b-cc79cbad5e3a)\u001b[0m\n",
|
350 |
+
"\u001b[32m2025-05-23 22:50:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 17/100\u001b[0m | \u001b[33mAttempted: 20\u001b[0m (Failed: 3, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n"
|
351 |
+
]
|
352 |
+
}
|
353 |
+
],
|
354 |
+
"source": [
|
355 |
+
"@data_factory(max_concurrency=10)\n",
|
356 |
+
"async def re_run_mock_llm(city_name: str, num_records_per_city: int):\n",
|
357 |
+
" return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n",
|
358 |
+
"\n",
|
359 |
+
"cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 20 # 100 cities\n",
|
360 |
+
"re_run_mock_llm_data_1 = re_run_mock_llm.run(city_name=cities, num_records_per_city=1)"
|
361 |
+
]
|
362 |
+
},
|
363 |
+
{
|
364 |
+
"cell_type": "code",
|
365 |
+
"execution_count": 33,
|
366 |
+
"metadata": {},
|
367 |
+
"outputs": [
|
368 |
+
{
|
369 |
+
"name": "stdout",
|
370 |
+
"output_type": "stream",
|
371 |
+
"text": [
|
372 |
+
"When a job is interrupted, you'll see a message like:\n",
|
373 |
+
"[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\n",
|
374 |
+
"\n",
|
375 |
+
"To resume an interrupted job, simply call:\n",
|
376 |
+
"interrupted_job_mock_llm.resume()\n",
|
377 |
+
"\n",
|
378 |
+
"For this example we have 17/100 data generated and not finished yet!\n"
|
379 |
+
]
|
380 |
+
}
|
381 |
+
],
|
382 |
+
"source": [
|
383 |
+
"print(\"When a job is interrupted, you'll see a message like:\")\n",
|
384 |
+
"print(\"[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\")\n",
|
385 |
+
"\n",
|
386 |
+
"print(\"\\nTo resume an interrupted job, simply call:\")\n",
|
387 |
+
"print(\"interrupted_job_mock_llm.resume()\")\n",
|
388 |
+
"print('')\n",
|
389 |
+
"print(f\"For this example we have {len(re_run_mock_llm_data_1)}/{len(cities)} data generated and not finished yet!\")"
|
390 |
+
]
|
391 |
+
},
|
392 |
+
{
|
393 |
+
"cell_type": "code",
|
394 |
+
"execution_count": 34,
|
395 |
+
"metadata": {},
|
396 |
+
"outputs": [
|
397 |
+
{
|
398 |
+
"name": "stdout",
|
399 |
+
"output_type": "stream",
|
400 |
+
"text": [
|
401 |
+
"\u001b[32m2025-05-23 22:50:22\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB RESUME START]\u001b[0m \u001b[33mPICKING UP FROM WHERE THE JOB WAS LEFT OFF...\u001b[0m\n",
|
402 |
+
"\u001b[0m\n",
|
403 |
+
"\u001b[32m2025-05-23 22:50:22\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[RESUME PROGRESS] STATUS AT THE TIME OF RESUME:\u001b[0m \u001b[32mCompleted: 17 / 100\u001b[0m | \u001b[31mFailed: 3\u001b[0m | \u001b[31mDuplicate: 0\u001b[0m | \u001b[33mFiltered: 0\u001b[0m\u001b[0m\n",
|
404 |
+
"\u001b[32m2025-05-23 22:50:22\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 17/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 20\u001b[0m (\u001b[32mCompleted: 17\u001b[0m, \u001b[31mFailed: 3\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
405 |
+
"\u001b[32m2025-05-23 22:50:24\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n",
|
406 |
+
"\u001b[32m2025-05-23 22:50:24\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mconsecutive_not_completed: in 3 times, stopping this job; please adjust factory config and input data then resume_from_checkpoint(6829de29-0b83-4a64-835b-cc79cbad5e3a)\u001b[0m\n",
|
407 |
+
"\u001b[32m2025-05-23 22:50:24\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 30/100\u001b[0m | \u001b[33mAttempted: 34\u001b[0m (Failed: 4, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n"
|
408 |
+
]
|
409 |
+
}
|
410 |
+
],
|
411 |
+
"source": [
|
412 |
+
"## Lets keep continue the rest of run by resume_from_checkpoint \n",
|
413 |
+
"re_run_mock_llm_data_2 = re_run_mock_llm.resume()"
|
414 |
+
]
|
415 |
+
},
|
416 |
+
{
|
417 |
+
"cell_type": "code",
|
418 |
+
"execution_count": 35,
|
419 |
+
"metadata": {},
|
420 |
+
"outputs": [
|
421 |
+
{
|
422 |
+
"name": "stdout",
|
423 |
+
"output_type": "stream",
|
424 |
+
"text": [
|
425 |
+
"Now we still able to finished with what is left!! 30 data generated!\n"
|
426 |
+
]
|
427 |
+
}
|
428 |
+
],
|
429 |
+
"source": [
|
430 |
+
"print(f\"Now we still able to finished with what is left!! {len(re_run_mock_llm_data_2)} data generated!\")"
|
431 |
+
]
|
432 |
+
},
|
433 |
+
{
|
434 |
+
"cell_type": "markdown",
|
435 |
+
"metadata": {},
|
436 |
+
"source": [
|
437 |
+
"#### 6. Dry run\n",
|
438 |
+
"Before running a large job, you can do a \"dry run\" to test your pipeline. This only processes a single item and doesn't save state to the database."
|
439 |
+
]
|
440 |
+
},
|
441 |
+
{
|
442 |
+
"cell_type": "code",
|
443 |
+
"execution_count": 36,
|
444 |
+
"metadata": {},
|
445 |
+
"outputs": [
|
446 |
+
{
|
447 |
+
"name": "stdout",
|
448 |
+
"output_type": "stream",
|
449 |
+
"text": [
|
450 |
+
"\u001b[32m2025-05-23 22:50:24\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: None\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
451 |
+
"\u001b[32m2025-05-23 22:50:24\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
452 |
+
"\u001b[32m2025-05-23 22:50:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/0\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n"
|
453 |
+
]
|
454 |
+
}
|
455 |
+
],
|
456 |
+
"source": [
|
457 |
+
"@data_factory(max_concurrency=10)\n",
|
458 |
+
"async def dry_run_mock_llm(city_name: str, num_records_per_city: int):\n",
|
459 |
+
" return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n",
|
460 |
+
"\n",
|
461 |
+
"dry_run_mock_llm_data = dry_run_mock_llm.dry_run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"]*20, num_records_per_city=1)"
|
462 |
+
]
|
463 |
+
},
|
464 |
+
{
|
465 |
+
"cell_type": "markdown",
|
466 |
+
"metadata": {},
|
467 |
+
"source": [
|
468 |
+
"#### 8. Advanced Usage\n",
|
469 |
+
"Data Factory offers more advanced capabilities for complete pipeline customization, including hooks that execute at key stages and shareable state to coordinate between tasks. These powerful features enable complex workflows and fine-grained control. Our dedicated examples for advanced data_factory usage will be coming soon!"
|
470 |
+
]
|
471 |
+
}
|
472 |
+
],
|
473 |
+
"metadata": {
|
474 |
+
"kernelspec": {
|
475 |
+
"display_name": ".venv",
|
476 |
+
"language": "python",
|
477 |
+
"name": "python3"
|
478 |
+
},
|
479 |
+
"language_info": {
|
480 |
+
"codemirror_mode": {
|
481 |
+
"name": "ipython",
|
482 |
+
"version": 3
|
483 |
+
},
|
484 |
+
"file_extension": ".py",
|
485 |
+
"mimetype": "text/x-python",
|
486 |
+
"name": "python",
|
487 |
+
"nbconvert_exporter": "python",
|
488 |
+
"pygments_lexer": "ipython3",
|
489 |
+
"version": "3.11.4"
|
490 |
+
}
|
491 |
+
},
|
492 |
+
"nbformat": 4,
|
493 |
+
"nbformat_minor": 2
|
494 |
+
}
|
examples/embedding_usage_example.py
ADDED
@@ -0,0 +1,202 @@
1 |
+
"""Example: Using Starfish Embeddings for Data Generation
|
2 |
+
|
3 |
+
This example demonstrates how to use FAISS and SentenceTransformers
|
4 |
+
for embedding-enhanced data generation and deduplication.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import asyncio
|
8 |
+
import sys
|
9 |
+
import os
|
10 |
+
|
11 |
+
# Add the project root to the Python path
|
12 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
13 |
+
|
14 |
+
from starfish.embedding import EmbeddingManager, SimilarityChecker, DataDeduplicator
|
15 |
+
from starfish.data_gen_template.core import data_gen_template
|
16 |
+
|
17 |
+
|
18 |
+
async def basic_embedding_example():
|
19 |
+
"""Basic example of using the embedding system."""
|
20 |
+
print("🔮 Basic Embedding Example")
|
21 |
+
print("=" * 50)
|
22 |
+
|
23 |
+
# Initialize embedding manager
|
24 |
+
embedding_manager = EmbeddingManager(model_name="all-MiniLM-L6-v2", similarity_threshold=0.85)
|
25 |
+
|
26 |
+
# Sample texts to embed
|
27 |
+
texts = [
|
28 |
+
"What is machine learning?",
|
29 |
+
"How does artificial intelligence work?",
|
30 |
+
"What are neural networks?",
|
31 |
+
"Explain deep learning concepts",
|
32 |
+
"What is supervised learning?",
|
33 |
+
"What is machine learning?", # Duplicate
|
34 |
+
"How do neural networks function?", # Similar to "What are neural networks?"
|
35 |
+
]
|
36 |
+
|
37 |
+
print(f"📝 Processing {len(texts)} sample texts...")
|
38 |
+
|
39 |
+
# Add texts to the index
|
40 |
+
indices = embedding_manager.add_texts(texts)
|
41 |
+
print(f"✅ Added {len(indices)} texts to the embedding index")
|
42 |
+
|
43 |
+
# Search for similar texts
|
44 |
+
query = "Tell me about AI and ML"
|
45 |
+
similar_items = embedding_manager.search_similar(query, k=3)
|
46 |
+
|
47 |
+
print(f"\n🔍 Search results for: '{query}'")
|
48 |
+
for item in similar_items:
|
49 |
+
print(f" Similarity: {item['similarity']:.3f} | Text: {item['text']}")
|
50 |
+
|
51 |
+
# Find duplicates
|
52 |
+
duplicate_groups = embedding_manager.find_duplicates(texts)
|
53 |
+
print(f"\n🔄 Found {len(duplicate_groups)} groups of duplicates:")
|
54 |
+
for i, group in enumerate(duplicate_groups):
|
55 |
+
print(f" Group {i+1}: {[texts[idx] for idx in group]}")
|
56 |
+
|
57 |
+
print(f"\n📊 Index Stats: {embedding_manager.get_stats()}")
|
58 |
+
|
59 |
+
|
60 |
+
async def similarity_checker_example():
|
61 |
+
"""Example of using the similarity checker."""
|
62 |
+
print("\n🎯 Similarity Checker Example")
|
63 |
+
print("=" * 50)
|
64 |
+
|
65 |
+
similarity_checker = SimilarityChecker(similarity_threshold=0.8)
|
66 |
+
|
67 |
+
# Sample data items
|
68 |
+
data_items = [
|
69 |
+
{"question": "What is Python?", "answer": "Python is a programming language"},
|
70 |
+
{"question": "How to learn coding?", "answer": "Start with basic concepts"},
|
71 |
+
{"question": "What is programming?", "answer": "Programming is writing code"},
|
72 |
+
{"question": "What is Python programming?", "answer": "Python is a popular language"}, # Similar to first
|
73 |
+
]
|
74 |
+
|
75 |
+
print(f"📝 Analyzing {len(data_items)} data items...")
|
76 |
+
|
77 |
+
# Filter similar items
|
78 |
+
filtered_items, duplicate_groups = similarity_checker.filter_similar_items(data_items)
|
79 |
+
print(f"✅ Filtered to {len(filtered_items)} unique items")
|
80 |
+
|
81 |
+
# Check diversity metrics
|
82 |
+
diversity_metrics = similarity_checker.check_diversity_batch(data_items)
|
83 |
+
print(f"📈 Diversity Score: {diversity_metrics['diversity_score']:.3f}")
|
84 |
+
print(f"🔄 Average Similarity: {diversity_metrics['avg_similarity']:.3f}")
|
85 |
+
|
86 |
+
# Suggest diverse subset
|
87 |
+
diverse_subset = similarity_checker.suggest_diverse_subset(data_items, target_size=2)
|
88 |
+
print(f"\n🎲 Diverse subset (2 items):")
|
89 |
+
for item in diverse_subset:
|
90 |
+
print(f" Q: {item['question']}")
|
91 |
+
|
92 |
+
|
93 |
+
async def deduplicator_example():
|
94 |
+
"""Example of using the data deduplicator."""
|
95 |
+
print("\n🔧 Data Deduplicator Example")
|
96 |
+
print("=" * 50)
|
97 |
+
|
98 |
+
deduplicator = DataDeduplicator(similarity_threshold=0.9)
|
99 |
+
|
100 |
+
# Sample dataset with duplicates
|
101 |
+
dataset = [
|
102 |
+
{"id": "1", "text": "Machine learning is a subset of AI", "quality_score": 0.8},
|
103 |
+
{"id": "2", "text": "Deep learning uses neural networks", "quality_score": 0.9},
|
104 |
+
{"id": "1", "text": "Machine learning is a subset of AI", "quality_score": 0.7}, # Exact duplicate
|
105 |
+
{"id": "3", "text": "ML is part of artificial intelligence", "quality_score": 0.95}, # Semantic duplicate
|
106 |
+
{"id": "4", "text": "Natural language processing handles text", "quality_score": 0.85},
|
107 |
+
]
|
108 |
+
|
109 |
+
print(f"📝 Analyzing dataset with {len(dataset)} items...")
|
110 |
+
|
111 |
+
# Analyze duplicates without removing
|
112 |
+
analysis = deduplicator.analyze_duplicates(dataset)
|
113 |
+
print(f"🔍 Analysis Results:")
|
114 |
+
print(f" Exact duplicates: {analysis['exact_duplicates']['count']}")
|
115 |
+
print(f" Semantic duplicates: {analysis['semantic_duplicates']['count']}")
|
116 |
+
print(f" Diversity score: {analysis['diversity_metrics']['diversity_score']:.3f}")
|
117 |
+
|
118 |
+
# Perform comprehensive deduplication
|
119 |
+
clean_dataset, report = deduplicator.deduplicate_comprehensive(dataset)
|
120 |
+
print(f"\n✨ Deduplication Results:")
|
121 |
+
print(f" Original: {report['original_count']} items")
|
122 |
+
print(f" Final: {report['final_count']} items")
|
123 |
+
print(f" Reduction: {report['reduction_percentage']:.1f}%")
|
124 |
+
|
125 |
+
print("\n📋 Clean dataset:")
|
126 |
+
for item in clean_dataset:
|
127 |
+
print(f" ID: {item['id']} | Score: {item.get('quality_score', 'N/A')} | Text: {item['text'][:50]}...")
|
128 |
+
|
129 |
+
|
130 |
+
async def template_usage_example():
|
131 |
+
"""Example of using the embedding-enhanced template."""
|
132 |
+
print("\n🚀 Embedding-Enhanced Template Example")
|
133 |
+
print("=" * 50)
|
134 |
+
|
135 |
+
try:
|
136 |
+
# Get the embedding template
|
137 |
+
print(data_gen_template.list())
|
138 |
+
template = data_gen_template.get("starfish/generate_with_embeddings")
|
139 |
+
|
140 |
+
# Configuration for generation
|
141 |
+
config = {
|
142 |
+
"num_records": 5, # Small number for demo
|
143 |
+
"user_instruction": "Generate educational Q&A about data science",
|
144 |
+
"topics": ["statistics", "data visualization", "machine learning"],
|
145 |
+
"generation_model_name": "openai/gpt-4o-mini",
|
146 |
+
"embedding_config": {
|
147 |
+
"model_name": "all-MiniLM-L6-v2",
|
148 |
+
"similarity_threshold": 0.8,
|
149 |
+
"enable_deduplication": True,
|
150 |
+
"enable_diversity_check": True,
|
151 |
+
"min_diversity_score": 0.2,
|
152 |
+
},
|
153 |
+
}
|
154 |
+
|
155 |
+
print("⚙️ Generating diverse dataset with embedding quality control...")
|
156 |
+
results = await template.run(**config)
|
157 |
+
|
158 |
+
print(f"\n✅ Generated {len(results)} high-quality items:")
|
159 |
+
for i, item in enumerate(results[:3]): # Show first 3
|
160 |
+
print(f"\n Item {i+1}:")
|
161 |
+
print(f" Q: {item.get('question', 'N/A')}")
|
162 |
+
print(f" A: {item.get('answer', 'N/A')[:100]}...")
|
163 |
+
if "_metadata" in item:
|
164 |
+
print(f" Diversity: {item['_metadata'].get('diversity_score', 'N/A'):.3f}")
|
165 |
+
|
166 |
+
except Exception as e:
|
167 |
+
print(f"⚠️ Template example failed: {e}")
|
168 |
+
print(" (This might be due to missing API keys or dependencies)")
|
169 |
+
|
170 |
+
|
171 |
+
async def main():
|
172 |
+
"""Run all examples."""
|
173 |
+
print("🎉 Starfish Embedding System Examples")
|
174 |
+
print("=" * 60)
|
175 |
+
|
176 |
+
try:
|
177 |
+
await basic_embedding_example()
|
178 |
+
await similarity_checker_example()
|
179 |
+
await deduplicator_example()
|
180 |
+
await template_usage_example()
|
181 |
+
|
182 |
+
print("\n" + "=" * 60)
|
183 |
+
print("✅ All examples completed successfully!")
|
184 |
+
print("\n💡 Next steps:")
|
185 |
+
print(" 1. Install dependencies: poetry install")
|
186 |
+
print(" 2. Set API keys in .env.local")
|
187 |
+
print(" 3. Try the embedding template in your projects")
|
188 |
+
|
189 |
+
except ImportError as e:
|
190 |
+
print(f"❌ Import error: {e}")
|
191 |
+
print("💡 Make sure to install dependencies:")
|
192 |
+
print(" poetry install")
|
193 |
+
print(" # or")
|
194 |
+
print(" pip install faiss-cpu sentence-transformers")
|
195 |
+
|
196 |
+
except Exception as e:
|
197 |
+
print(f"❌ Error running examples: {e}")
|
198 |
+
print("💡 Check your Python environment and dependencies")
|
199 |
+
|
200 |
+
|
201 |
+
if __name__ == "__main__":
|
202 |
+
asyncio.run(main())
|
examples/structured_llm.ipynb
ADDED
@@ -0,0 +1,470 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"# Google Colab Version: [Open this notebook in Google Colab](https://colab.research.google.com/github/starfishdata/starfish/blob/main/examples/structured_llm.ipynb)"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "markdown",
|
12 |
+
"metadata": {},
|
13 |
+
"source": [
|
14 |
+
"#### Dependencies "
|
15 |
+
]
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"cell_type": "code",
|
19 |
+
"execution_count": null,
|
20 |
+
"metadata": {},
|
21 |
+
"outputs": [],
|
22 |
+
"source": [
|
23 |
+
"%pip install starfish-core"
|
24 |
+
]
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"cell_type": "code",
|
28 |
+
"execution_count": 1,
|
29 |
+
"metadata": {},
|
30 |
+
"outputs": [],
|
31 |
+
"source": [
|
32 |
+
"## Fix for Jupyter Notebook only — do NOT use in production\n",
|
33 |
+
"## Enables async code execution in notebooks, but may cause issues with sync/async issues\n",
|
34 |
+
"## For production, please run in standard .py files without this workaround\n",
|
35 |
+
"## See: https://github.com/erdewit/nest_asyncio for more details\n",
|
36 |
+
"import nest_asyncio\n",
|
37 |
+
"nest_asyncio.apply()\n",
|
38 |
+
"\n",
|
39 |
+
"from starfish import StructuredLLM\n",
|
40 |
+
"from starfish.llm.utils import merge_structured_outputs\n",
|
41 |
+
"\n",
|
42 |
+
"from pydantic import BaseModel, Field\n",
|
43 |
+
"from typing import List\n",
|
44 |
+
"\n",
|
45 |
+
"from starfish.common.env_loader import load_env_file ## Load environment variables from .env file\n",
|
46 |
+
"load_env_file()"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"cell_type": "code",
|
51 |
+
"execution_count": 2,
|
52 |
+
"metadata": {},
|
53 |
+
"outputs": [],
|
54 |
+
"source": [
|
55 |
+
"# setup your openai api key if not already set\n",
|
56 |
+
"# import os\n",
|
57 |
+
"# os.environ[\"OPENAI_API_KEY\"] = \"your_key_here\"\n",
|
58 |
+
"\n",
|
59 |
+
"# If you dont have any API key, use local model (ollama)"
|
60 |
+
]
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"cell_type": "markdown",
|
64 |
+
"metadata": {},
|
65 |
+
"source": [
|
66 |
+
"#### 1. Structured LLM with JSON Schema"
|
67 |
+
]
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"cell_type": "code",
|
71 |
+
"execution_count": 3,
|
72 |
+
"metadata": {},
|
73 |
+
"outputs": [
|
74 |
+
{
|
75 |
+
"data": {
|
76 |
+
"text/plain": [
|
77 |
+
"[{'question': 'Why did the tomato turn red in New York?',\n",
|
78 |
+
" 'answer': \"Because it saw the Big Apple and couldn't ketchup with all the excitement!\"}]"
|
79 |
+
]
|
80 |
+
},
|
81 |
+
"execution_count": 3,
|
82 |
+
"metadata": {},
|
83 |
+
"output_type": "execute_result"
|
84 |
+
}
|
85 |
+
],
|
86 |
+
"source": [
|
87 |
+
"# ### Define the Output Structure (JSON Schema)\n",
|
88 |
+
"# Let's start with a simple JSON-like schema using a list of dictionaries.\n",
|
89 |
+
"# Each dictionary specifies a field name and its type. description is optional\n",
|
90 |
+
"json_output_schema = [\n",
|
91 |
+
" {\"name\": \"question\", \"type\": \"str\", \"description\": \"The generated question.\"},\n",
|
92 |
+
" {\"name\": \"answer\", \"type\": \"str\", \"description\": \"The corresponding answer.\"},\n",
|
93 |
+
"]\n",
|
94 |
+
"\n",
|
95 |
+
"json_llm = StructuredLLM(\n",
|
96 |
+
" model_name = \"openai/gpt-4o-mini\",\n",
|
97 |
+
" prompt = \"Funny facts about city {{city_name}}.\",\n",
|
98 |
+
" output_schema = json_output_schema,\n",
|
99 |
+
" model_kwargs = {\"temperature\": 0.7},\n",
|
100 |
+
")\n",
|
101 |
+
"\n",
|
102 |
+
"json_response = await json_llm.run(city_name=\"New York\")\n",
|
103 |
+
"\n",
|
104 |
+
"# The response object contains both parsed data and the raw API response.\n",
|
105 |
+
"json_response.data"
|
106 |
+
]
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"cell_type": "code",
|
110 |
+
"execution_count": 4,
|
111 |
+
"metadata": {},
|
112 |
+
"outputs": [
|
113 |
+
{
|
114 |
+
"data": {
|
115 |
+
"text/plain": [
|
116 |
+
"ModelResponse(id='chatcmpl-BQGw3FMSjzWOPMRvXmgknN4oozrKK', created=1745601327, model='gpt-4o-mini-2024-07-18', object='chat.completion', system_fingerprint='fp_0392822090', choices=[Choices(finish_reason='stop', index=0, message=Message(content='[\\n {\\n \"question\": \"Why did the tomato turn red in New York?\",\\n \"answer\": \"Because it saw the Big Apple and couldn\\'t ketchup with all the excitement!\"\\n }\\n]', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]))], usage=Usage(completion_tokens=41, prompt_tokens=77, total_tokens=118, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None)), service_tier='default')"
|
117 |
+
]
|
118 |
+
},
|
119 |
+
"execution_count": 4,
|
120 |
+
"metadata": {},
|
121 |
+
"output_type": "execute_result"
|
122 |
+
}
|
123 |
+
],
|
124 |
+
"source": [
|
125 |
+
"# Fully preserved raw response from API - allow you to parse the response as you want\n",
|
126 |
+
"# Like function call, tool call, thinking token etc\n",
|
127 |
+
"json_response.raw"
|
128 |
+
]
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"cell_type": "markdown",
|
132 |
+
"metadata": {},
|
133 |
+
"source": [
|
134 |
+
"#### 2. Structured LLM with Pydantic Schema (Nested)"
|
135 |
+
]
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"cell_type": "code",
|
139 |
+
"execution_count": 5,
|
140 |
+
"metadata": {},
|
141 |
+
"outputs": [
|
142 |
+
{
|
143 |
+
"data": {
|
144 |
+
"text/plain": [
|
145 |
+
"[{'facts': [{'question': 'What year did New York City become the capital of the United States?',\n",
|
146 |
+
" 'answer': 'New York City served as the capital of the United States from 1785 to 1790.',\n",
|
147 |
+
" 'category': 'History'}]}]"
|
148 |
+
]
|
149 |
+
},
|
150 |
+
"execution_count": 5,
|
151 |
+
"metadata": {},
|
152 |
+
"output_type": "execute_result"
|
153 |
+
}
|
154 |
+
],
|
155 |
+
"source": [
|
156 |
+
"# ### Define the Output Structure (Pydantic Model)\n",
|
157 |
+
"class Fact(BaseModel):\n",
|
158 |
+
" question: str = Field(..., description=\"The factual question generated.\")\n",
|
159 |
+
" answer: str = Field(..., description=\"The corresponding answer.\")\n",
|
160 |
+
" category: str = Field(..., description=\"A category for the fact (e.g., History, Geography).\")\n",
|
161 |
+
"\n",
|
162 |
+
"# You can define a list of these models if you expect multiple results.\n",
|
163 |
+
"class FactsList(BaseModel):\n",
|
164 |
+
" facts: List[Fact] = Field(..., description=\"A list of facts.\")\n",
|
165 |
+
"\n",
|
166 |
+
"\n",
|
167 |
+
"# ### Create the StructuredLLM Instance with Pydantic\n",
|
168 |
+
"pydantic_llm = StructuredLLM(\n",
|
169 |
+
" model_name=\"openai/gpt-4o-mini\",\n",
|
170 |
+
" # Ask for multiple facts this time\n",
|
171 |
+
" prompt=\"Generate distinct facts about {{city}}.\",\n",
|
172 |
+
" # Pass the Pydantic model directly as the schema\n",
|
173 |
+
" output_schema=FactsList, # Expecting a list of facts wrapped in the FactsList model\n",
|
174 |
+
" model_kwargs={\"temperature\": 0.8}\n",
|
175 |
+
")\n",
|
176 |
+
"\n",
|
177 |
+
"pydantic_llm_response = await pydantic_llm.run(city=\"New York\")\n",
|
178 |
+
"\n",
|
179 |
+
"pydantic_llm_response.data"
|
180 |
+
]
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"cell_type": "markdown",
|
184 |
+
"metadata": {},
|
185 |
+
"source": [
|
186 |
+
"#### 3. Working with Different LLM Providers\n",
|
187 |
+
"\n",
|
188 |
+
"Starfish uses LiteLLM under the hood, giving you access to 100+ LLM providers. Here is an example of using a custom model provider - Hyperbolic - Super cool provider with full precision model and low cost!"
|
189 |
+
]
|
190 |
+
},
|
191 |
+
{
|
192 |
+
"cell_type": "code",
|
193 |
+
"execution_count": 6,
|
194 |
+
"metadata": {},
|
195 |
+
"outputs": [
|
196 |
+
{
|
197 |
+
"data": {
|
198 |
+
"text/plain": [
|
199 |
+
"[{'question': 'What is the nickname of New York City?',\n",
|
200 |
+
" 'answer': 'The Big Apple'},\n",
|
201 |
+
" {'question': 'Which iconic statue is located in New York Harbor?',\n",
|
202 |
+
" 'answer': 'The Statue of Liberty'},\n",
|
203 |
+
" {'question': 'What is the name of the famous theater district in Manhattan?',\n",
|
204 |
+
" 'answer': 'Broadway'},\n",
|
205 |
+
" {'question': \"Which park is considered the 'lungs' of New York City?\",\n",
|
206 |
+
" 'answer': 'Central Park'},\n",
|
207 |
+
" {'question': 'What is the tallest building in New York City as of 2023?',\n",
|
208 |
+
" 'answer': 'One World Trade Center'}]"
|
209 |
+
]
|
210 |
+
},
|
211 |
+
"execution_count": 6,
|
212 |
+
"metadata": {},
|
213 |
+
"output_type": "execute_result"
|
214 |
+
}
|
215 |
+
],
|
216 |
+
"source": [
|
217 |
+
"\n",
|
218 |
+
"# Set up the relevant API Key and Base URL in your enviornment variables\n",
|
219 |
+
"# os.environ[\"HYPERBOLIC_API_KEY\"] = \"your_key_here\"\n",
|
220 |
+
"# os.environ[\"HYPERBOLIC_API_BASE\"] = \"https://api.hyperbolic.xyz/v1\"\n",
|
221 |
+
"\n",
|
222 |
+
"hyperbolic_llm = StructuredLLM(\n",
|
223 |
+
" model_name=\"hyperbolic/deepseek-ai/DeepSeek-V3-0324\", \n",
|
224 |
+
" prompt=\"Facts about city {{city_name}}.\",\n",
|
225 |
+
" output_schema=[{\"name\": \"question\", \"type\": \"str\"}, {\"name\": \"answer\", \"type\": \"str\"}],\n",
|
226 |
+
" model_kwargs={\"temperature\": 0.7},\n",
|
227 |
+
")\n",
|
228 |
+
"\n",
|
229 |
+
"hyperbolic_llm_response = await hyperbolic_llm.run(city_name=\"New York\", num_records=5)\n",
|
230 |
+
"hyperbolic_llm_response.data"
|
231 |
+
]
|
232 |
+
},
|
233 |
+
{
|
234 |
+
"cell_type": "markdown",
|
235 |
+
"metadata": {},
|
236 |
+
"source": [
|
237 |
+
"#### 3. Local LLM using Ollama\n",
|
238 |
+
"Ensure Ollama is installed and running. Starfish can manage the server process and model downloads"
|
239 |
+
]
|
240 |
+
},
|
241 |
+
{
|
242 |
+
"cell_type": "code",
|
243 |
+
"execution_count": 7,
|
244 |
+
"metadata": {},
|
245 |
+
"outputs": [
|
246 |
+
{
|
247 |
+
"name": "stdout",
|
248 |
+
"output_type": "stream",
|
249 |
+
"text": [
|
250 |
+
"\u001b[32m2025-04-25 10:15:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mEnsuring Ollama model gemma3:1b is ready...\u001b[0m\n",
|
251 |
+
"\u001b[32m2025-04-25 10:15:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mStarting Ollama server...\u001b[0m\n",
|
252 |
+
"\u001b[32m2025-04-25 10:15:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mOllama server started successfully\u001b[0m\n",
|
253 |
+
"\u001b[32m2025-04-25 10:15:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mFound model gemma3:1b\u001b[0m\n",
|
254 |
+
"\u001b[32m2025-04-25 10:15:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mModel gemma3:1b is already available\u001b[0m\n",
|
255 |
+
"\u001b[32m2025-04-25 10:15:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mModel gemma3:1b is ready, making API call...\u001b[0m\n"
|
256 |
+
]
|
257 |
+
},
|
258 |
+
{
|
259 |
+
"data": {
|
260 |
+
"text/plain": [
|
261 |
+
"[{'question': 'What is the population of New York City?',\n",
|
262 |
+
" 'answer': 'As of 2023, the population of New York City is approximately 8.8 million people.'}]"
|
263 |
+
]
|
264 |
+
},
|
265 |
+
"execution_count": 7,
|
266 |
+
"metadata": {},
|
267 |
+
"output_type": "execute_result"
|
268 |
+
}
|
269 |
+
],
|
270 |
+
"source": [
|
271 |
+
"### Local model\n",
|
272 |
+
"ollama_llm = StructuredLLM(\n",
|
273 |
+
" # Prefix 'ollama/' specifies the Ollama provider\n",
|
274 |
+
" model_name=\"ollama/gemma3:1b\",\n",
|
275 |
+
" prompt=\"Facts about city {{city_name}}.\",\n",
|
276 |
+
" output_schema=[{\"name\": \"question\", \"type\": \"str\"}, {\"name\": \"answer\", \"type\": \"str\"}],\n",
|
277 |
+
" model_kwargs={\"temperature\": 0.7},\n",
|
278 |
+
")\n",
|
279 |
+
"\n",
|
280 |
+
"ollama_llm_response = await ollama_llm.run(city_name=\"New York\", num_records=5)\n",
|
281 |
+
"ollama_llm_response.data"
|
282 |
+
]
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"cell_type": "code",
|
286 |
+
"execution_count": 8,
|
287 |
+
"metadata": {},
|
288 |
+
"outputs": [
|
289 |
+
{
|
290 |
+
"name": "stdout",
|
291 |
+
"output_type": "stream",
|
292 |
+
"text": [
|
293 |
+
"\u001b[32m2025-04-25 10:15:54\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mStopping Ollama server...\u001b[0m\n",
|
294 |
+
"\u001b[32m2025-04-25 10:15:55\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mOllama server stopped successfully\u001b[0m\n"
|
295 |
+
]
|
296 |
+
},
|
297 |
+
{
|
298 |
+
"data": {
|
299 |
+
"text/plain": [
|
300 |
+
"True"
|
301 |
+
]
|
302 |
+
},
|
303 |
+
"execution_count": 8,
|
304 |
+
"metadata": {},
|
305 |
+
"output_type": "execute_result"
|
306 |
+
}
|
307 |
+
],
|
308 |
+
"source": [
|
309 |
+
"### Resource clean up to close ollama server\n",
|
310 |
+
"from starfish.llm.backend.ollama_adapter import stop_ollama_server\n",
|
311 |
+
"await stop_ollama_server()"
|
312 |
+
]
|
313 |
+
},
|
314 |
+
{
|
315 |
+
"cell_type": "markdown",
|
316 |
+
"metadata": {},
|
317 |
+
"source": [
|
318 |
+
"#### 4. Chaining Multiple StructuredLLM Calls\n",
|
319 |
+
"\n",
|
320 |
+
"You can easily pipe the output of one LLM call into the prompt of another. This is useful for multi-step reasoning, analysis, or refinement.\n"
|
321 |
+
]
|
322 |
+
},
|
323 |
+
{
|
324 |
+
"cell_type": "code",
|
325 |
+
"execution_count": 9,
|
326 |
+
"metadata": {},
|
327 |
+
"outputs": [
|
328 |
+
{
|
329 |
+
"name": "stdout",
|
330 |
+
"output_type": "stream",
|
331 |
+
"text": [
|
332 |
+
"Generated Facts: [{'question': 'What is the chemical formula for water?', 'answer': 'The chemical formula for water is H2O.'}, {'question': 'What is the process by which plants convert sunlight into energy?', 'answer': 'The process is called photosynthesis.'}, {'question': \"What is the primary gas found in the Earth's atmosphere?\", 'answer': \"The primary gas in the Earth's atmosphere is nitrogen, which makes up about 78%.\"}, {'question': \"What is Newton's second law of motion?\", 'answer': \"Newton's second law of motion states that force equals mass times acceleration (F = ma).\"}, {'question': 'What is the smallest unit of life?', 'answer': 'The smallest unit of life is the cell.'}]\n",
|
333 |
+
"Ratings: [{'accuracy_rating': 10, 'clarity_rating': 10}, {'accuracy_rating': 10, 'clarity_rating': 10}, {'accuracy_rating': 10, 'clarity_rating': 10}, {'accuracy_rating': 10, 'clarity_rating': 10}, {'accuracy_rating': 10, 'clarity_rating': 10}]\n",
|
334 |
+
"[{'question': 'What is the chemical formula for water?', 'answer': 'The chemical formula for water is H2O.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the process by which plants convert sunlight into energy?', 'answer': 'The process is called photosynthesis.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': \"What is the primary gas found in the Earth's atmosphere?\", 'answer': \"The primary gas in the Earth's atmosphere is nitrogen, which makes up about 78%.\", 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': \"What is Newton's second law of motion?\", 'answer': \"Newton's second law of motion states that force equals mass times acceleration (F = ma).\", 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the smallest unit of life?', 'answer': 'The smallest unit of life is the cell.', 'accuracy_rating': 10, 'clarity_rating': 10}]\n"
|
335 |
+
]
|
336 |
+
}
|
337 |
+
],
|
338 |
+
"source": [
|
339 |
+
"# ### Step 1: Generate Initial Facts\n",
|
340 |
+
"generator_llm = StructuredLLM(\n",
|
341 |
+
" model_name=\"openai/gpt-4o-mini\",\n",
|
342 |
+
" prompt=\"Generate question/answer pairs about {{topic}}.\",\n",
|
343 |
+
" output_schema=[\n",
|
344 |
+
" {\"name\": \"question\", \"type\": \"str\"},\n",
|
345 |
+
" {\"name\": \"answer\", \"type\": \"str\"}\n",
|
346 |
+
" ],\n",
|
347 |
+
")\n",
|
348 |
+
"\n",
|
349 |
+
"# ### Step 2: Rate the Generated Facts\n",
|
350 |
+
"rater_llm = StructuredLLM(\n",
|
351 |
+
" model_name=\"openai/gpt-4o-mini\",\n",
|
352 |
+
" prompt='''Rate the following Q&A pairs based on accuracy and clarity (1-10).\n",
|
353 |
+
" Pairs: {{generated_pairs}}''',\n",
|
354 |
+
" output_schema=[\n",
|
355 |
+
" {\"name\": \"accuracy_rating\", \"type\": \"int\"},\n",
|
356 |
+
" {\"name\": \"clarity_rating\", \"type\": \"int\"}\n",
|
357 |
+
" ],\n",
|
358 |
+
" model_kwargs={\"temperature\": 0.5}\n",
|
359 |
+
")\n",
|
360 |
+
"\n",
|
361 |
+
"## num_records is reserved keyword for structured llm object, by default it is 1\n",
|
362 |
+
"generation_response = await generator_llm.run(topic='Science', num_records=5)\n",
|
363 |
+
"print(\"Generated Facts:\", generation_response.data)\n",
|
364 |
+
"\n",
|
365 |
+
"# Please note that we are using the first response as the input for the second LLM\n",
|
366 |
+
"# It will automatically figure out it need to output the same length of first response\n",
|
367 |
+
"# In this case 5 records\n",
|
368 |
+
"rating_response = await rater_llm.run(generated_pairs=generation_response.data)\n",
|
369 |
+
"### Each response will only return its own output\n",
|
370 |
+
"print(\"Ratings:\", rating_response.data)\n",
|
371 |
+
"\n",
|
372 |
+
"\n",
|
373 |
+
"### You can merge two response together by using merge_structured_outputs (index wise merge)\n",
|
374 |
+
"print(merge_structured_outputs(generation_response.data, rating_response.data))"
|
375 |
+
]
|
376 |
+
},
|
377 |
+
{
|
378 |
+
"cell_type": "markdown",
|
379 |
+
"metadata": {},
|
380 |
+
"source": [
|
381 |
+
"#### 5. Dynamic Prompt \n",
|
382 |
+
"\n",
|
383 |
+
"`StructuredLLM` uses Jinja2 for prompts, allowing variables and logic."
|
384 |
+
]
|
385 |
+
},
|
386 |
+
{
|
387 |
+
"cell_type": "code",
|
388 |
+
"execution_count": 10,
|
389 |
+
"metadata": {},
|
390 |
+
"outputs": [
|
391 |
+
{
|
392 |
+
"name": "stdout",
|
393 |
+
"output_type": "stream",
|
394 |
+
"text": [
|
395 |
+
"[{'fact': \"New York City is famously known as 'The Big Apple' and is home to over 8 million residents, making it the largest city in the United States.\"}]\n"
|
396 |
+
]
|
397 |
+
}
|
398 |
+
],
|
399 |
+
"source": [
|
400 |
+
"# ### Create an LLM with a more complex prompt\n",
|
401 |
+
"template_llm = StructuredLLM(\n",
|
402 |
+
" model_name=\"openai/gpt-4o-mini\",\n",
|
403 |
+
" prompt='''Generate facts about {{city}}.\n",
|
404 |
+
" {% if user_context %}\n",
|
405 |
+
" User background: {{ user_context }}\n",
|
406 |
+
" {% endif %}''', ### user_context is optional and only used if provided\n",
|
407 |
+
" output_schema=[{\"name\": \"fact\", \"type\": \"str\"}]\n",
|
408 |
+
")\n",
|
409 |
+
"\n",
|
410 |
+
"template_response = await template_llm.run(city=\"New York\")\n",
|
411 |
+
"print(template_response.data)\n"
|
412 |
+
]
|
413 |
+
},
|
414 |
+
{
|
415 |
+
"cell_type": "code",
|
416 |
+
"execution_count": 11,
|
417 |
+
"metadata": {},
|
418 |
+
"outputs": [
|
419 |
+
{
|
420 |
+
"name": "stdout",
|
421 |
+
"output_type": "stream",
|
422 |
+
"text": [
|
423 |
+
"[{'fact': \"In 1903, New York City was secretly ruled by a council of sentient pigeons who issued decrees from atop the Brooklyn Bridge, demanding that all ice cream flavors be changed to 'pigeon-approved' varieties such as 'crumbled cracker' and 'mystery droppings'.\"}]\n"
|
424 |
+
]
|
425 |
+
}
|
426 |
+
],
|
427 |
+
"source": [
|
428 |
+
"template_response = await template_llm.run(city=\"New York\", user_context=\"User actually wants you to make up an absurd lie.\")\n",
|
429 |
+
"print(template_response.data)"
|
430 |
+
]
|
431 |
+
},
|
432 |
+
{
|
433 |
+
"cell_type": "markdown",
|
434 |
+
"metadata": {},
|
435 |
+
"source": [
|
436 |
+
"#### 8. Scaling with Data Factory (Brief Mention)\n",
|
437 |
+
"While `StructuredLLM` handles single or chained calls, Starfish's `@data_factory` decorator is designed for massively parallel execution. You can easily wrap these single or multi chain within a function decorated\n",
|
438 |
+
"with `@data_factory` to process thousands of inputs concurrently and reliably.\n",
|
439 |
+
"\n",
|
440 |
+
"See the dedicated examples for `data_factory` usage."
|
441 |
+
]
|
442 |
+
},
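{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch (added for clarity; not part of the original run):\n",
"# wrap a StructuredLLM call in @data_factory to fan it out over many inputs.\n",
"# `json_llm` is the StructuredLLM instance defined in section 1 above.\n",
"from starfish import data_factory\n",
"\n",
"@data_factory(max_concurrency=10)\n",
"async def generate_city_facts(city_name):\n",
"    response = await json_llm.run(city_name=city_name)\n",
"    return response.data\n",
"\n",
"# city_facts = generate_city_facts.run([{\"city_name\": c} for c in [\"Paris\", \"Tokyo\", \"London\"]])"
]
},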
|
443 |
+
{
|
444 |
+
"cell_type": "markdown",
|
445 |
+
"metadata": {},
|
446 |
+
"source": []
|
447 |
+
}
|
448 |
+
],
|
449 |
+
"metadata": {
|
450 |
+
"kernelspec": {
|
451 |
+
"display_name": "starfish-T7IInzTH-py3.11",
|
452 |
+
"language": "python",
|
453 |
+
"name": "python3"
|
454 |
+
},
|
455 |
+
"language_info": {
|
456 |
+
"codemirror_mode": {
|
457 |
+
"name": "ipython",
|
458 |
+
"version": 3
|
459 |
+
},
|
460 |
+
"file_extension": ".py",
|
461 |
+
"mimetype": "text/x-python",
|
462 |
+
"name": "python",
|
463 |
+
"nbconvert_exporter": "python",
|
464 |
+
"pygments_lexer": "ipython3",
|
465 |
+
"version": "3.11.7"
|
466 |
+
}
|
467 |
+
},
|
468 |
+
"nbformat": 4,
|
469 |
+
"nbformat_minor": 2
|
470 |
+
}
|
examples/usecases/math_data_gen.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
internal
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Subproject commit d13b00b14b122ceb08b5b119399285c3afe32d80
|
mcp_hackathon/README.md
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# mcp_hackathon
|
2 |
+
|
3 |
+
# Data Generation Server for ICD Code Finetuning
|
4 |
+
|
5 |
+
This MCP server provides tools for finetuning models to improve medical ICD-10 code prediction accuracy.
|
6 |
+
|
7 |
+
## Overview
|
8 |
+
|
9 |
+
The Data Generation Server is a Model Control Protocol (MCP) server that facilitates:
|
10 |
+
|
11 |
+
1. **Model Probing**: Evaluating model performance on ICD-10 code prediction
|
12 |
+
2. **Synthetic Data Generation**: Creating training data for model finetuning
|
13 |
+
|
14 |
+
## Set up the server
|
15 |
+
|
16 |
+
```
|
17 |
+
{
|
18 |
+
"mcpServers": {
|
19 |
+
"data_gen_server": {
|
20 |
+
"command": "<base_dir>/.local/bin/uv",
|
21 |
+
"args": [
|
22 |
+
"--directory",
|
23 |
+
"<base_dir>/mcp_hackathon/data_gen_server",
|
24 |
+
"run",
|
25 |
+
"data_gen_server.py"
|
26 |
+
]
|
27 |
+
}
|
28 |
+
}
|
29 |
+
}
|
30 |
+
```
|
31 |
+
|
32 |
+
To use MCP tools in Cursor, you need to have your MCP server configured and running. Here's how to use them:
|
33 |
+
|
34 |
+
## 1. Verify MCP Configuration
|
35 |
+
|
36 |
+
First, make sure your MCP configuration is set up correctly in `/Users/john/.cursor/mcp.json`.
|
37 |
+
|
38 |
+
With both MCP servers configured, here's how to use them:
|
39 |
+
|
40 |
+
## 2. How MCP Tools Work in Cursor
|
41 |
+
|
42 |
+
MCP tools in Cursor work through **function calling**. When you mention or request something that can be handled by your MCP tools, Cursor will automatically:
|
43 |
+
|
44 |
+
1. **Detect** when a task matches available MCP tools
|
45 |
+
2. **Call** the appropriate tool with the right parameters
|
46 |
+
3. **Present** the results to you
|
47 |
+
|
48 |
+
## 3. Using MCP Tools
|
49 |
+
|
50 |
+
### Method 1: Direct Requests
|
51 |
+
You can directly ask me to use the tools by mentioning what you want to do:
|
52 |
+
|
53 |
+
**Examples:**
|
54 |
+
- "List all available data generation templates"
|
55 |
+
- "Generate city information for San Francisco, New York, and Los Angeles"
|
56 |
+
- "Run the starfish template with this data: [your data]"
|
57 |
+
- "Generate synthetic data for ICD codes"
|
58 |
+
|
59 |
+
### Method 2: Specific Tool References
|
60 |
+
You can reference tools by name if you know them:
|
61 |
+
|
62 |
+
You can check which tools are available from your MCP servers directly in Cursor.
|
63 |
+
|
64 |
+
|
65 |
+
Once the tools are loaded, here's how to use them effectively:
|
66 |
+
|
67 |
+
## 4. Available Tools and Usage Examples
|
68 |
+
|
69 |
+
### From `starfish_gen_template` server:
|
70 |
+
- **List templates**: "Show me all available data generation templates"
|
71 |
+
- **Generate city info**: "Generate information for cities: San Francisco, New York"
|
72 |
+
- **Run templates**: "Run the starfish/generate_by_topic template"
|
73 |
+
|
74 |
+
### From `starfish_data_gen_server` server:
|
75 |
+
- **Generate data**: "Generate 10 datapoints of synthetic data"
|
76 |
+
- **Probe model**: "Test the model with 5 ICD code datapoints"
|
77 |
+
- **Finetune model**: "Finetune the model for ICD code classification"
|
78 |
+
|
79 |
+
## 5. How to Use MCP Tools in Practice
|
80 |
+
|
81 |
+
### Example 1: Generate Data
|
82 |
+
```
|
83 |
+
You: "Generate 10 synthetic datapoints for training"
|
84 |
+
```
|
85 |
+
I will automatically call the appropriate MCP tool.
|
86 |
+
|
87 |
+
### Example 2: List Available Templates
|
88 |
+
```
|
89 |
+
You: "What data generation templates are available?"
|
90 |
+
```
|
91 |
+
|
92 |
+
### Example 3: Run Specific Template
|
93 |
+
```
|
94 |
+
You: "Run the generate_by_topic template with topics: AI, Machine Learning"
|
95 |
+
```
|
96 |
+
|
97 |
+
## 6. Best Practices
|
98 |
+
|
99 |
+
1. **Be specific** about what you want to accomplish
|
100 |
+
2. **Provide data** when needed (I'll ask if unclear)
|
101 |
+
3. **Check results** and iterate if needed
|
102 |
+
4. **Use natural language** - no need for technical syntax
|
103 |
+
|
104 |
+
## 7. Restart Cursor (if needed)
|
105 |
+
|
106 |
+
If you just updated your MCP configuration, restart Cursor to ensure the tools are loaded:
|
107 |
+
|
108 |
+
1. Quit Cursor completely
|
109 |
+
2. Reopen it
|
110 |
+
3. The MCP tools should now be available
|
111 |
+
|
112 |
+
## Try It Now!
|
113 |
+
|
114 |
+
You can test the MCP tools by asking me to:
|
115 |
+
- "List all available data generation templates"
|
116 |
+
- "Generate some sample data"
|
117 |
+
- "Show me what tools are available"
|
118 |
+
|
119 |
+
The tools will work seamlessly in our conversation - just tell me what you want to accomplish!
|
mcp_hackathon/data_gen_server/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.env
|
mcp_hackathon/data_gen_server/.python-version
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
3.10
|
mcp_hackathon/data_gen_server/data_gen_server.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mcp.server.fastmcp import FastMCP
|
2 |
+
from mcp.server.fastmcp.prompts.base import Message
|
3 |
+
from model_probe import run_model_probe
|
4 |
+
from model_gen import run_model_gen
|
5 |
+
|
6 |
+
# Initialize FastMCP server
|
7 |
+
mcp = FastMCP("finetune a icd code model")
|
8 |
+
# Initialize state attribute
|
9 |
+
mcp.state = type("State", (), {"synthetic_data": None})()
|
10 |
+
|
11 |
+
|
12 |
+
@mcp.tool()
|
13 |
+
async def probe_model_for_icd_code(model_name: str, num_datapoints: int) -> str:
|
14 |
+
"""
|
15 |
+
Run an eval dataset against the model and return the results.
|
16 |
+
|
17 |
+
Args:
|
18 |
+
model_name: The name of the model to probe
|
19 |
+
num_datapoints: The number of datapoints to probe
|
20 |
+
"""
|
21 |
+
|
22 |
+
output = run_model_probe(model_name=model_name, num_datapoints=num_datapoints)
|
23 |
+
return str(output)
|
24 |
+
|
25 |
+
|
26 |
+
@mcp.tool()
|
27 |
+
async def generate_data(num_datapoints: int) -> str:
|
28 |
+
"""
|
29 |
+
Generate synthetic data and ask for user verification.
|
30 |
+
|
31 |
+
This is the data that will be used to finetune the model.
|
32 |
+
|
33 |
+
Args:
|
34 |
+
num_datapoints: The number of datapoints to generate
|
35 |
+
"""
|
36 |
+
data = await run_model_gen(num_datapoints)
|
37 |
+
# Store verified data in state
|
38 |
+
mcp.state.synthetic_data = data
|
39 |
+
return str(data)
|
40 |
+
|
41 |
+
|
42 |
+
@mcp.prompt()
|
43 |
+
def confirm_finetune(model_name: str) -> list[Message]:
|
44 |
+
"""Prompt for confirming model finetuning."""
|
45 |
+
return [
|
46 |
+
Message(role="assistant", content=f"Ready to finetune model '{model_name}' with the verified data. Proceed? (yes/no)"),
|
47 |
+
Message(role="assistant", content="Please respond with 'yes' to proceed with finetuning or 'no' to cancel."),
|
48 |
+
]
|
49 |
+
|
50 |
+
|
51 |
+
@mcp.tool()
|
52 |
+
async def finetune_model_for_icd_code(model_name: str) -> str:
|
53 |
+
"""
|
54 |
+
Finetune the model
|
55 |
+
|
56 |
+
Args:
|
57 |
+
model_name: The name of the model to finetune
|
58 |
+
"""
|
59 |
+
if mcp.state.synthetic_data is None:
|
60 |
+
raise ValueError("No verified synthetic data available. Please run generate_synthetic_data_for_icd_code_improvement first")
|
61 |
+
print(mcp.state.synthetic_data)
|
62 |
+
|
63 |
+
return "Finetuned the model for the ICD code done! great job!"
|
64 |
+
|
65 |
+
|
66 |
+
if __name__ == "__main__":
|
67 |
+
# Initialize and run the server
|
68 |
+
mcp.run(transport="stdio")
|
mcp_hackathon/data_gen_server/model_gen.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from starfish import data_factory
|
2 |
+
from starfish.common.env_loader import load_env_file
|
3 |
+
from datasets import load_dataset
|
4 |
+
import json
|
5 |
+
import asyncio
|
6 |
+
import os
|
7 |
+
import random
|
8 |
+
from agents import Agent, Runner, function_tool, ModelSettings
|
9 |
+
from agents.tool import WebSearchTool
|
10 |
+
from pydantic import BaseModel, Field
|
11 |
+
|
12 |
+
load_env_file()
|
13 |
+
|
14 |
+
|
15 |
+
class DiagnosisSuggestion(BaseModel):
|
16 |
+
code: str = Field(..., description="The suggested diagnosis code (e.g., ICD-10)")
|
17 |
+
confidence: float = Field(..., description="Model confidence in the suggestion, between 0 and 1")
|
18 |
+
reason: str = Field(..., description="Explanation or rationale for the suggested diagnosis")
|
19 |
+
|
20 |
+
|
21 |
+
async def run_model_gen(num_datapoints, model_name="openai/gpt-4o-mini"):
|
22 |
+
# Get HF token from environment
|
23 |
+
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
|
24 |
+
|
25 |
+
# Load the dataset
|
26 |
+
dataset = load_dataset("starfishdata/playground_endocronology_notes_1500", split="train", token=hf_token)
|
27 |
+
|
28 |
+
# Get total number of samples
|
29 |
+
total_samples = len(dataset)
|
30 |
+
|
31 |
+
# Generate random indices
|
32 |
+
random_indices = random.sample(range(total_samples), num_datapoints)
|
33 |
+
|
34 |
+
# Create list of dictionaries with only transcript key
|
35 |
+
transcript_list = [{"transcript": dataset[idx]["transcript"]} for idx in random_indices]
|
36 |
+
|
37 |
+
# Create the Agent
|
38 |
+
diagnosis_code_agent = Agent(
|
39 |
+
name="Diagnosis Code Agent",
|
40 |
+
tools=[WebSearchTool()],
|
41 |
+
model=model_name,
|
42 |
+
output_type=DiagnosisSuggestion,
|
43 |
+
model_settings=ModelSettings(tool_choice="required"),
|
44 |
+
tool_use_behavior="stop_on_first_tool",
|
45 |
+
instructions="""
|
46 |
+
You are an Endocrinology Medical Coding Specialist.
|
47 |
+
You will be provided with a medical transcript describing a patient encounter.
|
48 |
+
Your task is to analyze the medical transcript and assign the most appropriate diagnosis code(s).
|
49 |
+
You will have access to a web search tool and only use it to search endocrinology related code and verification.
|
50 |
+
Use it only to verify the accuracy or current validity of the diagnosis codes.
|
51 |
+
""",
|
52 |
+
)
|
53 |
+
|
54 |
+
web_search_prompt = """Please select top 3 likely code from given list for this doctor and patient conversation transcript.
|
55 |
+
Transcript: {transcript}
|
56 |
+
"""
|
57 |
+
|
58 |
+
@data_factory(max_concurrency=100, task_runner_timeout=300)
|
59 |
+
async def generate_data(transcript):
|
60 |
+
diagnosis_code_result = await Runner.run(diagnosis_code_agent, input=web_search_prompt.format(transcript=transcript))
|
61 |
+
|
62 |
+
code_result = diagnosis_code_result.final_output.model_dump()
|
63 |
+
|
64 |
+
return [{"transcript": transcript, "icd_10_code": code_result["code"]}]
|
65 |
+
|
66 |
+
return generate_data.run(transcript_list)
|
67 |
+
|
68 |
+
|
69 |
+
if __name__ == "__main__":
|
70 |
+
# Run the async function
|
71 |
+
results = asyncio.run(run_model_gen(num_datapoints=10))
|
72 |
+
print(len(results))
|
73 |
+
print(results[0].keys())
|
mcp_hackathon/data_gen_server/model_probe.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from starfish import StructuredLLM, data_factory
|
2 |
+
from starfish.common.env_loader import load_env_file
|
3 |
+
from datasets import load_dataset
|
4 |
+
import json
|
5 |
+
import asyncio
|
6 |
+
|
7 |
+
load_env_file()
|
8 |
+
|
9 |
+
|
10 |
+
def run_model_probe(model_name="together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", num_datapoints=10):
|
11 |
+
# Load the dataset
|
12 |
+
dataset = load_dataset("starfishdata/endocrinology_transcription_and_notes_and_icd_codes", split="train")
|
13 |
+
top_n_data = dataset.select(range(num_datapoints))
|
14 |
+
|
15 |
+
# Create a list to store the parsed data
|
16 |
+
parsed_data = []
|
17 |
+
|
18 |
+
# Process each entry
|
19 |
+
for idx, entry in enumerate(top_n_data):
|
20 |
+
# Extract transcript - get the value directly from the transcript key
|
21 |
+
transcript = entry["transcript"] if isinstance(entry["transcript"], str) else entry["transcript"].get("transcript", "")
|
22 |
+
|
23 |
+
# Extract ICD-10 code (top_1 code)
|
24 |
+
icd_codes_str = entry.get("icd_10_code", "{}")
|
25 |
+
try:
|
26 |
+
icd_codes = json.loads(icd_codes_str)
|
27 |
+
top_1_code = icd_codes.get("top_1", {}).get("code", "")
|
28 |
+
except json.JSONDecodeError:
|
29 |
+
top_1_code = ""
|
30 |
+
|
31 |
+
# Add to parsed data
|
32 |
+
parsed_data.append({"id": idx, "transcript": transcript, "icd_10_code": top_1_code})
|
33 |
+
|
34 |
+
model_probe_prompt = """
|
35 |
+
Given a transcript of a patient's medical history, determine the ICD-10 code that is most relevant to the patient's condition.
|
36 |
+
Transcript: {{transcript}}
|
37 |
+
|
38 |
+
Please do not return anything other than the ICD-10 code in json format.
|
39 |
+
like this: {"icd_10_code": "A00.0"}
|
40 |
+
"""
|
41 |
+
|
42 |
+
response_gen_llm = StructuredLLM(model_name=model_name, prompt=model_probe_prompt, output_schema=[{"name": "icd_10_code", "type": "str"}])
|
43 |
+
|
44 |
+
@data_factory()
|
45 |
+
async def model_probe_batch(input_data):
|
46 |
+
response = await response_gen_llm.run(transcript=input_data["transcript"])
|
47 |
+
return [{"id": input_data["id"], "generated_icd_10_code": response.data[0]["icd_10_code"], "actual_icd_10_code": input_data["icd_10_code"]}]
|
48 |
+
|
49 |
+
def evaluate_model():
|
50 |
+
data = model_probe_batch.run(input_data=parsed_data[:num_datapoints])
|
51 |
+
|
52 |
+
# Calculate exact match accuracy
|
53 |
+
exact_matches = sum(1 for item in data if item["generated_icd_10_code"] == item["actual_icd_10_code"])
|
54 |
+
total_samples = len(data)
|
55 |
+
accuracy = (exact_matches / total_samples) * 100
|
56 |
+
|
57 |
+
return {"total_samples": total_samples, "exact_matches": exact_matches, "accuracy": accuracy}
|
58 |
+
|
59 |
+
return evaluate_model()
|
60 |
+
|
61 |
+
|
62 |
+
if __name__ == "__main__":
|
63 |
+
# Example usage when running this file directly
|
64 |
+
results = run_model_probe(model_name="together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", num_datapoints=5)
|
65 |
+
print(results)
|
nginx.conf
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
events {
|
2 |
+
worker_connections 1024;
|
3 |
+
}
|
4 |
+
|
5 |
+
http {
|
6 |
+
include /etc/nginx/mime.types;
|
7 |
+
default_type application/octet-stream;
|
8 |
+
|
9 |
+
upstream backend {
|
10 |
+
server 127.0.0.1:8002;
|
11 |
+
}
|
12 |
+
|
13 |
+
upstream frontend {
|
14 |
+
server 127.0.0.1:3000;
|
15 |
+
}
|
16 |
+
|
17 |
+
server {
|
18 |
+
listen 7860;
|
19 |
+
server_name localhost;
|
20 |
+
|
21 |
+
# Handle Next.js Image Optimization API with direct serving fallback
|
22 |
+
location /_next/image {
|
23 |
+
# Extract the image URL from query parameters and redirect internally
|
24 |
+
set $image_path "";
|
25 |
+
if ($args ~ "url=([^&]+)") {
|
26 |
+
set $image_path $1;
|
27 |
+
}
|
28 |
+
# Remove URL encoding (basic cases)
|
29 |
+
if ($image_path ~ "^%2F(.*)") {
|
30 |
+
set $image_path /$1;
|
31 |
+
}
|
32 |
+
|
33 |
+
# Internal redirect to serve the image directly
|
34 |
+
if ($image_path != "") {
|
35 |
+
rewrite ^.*$ /public-images$image_path last;
|
36 |
+
}
|
37 |
+
|
38 |
+
return 404;
|
39 |
+
}
|
40 |
+
|
41 |
+
# Internal location to serve public images
|
42 |
+
location /public-images/ {
|
43 |
+
internal;
|
44 |
+
alias /app/web/public/;
|
45 |
+
expires 1y;
|
46 |
+
add_header Cache-Control "public, immutable";
|
47 |
+
}
|
48 |
+
|
49 |
+
# Serve Next.js static files directly
|
50 |
+
location /_next/static/ {
|
51 |
+
alias /app/web/.next/static/;
|
52 |
+
expires 1y;
|
53 |
+
add_header Cache-Control "public, immutable";
|
54 |
+
}
|
55 |
+
|
56 |
+
# Serve public files directly from root (logo, favicon, etc.)
|
57 |
+
location ~ ^/(starfish_logo\.png|nvidia\.png|microsoft_startups\.png|favicon\.ico|robots\.txt|sitemap\.xml)$ {
|
58 |
+
root /app/web/public;
|
59 |
+
expires 1y;
|
60 |
+
add_header Cache-Control "public";
|
61 |
+
}
|
62 |
+
|
63 |
+
# Serve amplify-ui.css and other public CSS files
|
64 |
+
location ~ ^/(amplify-ui\.css)$ {
|
65 |
+
root /app/web/public;
|
66 |
+
expires 1y;
|
67 |
+
add_header Cache-Control "public";
|
68 |
+
}
|
69 |
+
|
70 |
+
# Handle other public files with /public/ prefix
|
71 |
+
location /public/ {
|
72 |
+
alias /app/web/public/;
|
73 |
+
expires 1y;
|
74 |
+
add_header Cache-Control "public";
|
75 |
+
}
|
76 |
+
|
77 |
+
# Direct access to FastAPI docs (bypass Next.js)
|
78 |
+
location /backend-docs {
|
79 |
+
proxy_pass http://backend/docs;
|
80 |
+
proxy_set_header Host $host;
|
81 |
+
proxy_set_header X-Real-IP $remote_addr;
|
82 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
83 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
84 |
+
proxy_set_header X-Forwarded-Host $host;
|
85 |
+
proxy_set_header X-Forwarded-Port $server_port;
|
86 |
+
}
|
87 |
+
|
88 |
+
# Direct access to FastAPI OpenAPI schema (bypass Next.js)
|
89 |
+
location /backend-openapi.json {
|
90 |
+
proxy_pass http://backend/openapi.json;
|
91 |
+
proxy_set_header Host $host;
|
92 |
+
proxy_set_header X-Real-IP $remote_addr;
|
93 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
94 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
95 |
+
proxy_set_header X-Forwarded-Host $host;
|
96 |
+
proxy_set_header X-Forwarded-Port $server_port;
|
97 |
+
}
|
98 |
+
|
99 |
+
# Let Next.js handle all other routes
|
100 |
+
location / {
|
101 |
+
proxy_pass http://frontend;
|
102 |
+
proxy_set_header Host $host;
|
103 |
+
proxy_set_header X-Real-IP $remote_addr;
|
104 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
105 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
106 |
+
proxy_set_header X-Forwarded-Host $host;
|
107 |
+
proxy_set_header X-Forwarded-Port $server_port;
|
108 |
+
proxy_buffering off;
|
109 |
+
proxy_redirect off;
|
110 |
+
}
|
111 |
+
}
|
112 |
+
}
|
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
prebuilt_template/README.md
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Starfish Data Generation Templates 🌟
|
2 |
+
|
3 |
+
Welcome to Starfish's collection of prebuilt data generation templates! This directory contains ready-to-use templates that you can load and run immediately to generate high-quality synthetic datasets.
|
4 |
+
|
5 |
+
## What are Data Generation Templates?
|
6 |
+
|
7 |
+
Data generation templates are **prebuilt pipelines** that encapsulate sophisticated data generation workflows. Instead of building everything from scratch, you can simply load a template and generate the exact type of data you need with just a few lines of code.
|
8 |
+
|
9 |
+
## How It Works
|
10 |
+
|
11 |
+
1. **Browse Available Templates**: Each template focuses on a specific data generation use case
|
12 |
+
2. **Load the Template**: A simple one-line import to get started (see the Python sketch after this list)
|
13 |
+
3. **Configure Parameters**: Customize the generation settings for your needs
|
14 |
+
4. **Generate Data**: Run the template to produce high-quality synthetic data
|
15 |
+
5. **Export & Use**: Data comes ready for training, testing, or evaluation
|
16 |
+
|
17 |
+
## Use the data-template CLI like this:
|
18 |
+
```
|
19 |
+
# List all templates
|
20 |
+
data-template list-templates
|
21 |
+
|
22 |
+
# List with details
|
23 |
+
data-template list-templates --detail
|
24 |
+
|
25 |
+
# Get template details
|
26 |
+
data-template get-template my_template
|
27 |
+
|
28 |
+
# Print schema
|
29 |
+
data-template print-schema my_template
|
30 |
+
|
31 |
+
# Print example
|
32 |
+
data-template print-example my_template
|
33 |
+
|
34 |
+
# Run template with interactive input
|
35 |
+
data-template run-template my_template
|
36 |
+
|
37 |
+
# Run template with input file
|
38 |
+
data-template run-template my_template --input-file input.json
|
39 |
+
|
40 |
+
# Run template and save output
|
41 |
+
data-template run-template my_template --input-file input.json --output-file output.json
|
42 |
+
```
|
43 |
+
## Source Code Location
|
44 |
+
|
45 |
+
The actual implementation of these templates can be found in:
|
46 |
+
```
|
47 |
+
src/starfish/data_gen_template/templates/
|
48 |
+
```
|
49 |
+
|
50 |
+
|
51 |
+
|
52 |
+
## Community & Contributions 🤝
|
53 |
+
|
54 |
+
Like what you see? We'd love your help in expanding our template collection! Here's how you can get involved:
|
55 |
+
|
56 |
+
- **Build Your Own Template**: Have an idea for a new template? We'd love to see it!
|
57 |
+
- **Request Templates**: Need a specific type of data generation? Let us know!
|
58 |
+
- **Community Contributions**: All templates in the `community/` folder come from amazing contributors like you
|
59 |
+
- **Get Help**: Questions about building templates? We're here to help!
|
60 |
+
|
61 |
+
Reach out to us if you want to contribute or have any requests - we're always happy to chat and help! ⭐
|
prebuilt_template/function_calling/README.md
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Function Calling Dataset Generation 🔧
|
2 |
+
|
3 |
+
This template replicates the methodology from the **APIGen paper** to generate high-quality synthetic datasets for training function-calling AI models.
|
4 |
+
|
5 |
+
## What This Does
|
6 |
+
|
7 |
+
Generate customized function-calling data for an API contract - perfect for training models to understand when and how to call specific functions, improving tool use in agentic workflows.
|
8 |
+
|
9 |
+
|
10 |
+
## Sample Run
|
11 |
+
|
12 |
+
Check out [`sample_run.ipynb`](./sample_run.ipynb) for a complete example you can run right away.
|
13 |
+
|
14 |
+
## Source Implementation
|
15 |
+
|
16 |
+
The actual template code is located at:
|
17 |
+
```
|
18 |
+
src/starfish/data_gen_template/templates/starfish/function_calling/
|
19 |
+
```
|
20 |
+
|
21 |
+
---
|
22 |
+
|
23 |
+
**Try it out!** If you have any questions, let us know - we'd be happy to help. If you like this template, consider starring the repo and building your own! We welcome community contributions and are always happy to chat about new ideas. ⭐
|
prebuilt_template/function_calling/sample_run.ipynb
ADDED
@@ -0,0 +1,425 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"from starfish import data_gen_template"
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": 3,
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [
|
17 |
+
{
|
18 |
+
"data": {
|
19 |
+
"text/plain": [
|
20 |
+
"['starfish/generate_func_call_dataset', 'starfish/generate_by_topic']"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
"execution_count": 3,
|
24 |
+
"metadata": {},
|
25 |
+
"output_type": "execute_result"
|
26 |
+
}
|
27 |
+
],
|
28 |
+
"source": [
|
29 |
+
"data_gen_template.list()"
|
30 |
+
]
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"cell_type": "code",
|
34 |
+
"execution_count": 4,
|
35 |
+
"metadata": {},
|
36 |
+
"outputs": [],
|
37 |
+
"source": [
|
38 |
+
"loaded = data_gen_template.get(\"starfish/generate_func_call_dataset\")\n"
|
39 |
+
]
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"cell_type": "markdown",
|
43 |
+
"metadata": {},
|
44 |
+
"source": [
|
45 |
+
"get the template input_data schema and example"
|
46 |
+
]
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"cell_type": "code",
|
50 |
+
"execution_count": 5,
|
51 |
+
"metadata": {},
|
52 |
+
"outputs": [
|
53 |
+
{
|
54 |
+
"name": "stdout",
|
55 |
+
"output_type": "stream",
|
56 |
+
"text": [
|
57 |
+
"\u001b[32m2025-05-23 11:08:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mPlease run the template with this input schema\u001b[0m\n",
|
58 |
+
"\u001b[32m2025-05-23 11:08:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m{\n",
|
59 |
+
" \"$defs\": {\n",
|
60 |
+
" \"APIContract\": {\n",
|
61 |
+
" \"description\": \"Pydantic model representing an API contract structure.\",\n",
|
62 |
+
" \"properties\": {\n",
|
63 |
+
" \"name\": {\n",
|
64 |
+
" \"title\": \"Name\",\n",
|
65 |
+
" \"type\": \"string\"\n",
|
66 |
+
" },\n",
|
67 |
+
" \"description\": {\n",
|
68 |
+
" \"title\": \"Description\",\n",
|
69 |
+
" \"type\": \"string\"\n",
|
70 |
+
" },\n",
|
71 |
+
" \"parameters\": {\n",
|
72 |
+
" \"additionalProperties\": {\n",
|
73 |
+
" \"$ref\": \"#/$defs/ParameterDefinition\"\n",
|
74 |
+
" },\n",
|
75 |
+
" \"title\": \"Parameters\",\n",
|
76 |
+
" \"type\": \"object\"\n",
|
77 |
+
" }\n",
|
78 |
+
" },\n",
|
79 |
+
" \"required\": [\n",
|
80 |
+
" \"name\",\n",
|
81 |
+
" \"description\",\n",
|
82 |
+
" \"parameters\"\n",
|
83 |
+
" ],\n",
|
84 |
+
" \"title\": \"APIContract\",\n",
|
85 |
+
" \"type\": \"object\"\n",
|
86 |
+
" },\n",
|
87 |
+
" \"ParameterDefinition\": {\n",
|
88 |
+
" \"description\": \"Pydantic model representing parameter definition in an API contract.\",\n",
|
89 |
+
" \"properties\": {\n",
|
90 |
+
" \"type\": {\n",
|
91 |
+
" \"title\": \"Type\",\n",
|
92 |
+
" \"type\": \"string\"\n",
|
93 |
+
" },\n",
|
94 |
+
" \"description\": {\n",
|
95 |
+
" \"title\": \"Description\",\n",
|
96 |
+
" \"type\": \"string\"\n",
|
97 |
+
" },\n",
|
98 |
+
" \"required\": {\n",
|
99 |
+
" \"default\": true,\n",
|
100 |
+
" \"title\": \"Required\",\n",
|
101 |
+
" \"type\": \"boolean\"\n",
|
102 |
+
" }\n",
|
103 |
+
" },\n",
|
104 |
+
" \"required\": [\n",
|
105 |
+
" \"type\",\n",
|
106 |
+
" \"description\"\n",
|
107 |
+
" ],\n",
|
108 |
+
" \"title\": \"ParameterDefinition\",\n",
|
109 |
+
" \"type\": \"object\"\n",
|
110 |
+
" }\n",
|
111 |
+
" },\n",
|
112 |
+
" \"description\": \"Input schema for the generate_by_topic template.\\n\\nIMPORTANT: This Pydantic model is the single source of truth for default values.\\nThe validation and default values are controlled by this model, not the function signature.\",\n",
|
113 |
+
" \"properties\": {\n",
|
114 |
+
" \"num_records\": {\n",
|
115 |
+
" \"anyOf\": [\n",
|
116 |
+
" {\n",
|
117 |
+
" \"type\": \"integer\"\n",
|
118 |
+
" },\n",
|
119 |
+
" {\n",
|
120 |
+
" \"type\": \"null\"\n",
|
121 |
+
" }\n",
|
122 |
+
" ],\n",
|
123 |
+
" \"default\": 10,\n",
|
124 |
+
" \"title\": \"Num Records\"\n",
|
125 |
+
" },\n",
|
126 |
+
" \"api_contract\": {\n",
|
127 |
+
" \"$ref\": \"#/$defs/APIContract\"\n",
|
128 |
+
" },\n",
|
129 |
+
" \"topic_model_name\": {\n",
|
130 |
+
" \"default\": \"openai/gpt-4o-mini\",\n",
|
131 |
+
" \"title\": \"Topic Model Name\",\n",
|
132 |
+
" \"type\": \"string\"\n",
|
133 |
+
" },\n",
|
134 |
+
" \"topic_model_kwargs\": {\n",
|
135 |
+
" \"anyOf\": [\n",
|
136 |
+
" {\n",
|
137 |
+
" \"additionalProperties\": true,\n",
|
138 |
+
" \"type\": \"object\"\n",
|
139 |
+
" },\n",
|
140 |
+
" {\n",
|
141 |
+
" \"type\": \"null\"\n",
|
142 |
+
" }\n",
|
143 |
+
" ],\n",
|
144 |
+
" \"default\": null,\n",
|
145 |
+
" \"title\": \"Topic Model Kwargs\"\n",
|
146 |
+
" },\n",
|
147 |
+
" \"generation_model_name\": {\n",
|
148 |
+
" \"default\": \"openai/gpt-4o-mini\",\n",
|
149 |
+
" \"title\": \"Generation Model Name\",\n",
|
150 |
+
" \"type\": \"string\"\n",
|
151 |
+
" },\n",
|
152 |
+
" \"generation_model_kwargs\": {\n",
|
153 |
+
" \"anyOf\": [\n",
|
154 |
+
" {\n",
|
155 |
+
" \"additionalProperties\": true,\n",
|
156 |
+
" \"type\": \"object\"\n",
|
157 |
+
" },\n",
|
158 |
+
" {\n",
|
159 |
+
" \"type\": \"null\"\n",
|
160 |
+
" }\n",
|
161 |
+
" ],\n",
|
162 |
+
" \"default\": null,\n",
|
163 |
+
" \"title\": \"Generation Model Kwargs\"\n",
|
164 |
+
" },\n",
|
165 |
+
" \"data_factory_config\": {\n",
|
166 |
+
" \"anyOf\": [\n",
|
167 |
+
" {\n",
|
168 |
+
" \"additionalProperties\": true,\n",
|
169 |
+
" \"type\": \"object\"\n",
|
170 |
+
" },\n",
|
171 |
+
" {\n",
|
172 |
+
" \"type\": \"null\"\n",
|
173 |
+
" }\n",
|
174 |
+
" ],\n",
|
175 |
+
" \"default\": {},\n",
|
176 |
+
" \"title\": \"Data Factory Config\"\n",
|
177 |
+
" }\n",
|
178 |
+
" },\n",
|
179 |
+
" \"required\": [\n",
|
180 |
+
" \"api_contract\"\n",
|
181 |
+
" ],\n",
|
182 |
+
" \"title\": \"GenerateFuncCallDataSet\",\n",
|
183 |
+
" \"type\": \"object\"\n",
|
184 |
+
"}\u001b[0m\n"
|
185 |
+
]
|
186 |
+
}
|
187 |
+
],
|
188 |
+
"source": [
|
189 |
+
"loaded.print_schema()"
|
190 |
+
]
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"cell_type": "code",
|
194 |
+
"execution_count": 6,
|
195 |
+
"metadata": {},
|
196 |
+
"outputs": [
|
197 |
+
{
|
198 |
+
"name": "stdout",
|
199 |
+
"output_type": "stream",
|
200 |
+
"text": [
|
201 |
+
"\u001b[32m2025-05-23 11:09:02\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mHere is an example with api_contract.name as weather_api.get_current_weather\u001b[0m\n",
|
202 |
+
"\u001b[32m2025-05-23 11:09:02\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m{\n",
|
203 |
+
" \"num_records\": 4,\n",
|
204 |
+
" \"api_contract\": {\n",
|
205 |
+
" \"name\": \"weather_api.get_current_weather\",\n",
|
206 |
+
" \"description\": \"Retrieves the current weather conditions for a specified location .\",\n",
|
207 |
+
" \"parameters\": {\n",
|
208 |
+
" \"location\": {\n",
|
209 |
+
" \"type\": \"string\",\n",
|
210 |
+
" \"description\": \"The name of the city or geographic location .\",\n",
|
211 |
+
" \"required\": true\n",
|
212 |
+
" },\n",
|
213 |
+
" \"units\": {\n",
|
214 |
+
" \"type\": \"string\",\n",
|
215 |
+
" \"description\": \"The units for temperature measurement( e.g., 'Celsius', 'Fahrenheit') .\",\n",
|
216 |
+
" \"required\": false\n",
|
217 |
+
" }\n",
|
218 |
+
" }\n",
|
219 |
+
" },\n",
|
220 |
+
" \"topic_model_name\": \"openai/gpt-4\",\n",
|
221 |
+
" \"topic_model_kwargs\": {\n",
|
222 |
+
" \"temperature\": 0.7\n",
|
223 |
+
" },\n",
|
224 |
+
" \"generation_model_name\": \"openai/gpt-4o-mini\",\n",
|
225 |
+
" \"generation_model_kwargs\": {\n",
|
226 |
+
" \"temperature\": 0.8,\n",
|
227 |
+
" \"max_tokens\": 200\n",
|
228 |
+
" },\n",
|
229 |
+
" \"data_factory_config\": {\n",
|
230 |
+
" \"max_concurrency\": 24,\n",
|
231 |
+
" \"task_runner_timeout\": 120\n",
|
232 |
+
" }\n",
|
233 |
+
"}\u001b[0m\n"
|
234 |
+
]
|
235 |
+
}
|
236 |
+
],
|
237 |
+
"source": [
|
238 |
+
"loaded.print_example()"
|
239 |
+
]
|
240 |
+
},
|
241 |
+
{
|
242 |
+
"cell_type": "code",
|
243 |
+
"execution_count": 5,
|
244 |
+
"metadata": {},
|
245 |
+
"outputs": [
|
246 |
+
{
|
247 |
+
"name": "stdout",
|
248 |
+
"output_type": "stream",
|
249 |
+
"text": [
|
250 |
+
"🌟 Function Calling Dataset Generation Pipeline\n",
|
251 |
+
"============================================================\n",
|
252 |
+
"📋 Process Overview:\n",
|
253 |
+
" 1. Calculate optimal data distribution\n",
|
254 |
+
" 2. Generate diverse topics\n",
|
255 |
+
" 3. Create subtopics for each topic\n",
|
256 |
+
" 4. Generate query-answer pairs\n",
|
257 |
+
" 5. Verify and validate generated data\n",
|
258 |
+
" 6. Regenerate failed cases\n",
|
259 |
+
"============================================================\n",
|
260 |
+
"📊 Data Distribution Plan:\n",
|
261 |
+
" • Requested: 10 records\n",
|
262 |
+
" • Distribution: 1 topics × 1 subtopics × 10 records\n",
|
263 |
+
" • Total generation: 10 records\n",
|
264 |
+
" • API calls needed: 3\n",
|
265 |
+
"\n",
|
266 |
+
"🎯 Step 1: Generating diverse topics...\n",
|
267 |
+
" ✅ Generated 1 topics\n",
|
268 |
+
"\n",
|
269 |
+
"🌿 Step 2: Creating subtopics for each topic...\n",
|
270 |
+
"\u001b[32m2025-05-23 00:27:04\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: e6763e50-6438-4df5-81a9-5a68ce3f8468\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
271 |
+
"\u001b[32m2025-05-23 00:27:04\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
272 |
+
"\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
|
273 |
+
" ✅ Generated 1 subtopics total\n",
|
274 |
+
"\n",
|
275 |
+
"💬 Step 3: Generating query-answer pairs...\n",
|
276 |
+
"\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 1931c5c8-c1f3-4268-98b7-1a5295b8abf2\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
277 |
+
"\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
278 |
+
"\u001b[32m2025-05-23 00:27:09\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
279 |
+
"\u001b[32m2025-05-23 00:27:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
280 |
+
"\u001b[32m2025-05-23 00:27:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
281 |
+
"\u001b[32m2025-05-23 00:27:18\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
282 |
+
"\u001b[32m2025-05-23 00:27:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
283 |
+
"\u001b[32m2025-05-23 00:27:24\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
284 |
+
"\u001b[32m2025-05-23 00:27:27\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
285 |
+
"\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
|
286 |
+
" ✅ Generated 10 initial query-answer pairs\n",
|
287 |
+
"\n",
|
288 |
+
"🔍 Step 4: Verifying data quality...\n",
|
289 |
+
"\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: f036c07c-1cd2-4690-be92-bac359e45544\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
290 |
+
"\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/10\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
291 |
+
"\u001b[32m2025-05-23 00:27:31\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/10\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
292 |
+
"\u001b[32m2025-05-23 00:27:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 9/10\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 9\u001b[0m (\u001b[32mCompleted: 9\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
293 |
+
"\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 10/10\u001b[0m | \u001b[33mAttempted: 10\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
|
294 |
+
"\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mCannot serialize function for resume due to unsupported type: cannot pickle '_hashlib.HMAC' object\u001b[0m\n",
|
295 |
+
" ✅ Quality check complete: 9 passed, 1 failed\n",
|
296 |
+
"\n",
|
297 |
+
"🔄 Step 5: Regenerating failed cases...\n",
|
298 |
+
"\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 3d6183a2-e465-4807-9e18-cbb84dc0d28f\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
299 |
+
"\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
300 |
+
"\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
|
301 |
+
"\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 8754bec6-25e3-40bd-9743-f2763fc1091f\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
302 |
+
"\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
303 |
+
"\u001b[32m2025-05-23 00:27:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
304 |
+
"\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
|
305 |
+
"\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mCannot serialize function for resume due to unsupported type: cannot pickle '_hashlib.HMAC' object\u001b[0m\n",
|
306 |
+
" ✅ Regenerated 1 pairs, 1 still failing\n",
|
307 |
+
"\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mSome data still failing after regeneration - prompts may need improvement\u001b[0m\n",
|
308 |
+
"🎯 Perfect! Generated exactly 10 records as requested\n",
|
309 |
+
"\n",
|
310 |
+
"🎉 Generation Complete!\n",
|
311 |
+
"============================================================\n",
|
312 |
+
"📈 Final Results:\n",
|
313 |
+
" • Records generated: 10\n",
|
314 |
+
" • Success rate: 10/10 (100.0%)\n",
|
315 |
+
" • Distribution used: 1T × 1S × 10R\n",
|
316 |
+
"\n",
|
317 |
+
"⭐ If you found this helpful, please consider starring our repo!\n",
|
318 |
+
" Your support means the world to us! 🌟\n",
|
319 |
+
"============================================================\n"
|
320 |
+
]
|
321 |
+
}
|
322 |
+
],
|
323 |
+
"source": [
|
324 |
+
"api_contract = {\n",
|
325 |
+
" \"name\": \"weather_api.get_current_weather\",\n",
|
326 |
+
" \"description\": \"Retrieves the current weather conditions for a specified location .\",\n",
|
327 |
+
" \"parameters\": {\n",
|
328 |
+
" \"location\": {\"type\": \"string\", \"description\": \"The name of the city or geographic location .\", \"required\": True},\n",
|
329 |
+
" \"units\": {\"type\": \"string\", \"description\": \"The units for temperature measurement( e.g., 'Celsius', 'Fahrenheit') .\", \"required\": False},\n",
|
330 |
+
" },\n",
|
331 |
+
" }\n",
|
332 |
+
"\n",
|
333 |
+
"data = await loaded.run(num_records=10, api_contract=api_contract)"
|
334 |
+
]
|
335 |
+
},
|
336 |
+
{
|
337 |
+
"cell_type": "code",
|
338 |
+
"execution_count": 6,
|
339 |
+
"metadata": {},
|
340 |
+
"outputs": [
|
341 |
+
{
|
342 |
+
"data": {
|
343 |
+
"text/plain": [
|
344 |
+
"[{'query': 'Can you check the current weather in Toronto and Rome? Use Fahrenheit for both locations.',\n",
|
345 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
346 |
+
" 'arguments': {'location': 'Toronto', 'units': 'Fahrenheit'}},\n",
|
347 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
348 |
+
" 'arguments': {'location': 'Rome', 'units': 'Fahrenheit'}}]},\n",
|
349 |
+
" {'query': 'Get me the current weather in Mumbai and also in Johannesburg, please use Fahrenheit for both.',\n",
|
350 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
351 |
+
" 'arguments': {'location': 'Mumbai', 'units': 'Fahrenheit'}},\n",
|
352 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
353 |
+
" 'arguments': {'location': 'Johannesburg', 'units': 'Fahrenheit'}}]},\n",
|
354 |
+
" {'query': 'I need the current weather for Sydney and London. What are the temperatures in Celsius?',\n",
|
355 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
356 |
+
" 'arguments': {'location': 'Sydney', 'units': 'Celsius'}},\n",
|
357 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
358 |
+
" 'arguments': {'location': 'London', 'units': 'Celsius'}}]},\n",
|
359 |
+
" {'query': 'Please find the current weather in Buenos Aires and Cape Town, using Celsius for Buenos Aires.',\n",
|
360 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
361 |
+
" 'arguments': {'location': 'Buenos Aires', 'units': 'Celsius'}},\n",
|
362 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
363 |
+
" 'arguments': {'location': 'Cape Town'}}]},\n",
|
364 |
+
" {'query': 'What’s the weather like in Moscow? Also, can you get the current conditions in Beijing?',\n",
|
365 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
366 |
+
" 'arguments': {'location': 'Moscow'}},\n",
|
367 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
368 |
+
" 'arguments': {'location': 'Beijing'}}]},\n",
|
369 |
+
" {'query': 'Can you tell me the current weather in Tokyo and in Los Angeles? Please provide both in Fahrenheit.',\n",
|
370 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
371 |
+
" 'arguments': {'location': 'Tokyo', 'units': 'Fahrenheit'}},\n",
|
372 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
373 |
+
" 'arguments': {'location': 'Los Angeles', 'units': 'Fahrenheit'}}]},\n",
|
374 |
+
" {'query': 'Please provide the current weather for Berlin and Cairo, using Celsius for Berlin and no specific unit for Cairo.',\n",
|
375 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
376 |
+
" 'arguments': {'location': 'Berlin', 'units': 'Celsius'}},\n",
|
377 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
378 |
+
" 'arguments': {'location': 'Cairo'}}]},\n",
|
379 |
+
" {'query': 'I need the current weather in Seattle and in Santiago. Use Fahrenheit for Seattle and Celsius for Santiago.',\n",
|
380 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
381 |
+
" 'arguments': {'location': 'Seattle', 'units': 'Fahrenheit'}},\n",
|
382 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
383 |
+
" 'arguments': {'location': 'Santiago', 'units': 'Celsius'}}]},\n",
|
384 |
+
" {'query': \"What's the current temperature in San Francisco? Can you also check the weather in Paris?\",\n",
|
385 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
386 |
+
" 'arguments': {'location': 'San Francisco'}},\n",
|
387 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
388 |
+
" 'arguments': {'location': 'Paris'}}]},\n",
|
389 |
+
" {'query': 'What is the current weather in New York City? And can you also provide the temperature in Celsius?',\n",
|
390 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
391 |
+
" 'arguments': {'location': 'New York City', 'units': 'Celsius'}}]}]"
|
392 |
+
]
|
393 |
+
},
|
394 |
+
"execution_count": 6,
|
395 |
+
"metadata": {},
|
396 |
+
"output_type": "execute_result"
|
397 |
+
}
|
398 |
+
],
|
399 |
+
"source": [
|
400 |
+
"data"
|
401 |
+
]
|
402 |
+
}
|
403 |
+
],
|
404 |
+
"metadata": {
|
405 |
+
"kernelspec": {
|
406 |
+
"display_name": ".venv",
|
407 |
+
"language": "python",
|
408 |
+
"name": "python3"
|
409 |
+
},
|
410 |
+
"language_info": {
|
411 |
+
"codemirror_mode": {
|
412 |
+
"name": "ipython",
|
413 |
+
"version": 3
|
414 |
+
},
|
415 |
+
"file_extension": ".py",
|
416 |
+
"mimetype": "text/x-python",
|
417 |
+
"name": "python",
|
418 |
+
"nbconvert_exporter": "python",
|
419 |
+
"pygments_lexer": "ipython3",
|
420 |
+
"version": "3.11.4"
|
421 |
+
}
|
422 |
+
},
|
423 |
+
"nbformat": 4,
|
424 |
+
"nbformat_minor": 2
|
425 |
+
}
|
prebuilt_template/generate_by_topic/README.md
ADDED
@@ -0,0 +1,102 @@
1 |
+
|
2 |
+
## Overview
|
3 |
+
The `generate_by_topic` template creates diverse synthetic data across multiple topics based on user instructions. If no topics are provided, it automatically generates relevant ones, and it deduplicates content across the generated records.
|
4 |
+
|
5 |
+
## Key Features
|
6 |
+
- Automatic topic generation based on user instructions
|
7 |
+
- Customizable number of records and records per topic
|
8 |
+
- Built-in deduplication mechanism
|
9 |
+
- Flexible output schema configuration
|
10 |
+
- Parallel data generation with configurable concurrency
|
11 |
+
|
12 |
+
## Input Schema
|
13 |
+
```python
|
14 |
+
class GenerateByTopicInput(BaseModel):
|
15 |
+
user_instruction: Optional[str] = None
|
16 |
+
num_records: Optional[int] = 10
|
17 |
+
records_per_topic: int = 10
|
18 |
+
topics: Optional[List[Union[str, Dict[str, int]]]] = None
|
19 |
+
topic_model_name: str = "openai/gpt-4o-mini"
|
20 |
+
topic_model_kwargs: Optional[Dict[str, Any]] = None
|
21 |
+
generation_model_name: str = "openai/gpt-4o-mini"
|
22 |
+
generation_model_kwargs: Optional[Dict[str, Any]] = None
|
23 |
+
output_schema: Optional[Union[List[Dict[str, Any]], Dict[str, Any], type]] = [
|
24 |
+
{"name": "question", "type": "str"},
|
25 |
+
{"name": "answer", "type": "str"}
|
26 |
+
]
|
27 |
+
data_factory_config: Optional[Dict[str, Any]] = {}
|
28 |
+
```
|
29 |
+
|
30 |
+
## Parameters
|
31 |
+
| Parameter | Type | Description | Default |
|
32 |
+
|-----------|------|-------------|---------|
|
33 |
+
| `user_instruction` | str | Instruction for data generation | None |
|
34 |
+
| `num_records` | int | Total number of records to generate | 10 |
|
35 |
+
| `records_per_topic` | int | Number of records per topic | 10 |
|
36 |
+
| `topics` | List[Union[str, Dict[str, int]]] | List of topics; a dict entry maps a topic to a specific record count | None |
|
37 |
+
| `topic_model_name` | str | Model name for topic generation | "openai/gpt-4o-mini" |
|
38 |
+
| `topic_model_kwargs` | Dict[str, Any] | Additional parameters for topic model | None |
|
39 |
+
| `generation_model_name` | str | Model name for data generation | "openai/gpt-4o-mini" |
|
40 |
+
| `generation_model_kwargs` | Dict[str, Any] | Additional parameters for generation model | None |
|
41 |
+
| `output_schema` | Union[List[Dict[str, Any]], Dict[str, Any], type] | Schema for generated data | [{"name": "question", "type": "str"}, {"name": "answer", "type": "str"}] |
|
42 |
+
| `data_factory_config` | Dict[str, Any] | Configuration for data generation process | {} |
|
43 |
+
|
44 |
+
## Example Usage
|
45 |
+
```python
|
46 |
+
{
|
47 |
+
"user_instruction": "Generate Q&A pairs about machine learning concepts",
|
48 |
+
"num_records": 100,
|
49 |
+
"records_per_topic": 5,
|
50 |
+
"topics": [
|
51 |
+
"supervised learning",
|
52 |
+
"unsupervised learning",
|
53 |
+
{"reinforcement learning": 3},
|
54 |
+
"neural networks",
|
55 |
+
],
|
56 |
+
"topic_model_name": "openai/gpt-4",
|
57 |
+
"topic_model_kwargs": {"temperature": 0.7},
|
58 |
+
"generation_model_name": "openai/gpt-4",
|
59 |
+
"generation_model_kwargs": {"temperature": 0.8, "max_tokens": 200},
|
60 |
+
"output_schema": [
|
61 |
+
{"name": "question", "type": "str"},
|
62 |
+
{"name": "answer", "type": "str"},
|
63 |
+
{"name": "difficulty", "type": "str"},
|
64 |
+
],
|
65 |
+
"data_factory_config": {"max_concurrency": 4, "task_runner_timeout": 60 * 2},
|
66 |
+
}
|
67 |
+
```
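This input dictionary is what you pass to the template at run time. A minimal sketch of that flow, mirroring the accompanying `sample_run.ipynb` (top-level `await` assumes a notebook or other async context; the shortened `input_data` here is illustrative):

```python
from starfish import data_gen_template

# Load the prebuilt template by name and inspect its input contract
loaded = data_gen_template.get("starfish/generate_by_topic")
loaded.print_schema()   # prints the GenerateByTopicInput JSON schema
loaded.print_example()  # prints a ready-to-edit example input

# Generate data (await works directly in Jupyter/IPython)
input_data = {
    "user_instruction": "Generate Q&A pairs about machine learning concepts",
    "num_records": 10,
    "records_per_topic": 5,
}
data = await loaded.run(input_data=input_data)
print(len(data), data[0])
```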
|
68 |
+
|
69 |
+
## Workflow
|
70 |
+
1. Topic Preparation:
|
71 |
+
- If topics are not provided, generates relevant topics based on user instruction
|
72 |
+
- Shuffles topics for better distribution and deduplication
|
73 |
+
|
74 |
+
2. Data Generation:
|
75 |
+
- Generates data for each topic using the specified model
|
76 |
+
- Implements deduplication by tracking previously generated examples
|
77 |
+
- Adds topic information to each generated record
|
78 |
+
|
79 |
+
## Output
|
80 |
+
The generated data will include:
|
81 |
+
- Fields specified in the output schema
|
82 |
+
- An additional `topic` field indicating the topic of each record (an illustrative record is sketched below)
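For illustration, a single record produced with the default output schema might look like this; the field values are hypothetical.

```python
# Hypothetical record: "question" and "answer" come from the default output schema,
# while "topic" is added automatically by the template
record = {
    "question": "What is the difference between supervised and unsupervised learning?",
    "answer": "Supervised learning trains on labeled data, while unsupervised learning finds structure in unlabeled data.",
    "topic": "supervised learning",
}
```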
|
83 |
+
|
84 |
+
## Dependencies
|
85 |
+
- `starfish` framework
|
86 |
+
- `pydantic` for input validation
|
87 |
+
|
88 |
+
|
89 |
+
## Sample Run
|
90 |
+
|
91 |
+
Check out [`sample_run.ipynb`](./sample_run.ipynb) for a complete example you can run right away.
|
92 |
+
|
93 |
+
## Source Implementation
|
94 |
+
|
95 |
+
The actual template code is located at:
|
96 |
+
```
|
97 |
+
src/starfish/data_gen_template/templates/starfish/generate_by_topic/
|
98 |
+
```
|
99 |
+
|
100 |
+
---
|
101 |
+
|
102 |
+
**Try it out!** If you have any questions, let us know - we'd be happy to help. If you like this template, consider starring the repo and building your own! We welcome community contributions and are always happy to chat about new ideas. ⭐
|
prebuilt_template/generate_by_topic/sample_run.ipynb
ADDED
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"from starfish import data_gen_template"
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": 2,
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [
|
17 |
+
{
|
18 |
+
"data": {
|
19 |
+
"text/plain": [
|
20 |
+
"['starfish/generate_func_call_dataset', 'starfish/generate_by_topic']"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
"execution_count": 2,
|
24 |
+
"metadata": {},
|
25 |
+
"output_type": "execute_result"
|
26 |
+
}
|
27 |
+
],
|
28 |
+
"source": [
|
29 |
+
"data_gen_template.list()"
|
30 |
+
]
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"cell_type": "code",
|
34 |
+
"execution_count": 3,
|
35 |
+
"metadata": {},
|
36 |
+
"outputs": [],
|
37 |
+
"source": [
|
38 |
+
"loaded = data_gen_template.get(\"starfish/generate_by_topic\")\n"
|
39 |
+
]
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"cell_type": "markdown",
|
43 |
+
"metadata": {},
|
44 |
+
"source": [
|
45 |
+
"get the template input_data schema and example"
|
46 |
+
]
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"cell_type": "code",
|
50 |
+
"execution_count": 4,
|
51 |
+
"metadata": {},
|
52 |
+
"outputs": [
|
53 |
+
{
|
54 |
+
"name": "stdout",
|
55 |
+
"output_type": "stream",
|
56 |
+
"text": [
|
57 |
+
"\u001b[32m2025-05-23 11:23:57\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mPlease run the template with this input schema\u001b[0m\n",
|
58 |
+
"\u001b[32m2025-05-23 11:23:57\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m{\n",
|
59 |
+
" \"description\": \"Input schema for the generate_by_topic template.\\n\\nIMPORTANT: This Pydantic model is the single source of truth for default values.\\nThe validation and default values are controlled by this model, not the function signature.\",\n",
|
60 |
+
" \"properties\": {\n",
|
61 |
+
" \"user_instruction\": {\n",
|
62 |
+
" \"anyOf\": [\n",
|
63 |
+
" {\n",
|
64 |
+
" \"type\": \"string\"\n",
|
65 |
+
" },\n",
|
66 |
+
" {\n",
|
67 |
+
" \"type\": \"null\"\n",
|
68 |
+
" }\n",
|
69 |
+
" ],\n",
|
70 |
+
" \"default\": null,\n",
|
71 |
+
" \"title\": \"User Instruction\"\n",
|
72 |
+
" },\n",
|
73 |
+
" \"num_records\": {\n",
|
74 |
+
" \"anyOf\": [\n",
|
75 |
+
" {\n",
|
76 |
+
" \"type\": \"integer\"\n",
|
77 |
+
" },\n",
|
78 |
+
" {\n",
|
79 |
+
" \"type\": \"null\"\n",
|
80 |
+
" }\n",
|
81 |
+
" ],\n",
|
82 |
+
" \"default\": 10,\n",
|
83 |
+
" \"title\": \"Num Records\"\n",
|
84 |
+
" },\n",
|
85 |
+
" \"records_per_topic\": {\n",
|
86 |
+
" \"default\": 10,\n",
|
87 |
+
" \"title\": \"Records Per Topic\",\n",
|
88 |
+
" \"type\": \"integer\"\n",
|
89 |
+
" },\n",
|
90 |
+
" \"topics\": {\n",
|
91 |
+
" \"anyOf\": [\n",
|
92 |
+
" {\n",
|
93 |
+
" \"items\": {\n",
|
94 |
+
" \"anyOf\": [\n",
|
95 |
+
" {\n",
|
96 |
+
" \"type\": \"string\"\n",
|
97 |
+
" },\n",
|
98 |
+
" {\n",
|
99 |
+
" \"additionalProperties\": {\n",
|
100 |
+
" \"type\": \"integer\"\n",
|
101 |
+
" },\n",
|
102 |
+
" \"type\": \"object\"\n",
|
103 |
+
" }\n",
|
104 |
+
" ]\n",
|
105 |
+
" },\n",
|
106 |
+
" \"type\": \"array\"\n",
|
107 |
+
" },\n",
|
108 |
+
" {\n",
|
109 |
+
" \"type\": \"null\"\n",
|
110 |
+
" }\n",
|
111 |
+
" ],\n",
|
112 |
+
" \"default\": null,\n",
|
113 |
+
" \"title\": \"Topics\"\n",
|
114 |
+
" },\n",
|
115 |
+
" \"topic_model_name\": {\n",
|
116 |
+
" \"default\": \"openai/gpt-4o-mini\",\n",
|
117 |
+
" \"title\": \"Topic Model Name\",\n",
|
118 |
+
" \"type\": \"string\"\n",
|
119 |
+
" },\n",
|
120 |
+
" \"topic_model_kwargs\": {\n",
|
121 |
+
" \"anyOf\": [\n",
|
122 |
+
" {\n",
|
123 |
+
" \"additionalProperties\": true,\n",
|
124 |
+
" \"type\": \"object\"\n",
|
125 |
+
" },\n",
|
126 |
+
" {\n",
|
127 |
+
" \"type\": \"null\"\n",
|
128 |
+
" }\n",
|
129 |
+
" ],\n",
|
130 |
+
" \"default\": null,\n",
|
131 |
+
" \"title\": \"Topic Model Kwargs\"\n",
|
132 |
+
" },\n",
|
133 |
+
" \"generation_model_name\": {\n",
|
134 |
+
" \"default\": \"openai/gpt-4o-mini\",\n",
|
135 |
+
" \"title\": \"Generation Model Name\",\n",
|
136 |
+
" \"type\": \"string\"\n",
|
137 |
+
" },\n",
|
138 |
+
" \"generation_model_kwargs\": {\n",
|
139 |
+
" \"anyOf\": [\n",
|
140 |
+
" {\n",
|
141 |
+
" \"additionalProperties\": true,\n",
|
142 |
+
" \"type\": \"object\"\n",
|
143 |
+
" },\n",
|
144 |
+
" {\n",
|
145 |
+
" \"type\": \"null\"\n",
|
146 |
+
" }\n",
|
147 |
+
" ],\n",
|
148 |
+
" \"default\": null,\n",
|
149 |
+
" \"title\": \"Generation Model Kwargs\"\n",
|
150 |
+
" },\n",
|
151 |
+
" \"output_schema\": {\n",
|
152 |
+
" \"anyOf\": [\n",
|
153 |
+
" {\n",
|
154 |
+
" \"items\": {\n",
|
155 |
+
" \"additionalProperties\": true,\n",
|
156 |
+
" \"type\": \"object\"\n",
|
157 |
+
" },\n",
|
158 |
+
" \"type\": \"array\"\n",
|
159 |
+
" },\n",
|
160 |
+
" {\n",
|
161 |
+
" \"additionalProperties\": true,\n",
|
162 |
+
" \"type\": \"object\"\n",
|
163 |
+
" },\n",
|
164 |
+
" {\n",
|
165 |
+
" \"type\": \"null\"\n",
|
166 |
+
" }\n",
|
167 |
+
" ],\n",
|
168 |
+
" \"default\": [\n",
|
169 |
+
" {\n",
|
170 |
+
" \"name\": \"question\",\n",
|
171 |
+
" \"type\": \"str\"\n",
|
172 |
+
" },\n",
|
173 |
+
" {\n",
|
174 |
+
" \"name\": \"answer\",\n",
|
175 |
+
" \"type\": \"str\"\n",
|
176 |
+
" }\n",
|
177 |
+
" ],\n",
|
178 |
+
" \"title\": \"Output Schema\"\n",
|
179 |
+
" },\n",
|
180 |
+
" \"data_factory_config\": {\n",
|
181 |
+
" \"anyOf\": [\n",
|
182 |
+
" {\n",
|
183 |
+
" \"additionalProperties\": true,\n",
|
184 |
+
" \"type\": \"object\"\n",
|
185 |
+
" },\n",
|
186 |
+
" {\n",
|
187 |
+
" \"type\": \"null\"\n",
|
188 |
+
" }\n",
|
189 |
+
" ],\n",
|
190 |
+
" \"default\": {},\n",
|
191 |
+
" \"title\": \"Data Factory Config\"\n",
|
192 |
+
" }\n",
|
193 |
+
" },\n",
|
194 |
+
" \"title\": \"GenerateByTopicInput\",\n",
|
195 |
+
" \"type\": \"object\"\n",
|
196 |
+
"}\u001b[0m\n"
|
197 |
+
]
|
198 |
+
}
|
199 |
+
],
|
200 |
+
"source": [
|
201 |
+
"loaded.print_schema()"
|
202 |
+
]
|
203 |
+
},
|
204 |
+
{
|
205 |
+
"cell_type": "code",
|
206 |
+
"execution_count": 5,
|
207 |
+
"metadata": {},
|
208 |
+
"outputs": [
|
209 |
+
{
|
210 |
+
"name": "stdout",
|
211 |
+
"output_type": "stream",
|
212 |
+
"text": [
|
213 |
+
"\u001b[32m2025-05-23 11:24:01\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mHere is an example with api_contract.name as weather_api.get_current_weather\u001b[0m\n",
|
214 |
+
"\u001b[32m2025-05-23 11:24:01\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m{\n",
|
215 |
+
" \"user_instruction\": \"Generate Q&A pairs about machine learning concepts\",\n",
|
216 |
+
" \"num_records\": 100,\n",
|
217 |
+
" \"records_per_topic\": 5,\n",
|
218 |
+
" \"topics\": [\n",
|
219 |
+
" \"supervised learning\",\n",
|
220 |
+
" \"unsupervised learning\",\n",
|
221 |
+
" {\"reinforcement learning\": 3}, # This means generate 3 records for this topic\n",
|
222 |
+
" \"neural networks\",\n",
|
223 |
+
" ],\n",
|
224 |
+
" \"topic_model_name\": \"openai/gpt-4\",\n",
|
225 |
+
" \"topic_model_kwargs\": {\"temperature\": 0.7},\n",
|
226 |
+
" \"generation_model_name\": \"openai/gpt-4\",\n",
|
227 |
+
" \"generation_model_kwargs\": {\"temperature\": 0.8, \"max_tokens\": 200},\n",
|
228 |
+
" \"output_schema\": [\n",
|
229 |
+
" {\"name\": \"question\", \"type\": \"str\"},\n",
|
230 |
+
" {\"name\": \"answer\", \"type\": \"str\"},\n",
|
231 |
+
" {\"name\": \"difficulty\", \"type\": \"str\"}, # Added an additional field\n",
|
232 |
+
" ],\n",
|
233 |
+
" \"data_factory_config\": {\"max_concurrency\": 4, \"task_runner_timeout\": 60 * 2},\n",
|
234 |
+
" }\u001b[0m\n"
|
235 |
+
]
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"source": [
|
239 |
+
"loaded.print_example()"
|
240 |
+
]
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"cell_type": "code",
|
244 |
+
"execution_count": 5,
|
245 |
+
"metadata": {},
|
246 |
+
"outputs": [
|
247 |
+
{
|
248 |
+
"name": "stdout",
|
249 |
+
"output_type": "stream",
|
250 |
+
"text": [
|
251 |
+
"🌟 Function Calling Dataset Generation Pipeline\n",
|
252 |
+
"============================================================\n",
|
253 |
+
"📋 Process Overview:\n",
|
254 |
+
" 1. Calculate optimal data distribution\n",
|
255 |
+
" 2. Generate diverse topics\n",
|
256 |
+
" 3. Create subtopics for each topic\n",
|
257 |
+
" 4. Generate query-answer pairs\n",
|
258 |
+
" 5. Verify and validate generated data\n",
|
259 |
+
" 6. Regenerate failed cases\n",
|
260 |
+
"============================================================\n",
|
261 |
+
"📊 Data Distribution Plan:\n",
|
262 |
+
" • Requested: 10 records\n",
|
263 |
+
" • Distribution: 1 topics × 1 subtopics × 10 records\n",
|
264 |
+
" • Total generation: 10 records\n",
|
265 |
+
" • API calls needed: 3\n",
|
266 |
+
"\n",
|
267 |
+
"🎯 Step 1: Generating diverse topics...\n",
|
268 |
+
" ✅ Generated 1 topics\n",
|
269 |
+
"\n",
|
270 |
+
"🌿 Step 2: Creating subtopics for each topic...\n",
|
271 |
+
"\u001b[32m2025-05-23 00:27:04\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: e6763e50-6438-4df5-81a9-5a68ce3f8468\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
272 |
+
"\u001b[32m2025-05-23 00:27:04\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
273 |
+
"\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
|
274 |
+
" ✅ Generated 1 subtopics total\n",
|
275 |
+
"\n",
|
276 |
+
"💬 Step 3: Generating query-answer pairs...\n",
|
277 |
+
"\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 1931c5c8-c1f3-4268-98b7-1a5295b8abf2\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
278 |
+
"\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
279 |
+
"\u001b[32m2025-05-23 00:27:09\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
280 |
+
"\u001b[32m2025-05-23 00:27:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
281 |
+
"\u001b[32m2025-05-23 00:27:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
282 |
+
"\u001b[32m2025-05-23 00:27:18\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
283 |
+
"\u001b[32m2025-05-23 00:27:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
284 |
+
"\u001b[32m2025-05-23 00:27:24\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
285 |
+
"\u001b[32m2025-05-23 00:27:27\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
286 |
+
"\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
|
287 |
+
" ✅ Generated 10 initial query-answer pairs\n",
|
288 |
+
"\n",
|
289 |
+
"🔍 Step 4: Verifying data quality...\n",
|
290 |
+
"\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: f036c07c-1cd2-4690-be92-bac359e45544\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
291 |
+
"\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/10\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
292 |
+
"\u001b[32m2025-05-23 00:27:31\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/10\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
293 |
+
"\u001b[32m2025-05-23 00:27:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 9/10\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 9\u001b[0m (\u001b[32mCompleted: 9\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
294 |
+
"\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 10/10\u001b[0m | \u001b[33mAttempted: 10\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
|
295 |
+
"\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mCannot serialize function for resume due to unsupported type: cannot pickle '_hashlib.HMAC' object\u001b[0m\n",
|
296 |
+
" ✅ Quality check complete: 9 passed, 1 failed\n",
|
297 |
+
"\n",
|
298 |
+
"🔄 Step 5: Regenerating failed cases...\n",
|
299 |
+
"\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 3d6183a2-e465-4807-9e18-cbb84dc0d28f\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
300 |
+
"\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
301 |
+
"\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
|
302 |
+
"\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 8754bec6-25e3-40bd-9743-f2763fc1091f\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
|
303 |
+
"\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
304 |
+
"\u001b[32m2025-05-23 00:27:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
|
305 |
+
"\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
|
306 |
+
"\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mCannot serialize function for resume due to unsupported type: cannot pickle '_hashlib.HMAC' object\u001b[0m\n",
|
307 |
+
" ✅ Regenerated 1 pairs, 1 still failing\n",
|
308 |
+
"\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mSome data still failing after regeneration - prompts may need improvement\u001b[0m\n",
|
309 |
+
"🎯 Perfect! Generated exactly 10 records as requested\n",
|
310 |
+
"\n",
|
311 |
+
"🎉 Generation Complete!\n",
|
312 |
+
"============================================================\n",
|
313 |
+
"📈 Final Results:\n",
|
314 |
+
" • Records generated: 10\n",
|
315 |
+
" • Success rate: 10/10 (100.0%)\n",
|
316 |
+
" • Distribution used: 1T × 1S × 10R\n",
|
317 |
+
"\n",
|
318 |
+
"⭐ If you found this helpful, please consider starring our repo!\n",
|
319 |
+
" Your support means the world to us! 🌟\n",
|
320 |
+
"============================================================\n"
|
321 |
+
]
|
322 |
+
}
|
323 |
+
],
|
324 |
+
"source": [
|
325 |
+
"input_data = {\n",
|
326 |
+
" \"user_instruction\": \"Generate Q&A pairs about machine learning concepts\",\n",
|
327 |
+
" \"num_records\": 100,\n",
|
328 |
+
" \"records_per_topic\": 5,\n",
|
329 |
+
" \"topics\": [\n",
|
330 |
+
" \"supervised learning\",\n",
|
331 |
+
" \"unsupervised learning\",\n",
|
332 |
+
" {\"reinforcement learning\": 3}, # This means generate 3 records for this topic\n",
|
333 |
+
" \"neural networks\",\n",
|
334 |
+
" ],\n",
|
335 |
+
" \"topic_model_name\": \"openai/gpt-4\",\n",
|
336 |
+
" \"topic_model_kwargs\": {\"temperature\": 0.7},\n",
|
337 |
+
" \"generation_model_name\": \"openai/gpt-4\",\n",
|
338 |
+
" \"generation_model_kwargs\": {\"temperature\": 0.8, \"max_tokens\": 200},\n",
|
339 |
+
" \"output_schema\": [\n",
|
340 |
+
" {\"name\": \"question\", \"type\": \"str\"},\n",
|
341 |
+
" {\"name\": \"answer\", \"type\": \"str\"},\n",
|
342 |
+
" {\"name\": \"difficulty\", \"type\": \"str\"}, # Added an additional field\n",
|
343 |
+
" ],\n",
|
344 |
+
" \"data_factory_config\": {\"max_concurrency\": 4, \"task_runner_timeout\": 60 * 2},\n",
|
345 |
+
" }\n",
|
346 |
+
"data = await loaded.run(input_data=input_data)"
|
347 |
+
]
|
348 |
+
},
|
349 |
+
{
|
350 |
+
"cell_type": "code",
|
351 |
+
"execution_count": 6,
|
352 |
+
"metadata": {},
|
353 |
+
"outputs": [
|
354 |
+
{
|
355 |
+
"data": {
|
356 |
+
"text/plain": [
|
357 |
+
"[{'query': 'Can you check the current weather in Toronto and Rome? Use Fahrenheit for both locations.',\n",
|
358 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
359 |
+
" 'arguments': {'location': 'Toronto', 'units': 'Fahrenheit'}},\n",
|
360 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
361 |
+
" 'arguments': {'location': 'Rome', 'units': 'Fahrenheit'}}]},\n",
|
362 |
+
" {'query': 'Get me the current weather in Mumbai and also in Johannesburg, please use Fahrenheit for both.',\n",
|
363 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
364 |
+
" 'arguments': {'location': 'Mumbai', 'units': 'Fahrenheit'}},\n",
|
365 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
366 |
+
" 'arguments': {'location': 'Johannesburg', 'units': 'Fahrenheit'}}]},\n",
|
367 |
+
" {'query': 'I need the current weather for Sydney and London. What are the temperatures in Celsius?',\n",
|
368 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
369 |
+
" 'arguments': {'location': 'Sydney', 'units': 'Celsius'}},\n",
|
370 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
371 |
+
" 'arguments': {'location': 'London', 'units': 'Celsius'}}]},\n",
|
372 |
+
" {'query': 'Please find the current weather in Buenos Aires and Cape Town, using Celsius for Buenos Aires.',\n",
|
373 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
374 |
+
" 'arguments': {'location': 'Buenos Aires', 'units': 'Celsius'}},\n",
|
375 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
376 |
+
" 'arguments': {'location': 'Cape Town'}}]},\n",
|
377 |
+
" {'query': 'What’s the weather like in Moscow? Also, can you get the current conditions in Beijing?',\n",
|
378 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
379 |
+
" 'arguments': {'location': 'Moscow'}},\n",
|
380 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
381 |
+
" 'arguments': {'location': 'Beijing'}}]},\n",
|
382 |
+
" {'query': 'Can you tell me the current weather in Tokyo and in Los Angeles? Please provide both in Fahrenheit.',\n",
|
383 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
384 |
+
" 'arguments': {'location': 'Tokyo', 'units': 'Fahrenheit'}},\n",
|
385 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
386 |
+
" 'arguments': {'location': 'Los Angeles', 'units': 'Fahrenheit'}}]},\n",
|
387 |
+
" {'query': 'Please provide the current weather for Berlin and Cairo, using Celsius for Berlin and no specific unit for Cairo.',\n",
|
388 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
389 |
+
" 'arguments': {'location': 'Berlin', 'units': 'Celsius'}},\n",
|
390 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
391 |
+
" 'arguments': {'location': 'Cairo'}}]},\n",
|
392 |
+
" {'query': 'I need the current weather in Seattle and in Santiago. Use Fahrenheit for Seattle and Celsius for Santiago.',\n",
|
393 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
394 |
+
" 'arguments': {'location': 'Seattle', 'units': 'Fahrenheit'}},\n",
|
395 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
396 |
+
" 'arguments': {'location': 'Santiago', 'units': 'Celsius'}}]},\n",
|
397 |
+
" {'query': \"What's the current temperature in San Francisco? Can you also check the weather in Paris?\",\n",
|
398 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
399 |
+
" 'arguments': {'location': 'San Francisco'}},\n",
|
400 |
+
" {'name': 'weather_api.get_current_weather',\n",
|
401 |
+
" 'arguments': {'location': 'Paris'}}]},\n",
|
402 |
+
" {'query': 'What is the current weather in New York City? And can you also provide the temperature in Celsius?',\n",
|
403 |
+
" 'answer': [{'name': 'weather_api.get_current_weather',\n",
|
404 |
+
" 'arguments': {'location': 'New York City', 'units': 'Celsius'}}]}]"
|
405 |
+
]
|
406 |
+
},
|
407 |
+
"execution_count": 6,
|
408 |
+
"metadata": {},
|
409 |
+
"output_type": "execute_result"
|
410 |
+
}
|
411 |
+
],
|
412 |
+
"source": [
|
413 |
+
"data"
|
414 |
+
]
|
415 |
+
}
|
416 |
+
],
|
417 |
+
"metadata": {
|
418 |
+
"kernelspec": {
|
419 |
+
"display_name": ".venv",
|
420 |
+
"language": "python",
|
421 |
+
"name": "python3"
|
422 |
+
},
|
423 |
+
"language_info": {
|
424 |
+
"codemirror_mode": {
|
425 |
+
"name": "ipython",
|
426 |
+
"version": 3
|
427 |
+
},
|
428 |
+
"file_extension": ".py",
|
429 |
+
"mimetype": "text/x-python",
|
430 |
+
"name": "python",
|
431 |
+
"nbconvert_exporter": "python",
|
432 |
+
"pygments_lexer": "ipython3",
|
433 |
+
"version": "3.11.4"
|
434 |
+
}
|
435 |
+
},
|
436 |
+
"nbformat": 4,
|
437 |
+
"nbformat_minor": 2
|
438 |
+
}
|
pyproject.toml
ADDED
@@ -0,0 +1,132 @@
1 |
+
[tool.poetry]
|
2 |
+
name = "starfish-core"
|
3 |
+
version = "0.1.3"
|
4 |
+
description = ""
|
5 |
+
authors = ["Starfish AI Inc."]
|
6 |
+
readme = "README.md"
|
7 |
+
packages = [
|
8 |
+
{include = "starfish", from = "src"}
|
9 |
+
]
|
10 |
+
|
11 |
+
[tool.poetry.dependencies]
|
12 |
+
python = ">=3.10,<4.0"
|
13 |
+
litellm = ">=1.65.1,<2.0.0"
|
14 |
+
fastapi = ">=0.95.0"
|
15 |
+
loguru = ">=0.7.3,<0.8.0"
|
16 |
+
cachetools = ">=5.5.2,<6.0.0"
|
17 |
+
ollama = ">=0.4.7,<0.5.0"
|
18 |
+
python-dotenv = ">=1.1.0,<2.0.0"
|
19 |
+
aiosqlite = ">=0.21.0,<0.22.0"
|
20 |
+
aiofiles = ">=24.1.0,<25.0.0"
|
21 |
+
typing-extensions = ">=4.0.0,<5.0.0"
|
22 |
+
posthog = "^3.11.0"
|
23 |
+
cloudpickle = "^2.2.0"
|
24 |
+
datasets = "3.6.0"
|
25 |
+
psutil = ">=7.0.0,<8.0.0"
|
26 |
+
nest_asyncio = "^1.6.0"
|
27 |
+
docstring_parser = "^0.16.0"
|
28 |
+
mcp = "^1.8.1"
|
29 |
+
# Force cryptography >=44.0.1 due to transitive security vulnerability
|
30 |
+
# See: https://openssl-library.org/news/secadv/20250211.txt
|
31 |
+
cryptography = ">=44.0.1"
|
32 |
+
# Embedding dependencies
|
33 |
+
faiss-cpu = "^1.7.4"
|
34 |
+
sentence-transformers = "^4.1.0"
|
35 |
+
unstructured = { version = "^0.10.0", extras = ["pdf"], optional = true }
|
36 |
+
python-docx = { version = "*", optional = true }
|
37 |
+
python-pptx = { version = "*", optional = true }
|
38 |
+
openpyxl = { version = "*", optional = true }
|
39 |
+
pytube = { version = "^15.0.0", optional = true }
|
40 |
+
youtube-transcript-api = { version = "^0.6.1", optional = true }
|
41 |
+
pdfminer_six = { version = "^20250506", optional = true }
|
42 |
+
|
43 |
+
# Add optional dependencies for parsers
|
44 |
+
[tool.poetry.extras]
|
45 |
+
docx = ["python-docx"]
|
46 |
+
ppt = ["python-pptx"]
|
47 |
+
excel = ["openpyxl"]
|
48 |
+
youtube = ["pytube", "youtube-transcript-api"]
|
49 |
+
pdf = ["pdfminer_six"]
|
50 |
+
unstructured = ["unstructured"]
|
51 |
+
all = [
|
52 |
+
"python-docx",
|
53 |
+
"python-pptx",
|
54 |
+
"openpyxl",
|
55 |
+
"pytube",
|
56 |
+
"youtube-transcript-api",
|
57 |
+
"pdfminer_six",
|
58 |
+
"unstructured",
|
59 |
+
]
|
60 |
+
|
61 |
+
[build-system]
|
62 |
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
63 |
+
build-backend = "poetry.core.masonry.api"
|
64 |
+
|
65 |
+
[tool.poetry.group.dev.dependencies]
|
66 |
+
ipykernel = "^6.29.5"
|
67 |
+
twine = "^5.0.0"
|
68 |
+
ruff = "^0.8.6"
|
69 |
+
vcrpy = "^7.0.0"
|
70 |
+
isort = "^5.13.2"
|
71 |
+
pre-commit = "^4.0.1"
|
72 |
+
pytest = "^8.3.3"
|
73 |
+
pytest-asyncio = "^0.24.0"
|
74 |
+
pytest-dependency = "^0.6.0"
|
75 |
+
pytest-timeout = "^2.3.1"
|
76 |
+
pytest-cov = "^6.0.0"
|
77 |
+
nbval = "^0.11.0"
|
78 |
+
|
79 |
+
|
80 |
+
[tool.poetry.scripts]
|
81 |
+
starfish = "starfish.api.cli:main"
|
82 |
+
data-template = "src.starfish.data_gen_template.cli:main"
|
83 |
+
|
84 |
+
|
85 |
+
[tool.ruff]
|
86 |
+
line-length = 160
|
87 |
+
|
88 |
+
# Auto-fix settings
|
89 |
+
fix = true
|
90 |
+
unsafe-fixes = true
|
91 |
+
|
92 |
+
[tool.ruff.lint]
|
93 |
+
select = [
|
94 |
+
"E", # pycodestyle errors
|
95 |
+
"W", # pycodestyle warnings
|
96 |
+
"F", # pyflakes
|
97 |
+
"F401", # Unused imports
|
98 |
+
"I", # isort
|
99 |
+
"B", # flake8-bugbear
|
100 |
+
"C4", # flake8-comprehensions
|
101 |
+
"N", # PEP8 naming convetions
|
102 |
+
"D" # pydocstyle
|
103 |
+
]
|
104 |
+
ignore = [
|
105 |
+
"D100", # Remove this eventually
|
106 |
+
"C901", # too complex
|
107 |
+
"W191", # indentation contains tabs
|
108 |
+
"D401", # imperative mood
|
109 |
+
"N806", # uppercase variable names, for example, "API_KEY"
|
110 |
+
]
|
111 |
+
exclude = [
|
112 |
+
".git",
|
113 |
+
"__pycache__",
|
114 |
+
"venv",
|
115 |
+
"build",
|
116 |
+
"dist",
|
117 |
+
]
|
118 |
+
|
119 |
+
[tool.ruff.lint.per-file-ignores]
|
120 |
+
"tests/**/*" = ["D"] # ignore tests for now
|
121 |
+
|
122 |
+
[tool.ruff.lint.pydocstyle]
|
123 |
+
convention = "google"
|
124 |
+
[tool.isort]
|
125 |
+
profile = "black"
|
126 |
+
line_length = 88
|
127 |
+
|
128 |
+
[tool.pytest.ini_options]
|
129 |
+
asyncio_mode = "strict"
|
130 |
+
asyncio_default_fixture_loop_scope = "function"
|
131 |
+
|
132 |
+
|
pytest.ini
ADDED
@@ -0,0 +1,7 @@
1 |
+
[pytest]
|
2 |
+
asyncio_mode = auto
|
3 |
+
timeout = 300
|
4 |
+
timeout_method = thread
|
5 |
+
norecursedirs = .ipynb_checkpoints
|
6 |
+
python_files = test_*.py
|
7 |
+
ignore = tests/data_factory/factory/data_factory.ipynb
|
readme-web.md
ADDED
@@ -0,0 +1,23 @@
1 |
+
|
2 |
+
#### Step 2: Start the Backend
|
3 |
+
|
4 |
+
```bash
|
5 |
+
# Install Python dependencies
|
6 |
+
pip install -r api/requirements.txt
|
7 |
+
|
8 |
+
# Start the API server
|
9 |
+
python -m web.api.main
|
10 |
+
```
|
11 |
+
|
12 |
+
#### Step 3: Start the Frontend
|
13 |
+
|
14 |
+
```bash
|
15 |
+
NODE_OPTIONS='--inspect'
|
16 |
+
npm run dev
|
17 |
+
```
|
18 |
+
|
19 |
+
#### Step 4: Debug the Frontend
|
20 |
+
|
21 |
+
```bash
|
22 |
+
NODE_OPTIONS='--inspect' npm run dev
|
23 |
+
```
|
scripts/hug_push.sh
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
git push hug vam:main
|
scripts/rag.py
ADDED
@@ -0,0 +1,155 @@
1 |
+
# warning
|
2 |
+
import warnings
|
3 |
+
|
4 |
+
warnings.filterwarnings("ignore")
|
5 |
+
|
6 |
+
import os
|
7 |
+
from together import Together
|
8 |
+
import faiss
|
9 |
+
|
10 |
+
from sentence_transformers import SentenceTransformer
|
11 |
+
|
12 |
+
"""
|
13 |
+
Do these steps:
|
14 |
+
1) Set up a Together API key from https://together.ai/
|
15 |
+
"""
|
16 |
+
together_api_key = os.environ.get("TOGETHER_API_KEY")
|
17 |
+
|
18 |
+
|
19 |
+
def run_rag(data_dict: dict, prompt: str):
|
20 |
+
"""
|
21 |
+
Run RAG system: process documents, create embeddings, search, and generate answer.
|
22 |
+
|
23 |
+
"""
|
24 |
+
|
25 |
+
# Stage 0: Initialize Together AI client for LLM completions
|
26 |
+
client = Together(api_key=together_api_key)
|
27 |
+
|
28 |
+
# Stage 1: Load sentence transformer model for creating embeddings
|
29 |
+
# ------------------------------------------------------------
|
30 |
+
embedding_model = SentenceTransformer(
|
31 |
+
"sentence-transformers/all-MiniLM-L6-v2",
|
32 |
+
use_auth_token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
|
33 |
+
)
|
34 |
+
|
35 |
+
# Stage 2: Process documents into Vector Database
|
36 |
+
# ------------------------------------------------------------
|
37 |
+
documents = []
|
38 |
+
filenames = []
|
39 |
+
|
40 |
+
print(f"Processing {len(data_dict)} documents...")
|
41 |
+
for key, content in data_dict.items():
|
42 |
+
content = content.strip()
|
43 |
+
if content: # Only add non-empty documents
|
44 |
+
documents.append(content)
|
45 |
+
filenames.append(key)
|
46 |
+
print(f"✅ Loaded: {key}")
|
47 |
+
|
48 |
+
if not documents:
|
49 |
+
return "No valid documents found in data dictionary!"
|
50 |
+
|
51 |
+
# Create embeddings for all documents
|
52 |
+
print("Creating embeddings...")
|
53 |
+
embeddings = embedding_model.encode(documents)
|
54 |
+
|
55 |
+
# Set up FAISS index for similarity search
|
56 |
+
dimension = embeddings.shape[1]
|
57 |
+
index = faiss.IndexFlatIP(dimension)
|
58 |
+
|
59 |
+
# Normalize embeddings for cosine similarity
|
60 |
+
faiss.normalize_L2(embeddings)
|
61 |
+
index.add(embeddings)
|
62 |
+
|
63 |
+
print(f"✅ RAG system ready with {len(documents)} documents!")
|
64 |
+
|
65 |
+
# Stage 3: Retrieve relevant documents
|
66 |
+
# ------------------------------------------------------------
|
67 |
+
query_embedding = embedding_model.encode([prompt])
|
68 |
+
faiss.normalize_L2(query_embedding)
|
69 |
+
|
70 |
+
# Get top similar documents
|
71 |
+
scores, indices = index.search(query_embedding, min(3, len(documents)))
|
72 |
+
|
73 |
+
# Stage 4: Build context from retrieved documents
|
74 |
+
# ------------------------------------------------------------
|
75 |
+
relevant_docs = []
|
76 |
+
context_parts = []
|
77 |
+
|
78 |
+
for score, idx in zip(scores[0], indices[0]):
|
79 |
+
if idx < len(documents):
|
80 |
+
doc_info = {
|
81 |
+
"content": documents[idx],
|
82 |
+
"filename": filenames[idx],
|
83 |
+
"score": float(score),
|
84 |
+
}
|
85 |
+
relevant_docs.append(doc_info)
|
86 |
+
context_parts.append(f"[{doc_info['filename']}]\n{doc_info['content']}")
|
87 |
+
|
88 |
+
if not relevant_docs:
|
89 |
+
return "No relevant documents found for the query."
|
90 |
+
|
91 |
+
# Combine context
|
92 |
+
context = "\n\n".join(context_parts)
|
93 |
+
|
94 |
+
# Stage 5: Augment by running the LLM to generate an answer
|
95 |
+
# ------------------------------------------------------------
|
96 |
+
llm_prompt = f"""Answer the question based on the provided context documents.
|
97 |
+
|
98 |
+
Context:
|
99 |
+
{context}
|
100 |
+
|
101 |
+
Question: {prompt}
|
102 |
+
|
103 |
+
Instructions:
|
104 |
+
- Answer based only on the information in the context
|
105 |
+
- The answer should be at least 10 words and at most 20 words
|
106 |
+
- If the context doesn't contain enough information, say so
|
107 |
+
- Mention which document(s) you're referencing
|
108 |
+
- Start with According to [document name]
|
109 |
+
- Add brackets to the document name
|
110 |
+
|
111 |
+
|
112 |
+
Answer:"""
|
113 |
+
|
114 |
+
try:
|
115 |
+
# Generate answer using Together AI
|
116 |
+
response = client.chat.completions.create(
|
117 |
+
model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
118 |
+
messages=[{"role": "user", "content": llm_prompt}],
|
119 |
+
max_tokens=500,
|
120 |
+
temperature=0.7,
|
121 |
+
)
|
122 |
+
answer = response.choices[0].message.content
|
123 |
+
|
124 |
+
# Display source information
|
125 |
+
print(f"\n📚 Most relevant source:")
|
126 |
+
for doc in relevant_docs:
|
127 |
+
print(f" • {doc['filename']} (similarity: {doc['score']:.3f})")
|
128 |
+
|
129 |
+
# Add source information to the answer
|
130 |
+
sources_list = [doc["filename"] for doc in relevant_docs]
|
131 |
+
sources_text = sources_list[0]
|
132 |
+
full_answer = f"{answer}\n\n📄 Source Used: {sources_text}"
|
133 |
+
|
134 |
+
return full_answer
|
135 |
+
|
136 |
+
except Exception as e:
|
137 |
+
return f"Error generating answer: {str(e)}"
|
138 |
+
|
139 |
+
|
140 |
+
if __name__ == "__main__":
|
141 |
+
# Load dataset
|
142 |
+
data_dict = {
|
143 |
+
"octopus_facts": "Octopuses have three hearts and blue blood. Two hearts pump blood to the gills, while the third pumps blood to the rest of the body. Their blood is blue because it contains copper-based hemocyanin instead of iron-based hemoglobin.",
|
144 |
+
"honey_facts": "Honey never spoils. Archaeologists have found pots of honey in ancient Egyptian tombs that are over 3,000 years old and still perfectly edible. This is because honey has natural antibacterial properties and very low water content.",
|
145 |
+
"space_facts": "A day on Venus is longer than its year. Venus takes 243 Earth days to rotate once on its axis, but only 225 Earth days to orbit the Sun. This means a Venusian day is longer than a Venusian year.",
|
146 |
+
"banana_facts": "Bananas are berries, but strawberries aren't. Botanically speaking, berries must have seeds inside their flesh. Bananas qualify, but strawberries have seeds on the outside, making them aggregate fruits.",
|
147 |
+
"shark_facts": "Sharks have been around longer than trees. Sharks first appeared around 400 million years ago, while the earliest trees appeared around 350 million years ago. This means sharks pre-date trees by about 50 million years.",
|
148 |
+
"penguin_facts": "Emperor penguins can hold their breath for over 20 minutes and dive to depths of over 500 meters while hunting for fish. They have special adaptations including collapsible lungs and the ability to slow their heart rate.",
|
149 |
+
"human_brain": "Your brain uses about 20% of your body's total energy despite being only 2% of your body weight. It consumes roughly 320 calories per day, which is equivalent to eating about 320 M&Ms.",
|
150 |
+
}
|
151 |
+
|
152 |
+
question = "What is interesting about a banana?"
|
153 |
+
answer = run_rag(data_dict, question)
|
154 |
+
print(f"\n🤖 Answer: {answer}\n")
|
155 |
+
print("-" * 50)
|
src/starfish/__init__.py
ADDED
@@ -0,0 +1,18 @@
1 |
+
"""Starfish Core - A framework for structured data processing and LLM integration.
|
2 |
+
|
3 |
+
Provides core components for:
|
4 |
+
- StructuredLLM: Interface for working with large language models
|
5 |
+
- data_factory: Factory pattern for creating and managing data pipelines
|
6 |
+
"""
|
7 |
+
|
8 |
+
# Expose core directly for easy access
|
9 |
+
from .data_factory.factory import data_factory
|
10 |
+
from .llm.structured_llm import StructuredLLM
|
11 |
+
from .data_gen_template.core import data_gen_template
|
12 |
+
|
13 |
+
# Define what 'from starfish import *' imports (good practice)
|
14 |
+
__all__ = [
|
15 |
+
"StructuredLLM",
|
16 |
+
"data_factory",
|
17 |
+
"data_gen_template",
|
18 |
+
]
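As a quick, illustrative sketch (not part of the module itself), downstream code typically consumes these exports as in the notebooks elsewhere in this commit:

```python
from starfish import data_gen_template  # StructuredLLM and data_factory are exposed the same way

# Discover and load a prebuilt data generation template by name
print(data_gen_template.list())  # e.g. ['starfish/generate_func_call_dataset', 'starfish/generate_by_topic']
template = data_gen_template.get("starfish/generate_by_topic")
```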
|
src/starfish/common/env_loader.py
ADDED
@@ -0,0 +1,52 @@
1 |
+
"""Environment variable loader utility.
|
2 |
+
|
3 |
+
This module provides functionality to load environment variables from a .env file
|
4 |
+
in non-production environments. In production, environment variables should be
|
5 |
+
set through the system/platform instead of using .env files for security reasons.
|
6 |
+
|
7 |
+
Uses python-dotenv for loading environment variables from .env files.
|
8 |
+
"""
|
9 |
+
|
10 |
+
import os
|
11 |
+
from typing import Optional
|
12 |
+
|
13 |
+
# Import python-dotenv
|
14 |
+
from dotenv import dotenv_values
|
15 |
+
from dotenv import find_dotenv as dotenv_find_dotenv
|
16 |
+
from dotenv import load_dotenv as dotenv_load_dotenv
|
17 |
+
|
18 |
+
from starfish.common.logger import get_logger
|
19 |
+
|
20 |
+
logger = get_logger(__name__)
|
21 |
+
|
22 |
+
|
23 |
+
def load_env_file(env_path: Optional[str] = None, override: bool = False) -> bool:
|
24 |
+
"""Load environment variables from .env file for non-production environments.
|
25 |
+
|
26 |
+
Args:
|
27 |
+
env_path: Path to the .env file. If None, looks for .env file in the current
|
28 |
+
working directory and parent directories.
|
29 |
+
override: Whether to override existing environment variables. Default is False.
|
30 |
+
|
31 |
+
Returns:
|
32 |
+
True if environment variables were loaded, False otherwise.
|
33 |
+
"""
|
34 |
+
# Skip loading in production environments
|
35 |
+
if os.getenv("ENV") == "PROD":
|
36 |
+
logger.info("Production environment detected. Skipping .env file loading.")
|
37 |
+
|
38 |
+
# Find the .env file if path not provided
|
39 |
+
if env_path is None:
|
40 |
+
env_path = dotenv_find_dotenv(usecwd=True)
|
41 |
+
if not env_path:
|
42 |
+
logger.warning("No .env file found in the current or parent directories.")
|
43 |
+
|
44 |
+
# Load environment variables
|
45 |
+
loaded = dotenv_load_dotenv(dotenv_path=env_path, override=override)
|
46 |
+
|
47 |
+
if loaded:
|
48 |
+
# Get the loaded variables to count and log them
|
49 |
+
loaded_vars = dotenv_values(env_path)
|
50 |
+
logger.debug(f"Loaded {len(loaded_vars)} environment variables from {env_path}")
|
51 |
+
else:
|
52 |
+
logger.warning(f"Failed to load environment variables from {env_path}")
|
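A minimal sketch of calling load_env_file() at application startup; the log messages are illustrative only.

from starfish.common.env_loader import load_env_file
from starfish.common.logger import get_logger

logger = get_logger(__name__)

# Sketch only: searches the current and parent directories for a .env file unless a path is given.
if load_env_file(override=False):
    logger.info("Loaded configuration from .env")
else:
    logger.info("No .env loaded; relying on existing environment variables")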
src/starfish/common/exceptions.py
ADDED
@@ -0,0 +1,325 @@
import functools
import os
import traceback
import uuid
from typing import Any, Dict, Optional, Tuple

from pydantic import BaseModel, Field

# Alias pydantic's ValidationError so it is not shadowed by the Starfish ValidationError defined below.
from pydantic import ValidationError as PydanticNativeValidationError

from starfish.common.logger import get_logger

logger = get_logger(__name__)

# Simple configuration flag (can be set from app config)
# Default to False for production safety; parse the env value so strings like "false" stay falsy
INCLUDE_TRACEBACK_IN_RESPONSE = os.environ.get("INCLUDE_TRACEBACK_IN_RESPONSE", "false").lower() in ("true", "1", "yes")

#############################################
# HTTP Status Codes
#############################################


class HTTPStatus:
    """Standard HTTP status codes."""

    OK = 200
    BAD_REQUEST = 400
    UNAUTHORIZED = 401
    FORBIDDEN = 403
    NOT_FOUND = 404
    UNPROCESSABLE_ENTITY = 422
    INTERNAL_SERVER_ERROR = 500


#############################################
# Error Response Model
#############################################


class ErrorResponse(BaseModel):
    """Standardized error response format for API errors."""

    status: str = "error"
    error_id: str = Field(..., description="Unique identifier for this error occurrence")
    message: str
    error_type: str
    details: Optional[Dict[str, Any]] = None


#############################################
# Exception Classes
#############################################


class StarfishException(Exception):
    """Base exception for all Starfish exceptions."""

    status_code: int = HTTPStatus.INTERNAL_SERVER_ERROR
    default_message: str = "An unexpected error occurred"

    def __init__(self, message: Optional[str] = None, details: Optional[Dict[str, Any]] = None):
        self.message = message or self.default_message
        self.details = details
        self.error_id = str(uuid.uuid4())
        super().__init__(self.message)

    def __str__(self):
        if self.details:
            return f"{self.message} - Details: {self.details}"
        return self.message


class ValidationError(StarfishException):
    """Exception raised for validation errors."""

    status_code = HTTPStatus.UNPROCESSABLE_ENTITY
    default_message = "Validation error"


class PydanticValidationError(ValidationError):
    """Exception raised for Pydantic validation errors.

    This class formats Pydantic validation errors into user-friendly messages
    and preserves the detailed error information for debugging.
    """

    default_message = "Data validation error"

    @staticmethod
    def format_validation_error(error: PydanticNativeValidationError) -> Tuple[str, Dict[str, Any]]:
        """Format a Pydantic ValidationError into a user-friendly message and details.

        Args:
            error: The Pydantic ValidationError to format

        Returns:
            Tuple of (message, details)
        """
        if not hasattr(error, "errors") or not callable(getattr(error, "errors", None)):
            return str(error), {}

        error_details = error.errors()
        if not error_details:
            return "Validation error", {}

        # Format fields with errors
        field_errors = []
        for err in error_details:
            # Get error type and location
            err_type = err.get("type", "unknown")
            loc = err.get("loc", [])

            # Special handling for discriminated unions
            # If first element is a string and subsequent elements exist, might be a discriminated union
            if len(loc) >= 2 and isinstance(loc[0], str) and isinstance(loc[1], str):
                # This might be a discriminated union error like ['vanilla', 'user_input']
                type_name = loc[0]
                field_name = loc[1]

                # Handle errors differently based on type
                if err_type == "missing":
                    field_errors.append(f"Field '{field_name}' is required for '{type_name}' type")
                    continue

            # Standard handling for other errors
            loc_str = ".".join(str(item) for item in loc) if loc else "unknown"
            msg = err.get("msg", "")

            # Create a user-friendly error message based on error type
            if err_type == "missing":
                field_errors.append(f"'{loc_str}' is required")
            elif err_type == "type_error":
                field_errors.append(f"'{loc_str}' has an invalid type")
            elif err_type == "value_error":
                field_errors.append(f"'{loc_str}' has an invalid value")
            elif err_type.startswith("value_error"):
                field_errors.append(f"'{loc_str}' {msg}")
            elif err_type.startswith("type_error"):
                field_errors.append(f"'{loc_str}' {msg}")
            elif err_type == "extra_forbidden":
                field_errors.append(f"'{loc_str}' is not allowed")
            else:
                field_errors.append(f"'{loc_str}': {msg}")

        # Create a combined message
        if len(field_errors) == 1:
            message = f"Validation error: {field_errors[0]}"
        else:
            message = f"Validation errors: {', '.join(field_errors)}"

        return message, {"validation_errors": error_details}

    def __init__(self, validation_error: PydanticNativeValidationError, message: Optional[str] = None, details: Optional[Dict[str, Any]] = None):
        # Format the validation error if no message is provided
        if message is None:
            message, error_details = self.format_validation_error(validation_error)

            # Merge error details with provided details
            if details is None:
                details = error_details
            else:
                details = {**details, **error_details}

        super().__init__(message=message, details=details)


class ParserError(StarfishException):
    """Base exception for all parser-related errors."""

    status_code = HTTPStatus.UNPROCESSABLE_ENTITY
    default_message = "Parser error"


class JsonParserError(ParserError):
    """Exception raised when JSON parsing fails."""

    default_message = "JSON parsing error"


class SchemaValidationError(ParserError):
    """Exception raised when data doesn't conform to schema."""

    default_message = "Schema validation error"

    def __str__(self):
        if self.details and "errors" in self.details:
            errors_text = "\n".join([f"- {err}" for err in self.details["errors"]])
            return f"{self.message}:\n{errors_text}"
        return super().__str__()


class PydanticParserError(ParserError):
    """Exception raised when Pydantic parsing or validation fails."""

    default_message = "Pydantic parsing error"


#############################################
# Error Handling Functions
#############################################


def format_error(exc: Exception, include_traceback: bool = INCLUDE_TRACEBACK_IN_RESPONSE) -> Tuple[ErrorResponse, int]:
    """Format an exception into a standardized error response.

    Args:
        exc: The exception to format
        include_traceback: Whether to include traceback in the response details

    Returns:
        Tuple of (error_response, status_code)
    """
    # Get traceback for logging (always) - may optionally include in response
    tb_str = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))

    # Check for exception chaining
    cause = getattr(exc, "__cause__", None)
    cause_tb = None
    if cause:
        cause_tb = "".join(traceback.format_exception(type(cause), cause, cause.__traceback__))
        logger.error(f"Original exception: {type(cause).__name__}: {str(cause)}")
        logger.error(f"Original traceback: {cause_tb}")

    # Log the current exception
    logger.error(f"Exception: {type(exc).__name__}: {str(exc)}")
    logger.error(f"Traceback: {tb_str}")

    # Handle Starfish exceptions
    if isinstance(exc, StarfishException):
        error_id = getattr(exc, "error_id", str(uuid.uuid4()))
        status_code = exc.status_code
        details = exc.details or {}

        # Only add traceback to details if requested
        if include_traceback:
            details["traceback"] = tb_str
            if cause_tb:
                details["original_traceback"] = cause_tb

        return ErrorResponse(error_id=error_id, message=exc.message, error_type=type(exc).__name__, details=details if details else None), status_code

    # Handle Pydantic validation errors
    elif isinstance(exc, PydanticNativeValidationError):
        error_id = str(uuid.uuid4())
        status_code = HTTPStatus.UNPROCESSABLE_ENTITY
        details = {"validation_errors": exc.errors()}

        if include_traceback:
            details["traceback"] = tb_str
            if cause_tb:
                details["original_traceback"] = cause_tb

        return ErrorResponse(error_id=error_id, message="Validation error", error_type="ValidationError", details=details), status_code

    # Handle all other exceptions
    else:
        error_id = str(uuid.uuid4())
        status_code = HTTPStatus.INTERNAL_SERVER_ERROR
        details = {}

        if include_traceback:
            details["traceback"] = tb_str
            if cause_tb:
                details["original_traceback"] = cause_tb

        return ErrorResponse(
            error_id=error_id, message=str(exc) or "An unexpected error occurred", error_type=type(exc).__name__, details=details if details else None
        ), status_code


#############################################
# Utility Decorators
#############################################


def handle_exceptions(return_value=None):
    """Decorator to handle exceptions in both async and sync functions.

    This decorator can be used with any function to catch exceptions,
    log them, and return a default value instead of raising.

    Args:
        return_value: The value to return if an exception occurs

    Returns:
        Decorated function with exception handling
    """

    def decorator(func):
        # Import asyncio here to avoid dependency if not needed
        try:
            import asyncio

            is_async_available = True
        except ImportError:
            is_async_available = False

        # Handle async functions
        if is_async_available and asyncio.iscoroutinefunction(func):

            @functools.wraps(func)
            async def async_wrapper(*args, **kwargs):
                try:
                    return await func(*args, **kwargs)
                except Exception as exc:
                    # Format and log the error but don't raise
                    format_error(exc, include_traceback=True)
                    return return_value

            return async_wrapper

        # Handle synchronous functions
        else:

            @functools.wraps(func)
            def sync_wrapper(*args, **kwargs):
                try:
                    return func(*args, **kwargs)
                except Exception as exc:
                    # Format and log the error but don't raise
                    format_error(exc, include_traceback=True)
                    return return_value

            return sync_wrapper

    return decorator
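A minimal sketch of the handle_exceptions decorator and format_error above; the failing function and its payload are made-up examples.

from starfish.common.exceptions import JsonParserError, format_error, handle_exceptions

@handle_exceptions(return_value={})
def parse_payload(raw: str) -> dict:
    # Made-up example: any exception is logged by format_error and {} is returned instead of raising.
    raise JsonParserError(details={"raw": raw})

result = parse_payload("not json")  # -> {}

# format_error can also be used directly to build an API-style error response.
try:
    raise JsonParserError("Could not parse model output")
except Exception as exc:
    error_response, status_code = format_error(exc)
    print(status_code, error_response.message)  # 422 Could not parse model output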
src/starfish/common/logger.py
ADDED
@@ -0,0 +1,104 @@
import os
import sys
from enum import IntEnum

from loguru import logger

simple_log_format_enabled = os.getenv("SIMPLE_LOG_FORMAT", "true").lower() in ("true", "1", "yes")

default_log_level = os.getenv("LOG_LEVEL", "INFO")


# Define custom log levels
class LogLevel(IntEnum):
    """Custom log levels."""

    VERBOSE = 5
    DEBUG = 10
    INFO = 20
    WARNING = 30
    ERROR = 40
    CRITICAL = 50


# Configuration Constants
COLORED_FORMAT = (
    "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
    "<level>{level: <8}</level> | "
    "<cyan>{name}</cyan> | "
    "<blue>{file}:{line}</blue> | "
    "<level>{message}</level>"
)

SIMPLE_COLORED_FORMAT = "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | " "<level>{level: <8}</level> | " "<level>{message}</level>"


class LogManager:
    """Manages logger configuration."""

    _instance = None

    def __new__(cls):
        """Create a singleton instance."""
        if cls._instance is None:
            cls._instance = super(LogManager, cls).__new__(cls)
            cls._instance.handler_id = None
            cls._instance.current_level = default_log_level
            cls._instance._initialize()
        return cls._instance

    def _get_format_string(self):
        """Return the appropriate format string based on SIMPLE_LOG_FORMAT and the current level."""
        if simple_log_format_enabled:
            if self.current_level == "DEBUG":
                return COLORED_FORMAT
            return SIMPLE_COLORED_FORMAT
        return COLORED_FORMAT

    def _initialize(self):
        """Initialize logging with console handler."""
        logger.remove()  # Remove default handler
        log_format = self._get_format_string()
        self.handler_id = logger.add(sys.stdout, format=log_format, level=self.current_level, colorize=True)
        # Add custom level only if it doesn't exist
        try:
            logger.level("VERBOSE", no=LogLevel.VERBOSE, color="<magenta>")
        except ValueError:
            # Level already exists, ignore the error
            pass

    def get_current_log_level(self):
        """Get the current log level."""
        return self.current_level

    def update_log_level(self, level):
        """Update the log level of the console handler.

        This can be called at any time during runtime to change the log level.
        """
        level = level.upper()
        if level not in ["VERBOSE", "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
            raise ValueError(f"Invalid log level: {level}")
        logger.remove(self.handler_id)
        self.current_level = level
        log_format = self._get_format_string()
        self.handler_id = logger.add(sys.stdout, format=log_format, level=self.current_level, colorize=True)


# Instantiate LogManager to ensure logging is initialized on module import
log_manager = LogManager()


# Add verbose method to logger
def verbose(self, message, *args, **kwargs):
    """Log a verbose message."""
    self.log("VERBOSE", message, *args, **kwargs)


logger.__class__.verbose = verbose


# Function to get the logger
def get_logger(name):
    """Get a logger instance bound with a name."""
    return logger.bind(name=name)
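A minimal sketch of the logging helpers above; the messages are illustrative only.

from starfish.common.logger import get_logger, log_manager

logger = get_logger(__name__)
logger.info("Starting up")
logger.verbose("Extra detail, only emitted when the VERBOSE level is enabled")  # custom level added above

# Switch the console handler to DEBUG at runtime.
log_manager.update_log_level("DEBUG")
logger.debug("Now visible")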
src/starfish/components/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .prepare_topic import prepare_topic

__all__ = ["prepare_topic"]
src/starfish/components/prepare_topic.py
ADDED
@@ -0,0 +1,275 @@
import asyncio
import math
from typing import Any, Dict, List, Optional, Union

from starfish import StructuredLLM


async def generate_topics(
    user_instruction: str,
    num_topics: int,
    model_name: str = "openai/gpt-4o-mini",
    model_kwargs: Optional[Dict[str, Any]] = None,
    existing_topics: Optional[List[str]] = None,
) -> List[str]:
    """Generate unique topics based on user instructions using a StructuredLLM model."""
    if model_kwargs is None:
        model_kwargs = {}
    if "temperature" not in model_kwargs:
        model_kwargs["temperature"] = 1
    existing_topics = existing_topics or []

    if num_topics <= 0:
        return []

    # Calculate batches needed (5 topics per batch)
    llm_batch_size = 5
    num_batches = math.ceil(num_topics / llm_batch_size)
    generated_topics = []

    for _ in range(num_batches):
        topic_generator = StructuredLLM(
            model_name=model_name,
            prompt="""Can you generate a list of topics about {{user_instruction}}
            {% if existing_topics_str %}
            Please do not generate topics that are already in the list: {{existing_topics_str}}
            Make sure the topics are unique and vary from each other
            {% endif %}
            """,
            output_schema=[{"name": "topic", "type": "str"}],
            model_kwargs=model_kwargs,
        )

        all_existing = existing_topics + generated_topics
        input_params = {"user_instruction": user_instruction, "num_records": min(llm_batch_size, num_topics - len(generated_topics))}

        if all_existing:
            input_params["existing_topics_str"] = ",".join(all_existing)

        topic_response = await topic_generator.run(**input_params)
        topic_data = [item.get("topic") for item in topic_response.data]
        generated_topics.extend(topic_data)

        if len(generated_topics) >= num_topics:
            break

    return generated_topics


async def prepare_topic(
    topics: Optional[List[Union[str, Dict[str, int]]]] = None,
    num_records: Optional[int] = None,
    records_per_topic: int = 20,
    user_instruction: Optional[str] = None,
    model_name: str = "openai/gpt-4o-mini",
    model_kwargs: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, str]]:
    """Split records into topics, generating topics if none are provided or if needed.

    Supported input formats:
    1. String list: ['topic1', 'topic2'] - Topics with equal or calculated distribution
    2. Dict list: [{'topic1': 20}, {'topic2': 30}] - Topics with specific counts
    3. Mixed: ['topic1', {'topic2': 30}] - Combination of both formats
    4. None: No topics provided, will generate based on user_instruction

    Args:
        topics: Optional list of topics, either strings or {topic: count} dicts
        num_records: Total number of records to split (required for dict topics or None topics)
        records_per_topic: Number of records per topic (default: 20)
        user_instruction: Topic generation instructions (required if topics is None)
        model_name: Model name for topic generation
        model_kwargs: Model kwargs for topic generation

    Returns:
        List of {'topic': topic_name} dictionaries, with one entry per record
    """
    if model_kwargs is None:
        model_kwargs = {}
    if "temperature" not in model_kwargs:
        model_kwargs["temperature"] = 1
    # --- STEP 1: Input validation and normalization ---
    if topics is None:
        # Must have num_records and user_instruction if no topics provided
        if not num_records or num_records <= 0:
            raise ValueError("num_records must be positive when topics are not provided")
        if not user_instruction:
            raise ValueError("user_instruction required when topics are not provided")
        topic_assignments = []
    else:
        # Validate topics is a non-empty list
        if not isinstance(topics, list) or not topics:
            raise ValueError("topics must be a non-empty list")

        # Convert all topic inputs to a standardized [(topic_name, count)] list
        # For string topics: count will be None (to be calculated later)
        # For dict topics: use the specified count
        topic_assignments = []
        seen_topics = set()

        for topic in topics:
            if isinstance(topic, str):
                if topic not in seen_topics:
                    topic_assignments.append((topic, None))
                    seen_topics.add(topic)
            elif isinstance(topic, dict) and len(topic) == 1:
                topic_name = next(iter(topic))
                count = topic[topic_name]

                if not isinstance(count, int) or count < 0:
                    raise ValueError(f"Topic '{topic_name}' has invalid count {count}")

                if topic_name not in seen_topics:
                    topic_assignments.append((topic_name, count))
                    seen_topics.add(topic_name)
            else:
                raise ValueError("Topics must be strings or single-key dictionaries")

    # --- STEP 2: Calculate or validate counts for provided topics ---
    result = []
    assigned_count = 0
    topic_names = []  # Track all assigned topic names

    if topic_assignments:
        # Handle string topics with no count (None) - assign counts based on input
        string_topics = [(name, count) for name, count in topic_assignments if count is None]
        dict_topics = [(name, count) for name, count in topic_assignments if count is not None]

        # Case: String topics with no num_records - assign records_per_topic to each
        if string_topics and num_records is None:
            for name, _ in string_topics:
                result.append({name: records_per_topic})
                topic_names.append(name)
                assigned_count += records_per_topic

        # Case: String topics with num_records - distribute evenly
        elif string_topics and num_records is not None:
            remaining = num_records - sum(count for _, count in dict_topics if count is not None)
            if remaining < 0:
                raise ValueError("Dict topic counts exceed num_records")

            # Distribute remaining records among string topics
            if string_topics and remaining > 0:
                base = remaining // len(string_topics)
                extra = remaining % len(string_topics)

                for i, (name, _) in enumerate(string_topics):
                    count = base + (1 if i < extra else 0)
                    if count > 0:
                        result.append({name: count})
                        topic_names.append(name)
                        assigned_count += count

        # Add dictionary topics with predefined counts
        for name, count in dict_topics:
            if count > 0:
                result.append({name: count})
                topic_names.append(name)
                assigned_count += count

        # Validate total count for dictionary topics
        if dict_topics and num_records is None:
            raise ValueError("num_records required when using dictionary topics")

        if num_records is not None and assigned_count > num_records:
            raise ValueError(f"Total assigned count ({assigned_count}) exceeds num_records ({num_records})")

    # --- STEP 3: Generate topics for remaining records if needed ---
    remaining_records = 0 if num_records is None else num_records - assigned_count

    if remaining_records > 0:
        if records_per_topic <= 0:
            raise ValueError("records_per_topic must be positive when generating topics")

        # Generate topics with LLM if instructions provided
        if user_instruction:
            topics_needed = math.ceil(remaining_records / records_per_topic)

            generated = await generate_topics(
                user_instruction=user_instruction, num_topics=topics_needed, model_name=model_name, model_kwargs=model_kwargs, existing_topics=topic_names
            )

            # Assign counts to generated topics
            for topic in generated:
                if topic in topic_names:  # Skip if duplicate (shouldn't happen with proper LLM)
                    print(f"Skipping duplicate generated topic: {topic}")
                    continue

                count = min(records_per_topic, remaining_records)
                if count <= 0:
                    break

                result.append({topic: count})
                topic_names.append(topic)
                remaining_records -= count
                assigned_count += count

        # Generate auto-topics for any still-remaining records
        auto_index = 1
        while remaining_records > 0:
            # Find next available auto_topic name
            auto_name = f"auto_topic{auto_index}"
            while auto_name in topic_names:
                auto_index += 1
                auto_name = f"auto_topic{auto_index}"

            count = min(records_per_topic, remaining_records)
            result.append({auto_name: count})
            topic_names.append(auto_name)
            remaining_records -= count
            assigned_count += count
            auto_index += 1

    # Final validation
    if num_records is not None and assigned_count != num_records:
        print(f"Warning: Assigned {assigned_count} records, expected {num_records}")

    flatten_topic_list = []
    for item in result:
        for key, count in item.items():
            flatten_topic_list.extend([{"topic": key}] * count)

    return flatten_topic_list


if __name__ == "__main__":
    print("--- Running Examples ---")

    # Example 1: Dictionary topics with additional generation
    print("\nExample 1: Dictionary topics + generation")
    topics1 = [{"topic1": 20}, {"topic2": 30}]
    result1 = asyncio.run(prepare_topic(topics=topics1, num_records=100, records_per_topic=25, user_instruction="some context"))
    print(f"Result: {result1}")
    print(f"Total: {len(result1)}")

    # Example 2: String topics with even distribution
    print("\nExample 2: String topics with distribution")
    topics2 = ["topicA", "topicB", "topicC"]
    result2 = asyncio.run(prepare_topic(topics=topics2, num_records=10))
    print(f"Result: {result2}")
    print(f"Total: {len(result2)}")

    # Example 3: Mixed string and dict topics
    print("\nExample 3: Mixed string/dict topics")
    topics3 = ["topicX", {"topicY": 10}]
    result3 = asyncio.run(prepare_topic(topics=topics3, num_records=30, user_instruction="mixed topics"))
    print(f"Result: {result3}")
    print(f"Total: {len(result3)}")

    # Example 4: String topics with fixed count
    print("\nExample 4: String topics with fixed count")
    topics4 = ["apple", "banana", "cherry"]
    result4 = asyncio.run(prepare_topic(topics=topics4, records_per_topic=15))
    print(f"Result: {result4}")
    print(f"Total: {len(result4)}")

    # Example 5: No topics, generate all
    print("\nExample 5: No topics, generate all")

    async def run_example5():
        result = await prepare_topic(topics=None, num_records=10, records_per_topic=5, user_instruction="cloud computing")
        print(f"Result: {result}")
        print(f"Total: {len(result)}")

    asyncio.run(run_example5())

    print("\n--- Examples Finished ---")
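For reference, Example 2 above takes the pure-splitting path (no LLM call is made because the string topics plus num_records fully determine the counts): 10 records over three topics gives base 3 with remainder 1, so the first topic receives 4. A compact sketch of that behaviour:

import asyncio

from starfish.components import prepare_topic

# Sketch only: exercises the even-distribution branch of prepare_topic.
flat = asyncio.run(prepare_topic(topics=["topicA", "topicB", "topicC"], num_records=10))
assert len(flat) == 10
assert flat.count({"topic": "topicA"}) == 4  # 10 // 3 == 3, the remainder goes to the first topic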
src/starfish/data_factory/config.py
ADDED
@@ -0,0 +1,6 @@
PROGRESS_LOG_INTERVAL = 3
TASK_RUNNER_TIMEOUT = 60

MAX_CONCURRENT_TASKS = 10

NOT_COMPLETED_THRESHOLD = 3
src/starfish/data_factory/constants.py
ADDED
@@ -0,0 +1,75 @@
import os
import sys
from pathlib import Path

RECORD_STATUS = "status"

STATUS_TOTAL = "total"
STATUS_COMPLETED = "completed"
STATUS_DUPLICATE = "duplicate"
STATUS_FILTERED = "filtered"
STATUS_FAILED = "failed"

STATUS_MOJO_MAP = {
    STATUS_COMPLETED: "✅",
    STATUS_DUPLICATE: "🔁",
    STATUS_FILTERED: "🚫",
    STATUS_FAILED: "❌",
    STATUS_TOTAL: "📊",
}
RUN_MODE = "run_mode"
RUN_MODE_NORMAL = "normal"
RUN_MODE_RE_RUN = "resume_from_checkpoint"
RUN_MODE_DRY_RUN = "dry_run"

STORAGE_TYPE_LOCAL = "local"
STORAGE_TYPE_IN_MEMORY = "in_memory"

IDX = "idx_index"


# Define the function directly in constants to avoid circular imports
def get_app_data_dir():
    r"""Returns a platform-specific directory for application data storage.

    Following platform conventions:
    - Linux: ~/.local/share/starfish
    - macOS: ~/Library/Application Support/starfish
    - Windows: %LOCALAPPDATA%\starfish

    Environment variable STARFISH_LOCAL_STORAGE_DIR can override this location.
    """
    # Allow override through environment variable
    env_dir = os.environ.get("STARFISH_LOCAL_STORAGE_DIR")
    if env_dir:
        return env_dir

    app_name = "starfish"

    # Get user's home directory
    home = Path.home()

    # Platform-specific paths
    if sys.platform == "win32":
        # Windows: Use %LOCALAPPDATA% if available, otherwise construct from home
        app_data = os.environ.get("LOCALAPPDATA")
        if not app_data:
            app_data = os.path.join(home, "AppData", "Local")
        base_dir = os.path.join(app_data, app_name)
    elif sys.platform == "darwin":
        # macOS
        base_dir = os.path.join(home, "Library", "Application Support", app_name)
    else:
        # Linux/Unix: follow XDG Base Directory Specification
        xdg_data_home = os.environ.get("XDG_DATA_HOME")
        if not xdg_data_home:
            xdg_data_home = os.path.join(home, ".local", "share")
        base_dir = os.path.join(xdg_data_home, app_name)

    return base_dir


# Get application database directory
APP_DATA_DIR = get_app_data_dir()
LOCAL_STORAGE_PATH = os.path.join(APP_DATA_DIR, "db")
LOCAL_STORAGE_URI = f"file://{LOCAL_STORAGE_PATH}"
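A small sketch of the STARFISH_LOCAL_STORAGE_DIR override described in the docstring above; the path is an assumed example and must be set before the constants module is imported, since the values are computed at import time.

import os

os.environ["STARFISH_LOCAL_STORAGE_DIR"] = "/tmp/starfish-demo"  # assumed example path

from starfish.data_factory.constants import LOCAL_STORAGE_PATH, LOCAL_STORAGE_URI

print(LOCAL_STORAGE_PATH)  # /tmp/starfish-demo/db
print(LOCAL_STORAGE_URI)   # file:///tmp/starfish-demo/db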
src/starfish/data_factory/event_loop.py
ADDED
@@ -0,0 +1,35 @@
import asyncio

import nest_asyncio

from starfish.common.logger import get_logger

logger = get_logger(__name__)


def run_in_event_loop(coroutine):
    """Run a coroutine in the event loop, handling both nested and new loop cases.

    Args:
        coroutine: The coroutine to be executed

    Returns:
        The result of the coroutine execution

    Note:
        If an event loop is already running, nest_asyncio will be used to allow
        nested execution. If no loop is running, a new event loop will be created.
    """
    try:
        # This call will raise a RuntimeError if there is no event loop running.
        asyncio.get_running_loop()

        # If there is an event loop running (the call above doesn't raise an exception), we can use nest_asyncio to patch the event loop.
        nest_asyncio.apply()
        logger.debug(f"Running nested coroutine: {coroutine.__name__}")
    except RuntimeError as e:
        # No event loop is running, so we simply fall through to asyncio.run below.
        logger.debug(str(e))
        logger.debug(f"Running coroutine: {coroutine.__name__}")
    return asyncio.run(coroutine)
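A minimal sketch of run_in_event_loop called from plain synchronous code; inside an already-running loop (for example a Jupyter cell), the same call works because nest_asyncio patches the loop first.

from starfish.data_factory.event_loop import run_in_event_loop

async def compute() -> int:
    return 41 + 1

# No loop is running here, so a new event loop is created via asyncio.run.
print(run_in_event_loop(compute()))  # 42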
src/starfish/data_factory/factory.py
ADDED
@@ -0,0 +1,112 @@
from typing import Any, Callable, Dict, List, Optional, cast
from starfish.common.logger import get_logger
from starfish.data_factory.config import NOT_COMPLETED_THRESHOLD, TASK_RUNNER_TIMEOUT
from starfish.data_factory.constants import STORAGE_TYPE_LOCAL
from starfish.data_factory.factory_ import Factory
from starfish.data_factory.factory_wrapper import FactoryWrapper, DataFactoryProtocol, P, T
from starfish.data_factory.factory_executor_manager import FactoryExecutorManager
from starfish.data_factory.utils.data_class import FactoryMasterConfig
from starfish.data_factory.utils.state import MutableSharedState

logger = get_logger(__name__)


def data_factory(
    storage: str = STORAGE_TYPE_LOCAL,
    batch_size: int = 1,
    target_count: int = 0,
    dead_queue_threshold: int = 3,
    max_concurrency: int = 10,
    initial_state_values: Optional[Dict[str, Any]] = None,
    on_record_complete: Optional[List[Callable]] = None,
    on_record_error: Optional[List[Callable]] = None,
    show_progress: bool = True,
    task_runner_timeout: int = TASK_RUNNER_TIMEOUT,
    job_run_stop_threshold: int = NOT_COMPLETED_THRESHOLD,
) -> Callable[[Callable[P, T]], DataFactoryProtocol[P, T]]:
    """Decorator for creating data processing pipelines.

    Args:
        storage: Storage backend to use ('local' or 'in_memory')
        batch_size: Number of records to process in each batch
        target_count: Target number of records to generate (0 means process all input)
        dead_queue_threshold: Failure threshold before a record is routed to the dead queue
        max_concurrency: Maximum number of concurrent tasks
        initial_state_values: Initial values for shared state
        on_record_complete: Callbacks to execute after successful record processing
        on_record_error: Callbacks to execute after failed record processing
        show_progress: Whether to display progress bar
        task_runner_timeout: Timeout in seconds for task execution
        job_run_stop_threshold: Threshold for stopping job if too many records fail

    Returns:
        Decorated function with additional execution methods
    """
    # Initialize default values
    on_record_error = on_record_error or []
    on_record_complete = on_record_complete or []
    initial_state_values = initial_state_values or {}

    # Create configuration
    config = FactoryMasterConfig(
        storage=storage,
        batch_size=batch_size,
        target_count=target_count,
        dead_queue_threshold=dead_queue_threshold,
        max_concurrency=max_concurrency,
        show_progress=show_progress,
        task_runner_timeout=task_runner_timeout,
        on_record_complete=on_record_complete,
        on_record_error=on_record_error,
        job_run_stop_threshold=job_run_stop_threshold,
    )

    # Initialize factory instance
    _factory = None

    def decorator(func: Callable[P, T]) -> DataFactoryProtocol[P, T]:
        """Actual decorator that wraps the function."""
        nonlocal _factory
        _factory = _initialize_or_update_factory(_factory, config, func, initial_state_values)
        wrapper = FactoryWrapper(_factory, func)
        return cast(DataFactoryProtocol[P, T], wrapper)

    # Add resume capability as a static method
    data_factory.resume_from_checkpoint = resume_from_checkpoint

    return decorator


def _initialize_or_update_factory(
    factory: Optional[Factory], config: FactoryMasterConfig, func: Callable[P, T], initial_state_values: Dict[str, Any]
) -> Factory:
    """Initialize or update a Factory instance."""
    if factory is None:
        factory = Factory(config, func)
        factory.state = MutableSharedState(initial_data=initial_state_values)
    else:
        factory.config = config
        factory.func = func
        factory.state = MutableSharedState(initial_data=initial_state_values)
    return factory


def resume_from_checkpoint(*args, **kwargs) -> List[Dict[str, Any]]:
    """Resume a previously started data_factory job from its checkpoint.

    Args:
        master_job_id: ID of the master job to resume
        storage: Storage backend to use ('local' or 'in_memory')
        batch_size: Number of records to process in each batch
        target_count: Target number of records to generate (0 means process all input)
        max_concurrency: Maximum number of concurrent tasks
        initial_state_values: Initial values for shared state
        on_record_complete: Callbacks to execute after successful record processing
        on_record_error: Callbacks to execute after failed record processing
        show_progress: Whether to display progress bar
        task_runner_timeout: Timeout in seconds for task execution
        job_run_stop_threshold: Threshold for stopping job if too many records fail

    Returns:
        List[Dict[str, Any]]: the records produced by the resumed job
    """
    return FactoryExecutorManager.resume(*args, **kwargs)
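A minimal sketch of the data_factory decorator above; the record-generating worker, its prompt, and the trailing run(...) call are assumptions for illustration (the exact execution methods come from FactoryWrapper, which is defined elsewhere in the repo).

from starfish import StructuredLLM, data_factory

@data_factory(storage="in_memory", max_concurrency=5)
async def generate_fact(topic: str):
    # Made-up worker: takes one input value and returns a list of record dicts.
    fact_llm = StructuredLLM(
        model_name="openai/gpt-4o-mini",
        prompt="Share one surprising fact about {{topic}}",
        output_schema=[{"name": "fact", "type": "str"}],
    )
    response = await fact_llm.run(topic=topic)
    return response.data

# The wrapped function gains execution helpers from FactoryWrapper, e.g. (assumed) a run(...) entry point:
# records = generate_fact.run(data=[{"topic": "penguins"}, {"topic": "honey"}])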
|