John-Jiang committed
Commit 5301c48 · 0 Parent(s)

init commit

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full file set.

Files changed (50)
  1. .dockerignore +118 -0
  2. .env.template +17 -0
  3. .github/dependabot.yml +6 -0
  4. .github/workflows/lint-and-test.yaml +60 -0
  5. .github/workflows/publish_pypi.yaml +39 -0
  6. .github/workflows/publish_testpypi.yaml +107 -0
  7. .gitignore +190 -0
  8. .gitmodules +6 -0
  9. .pre-commit-config.yaml +25 -0
  10. Dockerfile +79 -0
  11. LICENSE +202 -0
  12. Makefile +28 -0
  13. README.HuggingFace.md +177 -0
  14. README.md +193 -0
  15. docs_mintlify +1 -0
  16. examples/__init__.py +0 -0
  17. examples/data_factory.ipynb +681 -0
  18. examples/data_factory_release_check.ipynb +494 -0
  19. examples/embedding_usage_example.py +202 -0
  20. examples/structured_llm.ipynb +470 -0
  21. examples/usecases/math_data_gen.ipynb +0 -0
  22. internal +1 -0
  23. mcp_hackathon/README.md +119 -0
  24. mcp_hackathon/data_gen_server/.gitignore +1 -0
  25. mcp_hackathon/data_gen_server/.python-version +1 -0
  26. mcp_hackathon/data_gen_server/data_gen_server.py +68 -0
  27. mcp_hackathon/data_gen_server/model_gen.py +73 -0
  28. mcp_hackathon/data_gen_server/model_probe.py +65 -0
  29. nginx.conf +112 -0
  30. poetry.lock +0 -0
  31. prebuilt_template/README.md +61 -0
  32. prebuilt_template/function_calling/README.md +23 -0
  33. prebuilt_template/function_calling/sample_run.ipynb +425 -0
  34. prebuilt_template/generate_by_topic/README.md +102 -0
  35. prebuilt_template/generate_by_topic/sample_run.ipynb +438 -0
  36. pyproject.toml +132 -0
  37. pytest.ini +7 -0
  38. readme-web.md +23 -0
  39. scripts/hug_push.sh +1 -0
  40. scripts/rag.py +155 -0
  41. src/starfish/__init__.py +18 -0
  42. src/starfish/common/env_loader.py +52 -0
  43. src/starfish/common/exceptions.py +325 -0
  44. src/starfish/common/logger.py +104 -0
  45. src/starfish/components/__init__.py +3 -0
  46. src/starfish/components/prepare_topic.py +275 -0
  47. src/starfish/data_factory/config.py +6 -0
  48. src/starfish/data_factory/constants.py +75 -0
  49. src/starfish/data_factory/event_loop.py +35 -0
  50. src/starfish/data_factory/factory.py +112 -0
.dockerignore ADDED
@@ -0,0 +1,118 @@
+ # Git and version control
+ .git
+ .gitignore
+ .gitattributes
+
+ # Development files
+ .env*
+ !.env.example
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS files
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+ .venv/
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Node.js
+ web/node_modules/
+ web/npm-debug.log*
+ web/yarn-debug.log*
+ web/yarn-error.log*
+ web/.pnpm-debug.log*
+ web/.next/
+ web/out/
+ web/dist/
+ web/build/
+ web/.vercel
+
+ # Documentation
+ *.md
+ !README.md
+ docs/
+ docs_mintlify/
+ vibe_coding/
+
+ # Tests
+ tests/
+ *.test.js
+ *.test.ts
+ *.test.tsx
+ .coverage
+ htmlcov/
+ pytest.ini
+ .pytest_cache/
+
+ # Data and outputs
+ data_factory_output/
+ db/
+ *.db
+ *.sqlite
+
+ # Jupyter notebooks
+ *.ipynb
+ .ipynb_checkpoints/
+
+ # Temporary files
+ *.tmp
+ *.temp
+ .cache/
+
+ # Logs
+ *.log
+ web/api/logs/
+
+ # Docker files (except the main ones)
+ .dockerignore*
+ Dockerfile.*
+ docker-compose*.yml
+
+ # Development and internal files
+ internal/
+ examples/
+ mcp_hackathon/
+ prebuilt_template/
+ scripts/
+ htmlcov/
+
+ # Poetry (we copy these explicitly)
+ # poetry.lock - we need this
+ # pyproject.toml - we need this
.env.template ADDED
@@ -0,0 +1,17 @@
+ # Starfish Environment Variables
+ # Copy this file to .env and customize for your local environment
+ # DO NOT commit the .env file to version control
+
+ # Environment type (DEV, STAGING, PROD)
+ ENV=DEV
+
+ # API Keys (replace with your own)
+ OPENAI_API_KEY=your_openai_api_key_here
+ ANTHROPIC_API_KEY=your_anthropic_api_key_here
+ HUGGING_FACE_HUB_TOKEN=your_huggingface_token_here
+ TELEMETRY_ENABLED=true
+
+ # Logging
+ LOG_LEVEL=INFO
+ # STARFISH_LOCAL_STORAGE_DIR=
+ JINA_AI_API_KEY=jina_api_key
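For reference, these variables are typically read at runtime through `python-dotenv` (a dependency of `starfish-core`, per the notebook output later in this diff). A minimal sketch of consuming them, assuming the standard dotenv API rather than the project's actual `src/starfish/common/env_loader.py`:

```python
# Illustrative only: how the .env.template values might be consumed at runtime.
import os

from dotenv import load_dotenv  # python-dotenv

load_dotenv()  # reads .env from the current working directory, if present

ENV = os.getenv("ENV", "DEV")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
TELEMETRY_ENABLED = os.getenv("TELEMETRY_ENABLED", "true").lower() == "true"
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

if OPENAI_API_KEY is None:
    raise RuntimeError("OPENAI_API_KEY is not set; copy .env.template to .env and fill it in")
```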
.github/dependabot.yml ADDED
@@ -0,0 +1,6 @@
+ version: 2
+ updates:
+   - package-ecosystem: "pip"
+     directory: "/" # root of the repo
+     schedule:
+       interval: "weekly"
.github/workflows/lint-and-test.yaml ADDED
@@ -0,0 +1,60 @@
+ name: Starfish testing workflow
+
+ on:
+   # push:
+   #   branches:
+   #     - main
+   #     - dev
+   pull_request:
+     branches:
+       - main
+       - dev
+       - '!f/pypi_release'
+
+ jobs:
+   test-integration:
+     if: github.event.pull_request.head.ref != 'f/pypi_release'
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v2
+
+       - name: Set up Python
+         uses: actions/setup-python@v2
+         with:
+           python-version: '3.11'
+
+       - name: Load cached Poetry installation
+         uses: actions/cache@v3
+         with:
+           path: ~/.local
+           key: poetry-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}
+
+       - name: Load cached venv
+         uses: actions/cache@v3
+         with:
+           path: .venv
+           key: venv-${{ runner.os }}-python-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}
+
+       - name: Set Locale
+         run: |
+           sudo locale-gen "en_US.UTF-8"
+           export LC_ALL=en_US.UTF-8
+           export LANG=en_US.UTF-8
+           export TELEMETRY_ENABLED=false
+
+       - name: Install dependencies
+         run: |
+           pip install poetry
+           poetry install --with dev
+
+       # - name: Run ruff
+       #   run: |
+       #     poetry run ruff check . --output-format=github
+       #     poetry run ruff format . --check
+
+       # --cov-report=html
+       - name: Run tests with coverage
+         run: |
+           poetry run pytest --cov='src' --cov-fail-under=20 tests/
.github/workflows/publish_pypi.yaml ADDED
@@ -0,0 +1,39 @@
+ name: Publish to PyPI
+
+ on:
+   push:
+     tags:
+       - 'v*'
+     # branches:
+     #   - 'main'
+
+ jobs:
+   deploy:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+       - name: Verify tag is on main branch
+         run: |
+           TAG_NAME=${GITHUB_REF#refs/tags/}
+           COMMIT=$(git rev-parse $TAG_NAME)
+           if ! git branch --contains $COMMIT | grep -qw main; then
+             echo "::error::Tag $TAG_NAME must be created from main branch"
+             exit 1
+           fi
+       - name: Set up Python
+         uses: actions/setup-python@v4
+         with:
+           python-version: '3.x'
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           pip install build twine
+       - name: Build and publish
+         env:
+           TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+         run: |
+           python -m build
+           twine upload dist/*
.github/workflows/publish_testpypi.yaml ADDED
@@ -0,0 +1,107 @@
+ name: Publish to Test PyPI
+
+ on:
+   push:
+     tags:
+       - 'test-v*'
+     branches:
+       - 'f/pypi_release'
+
+ jobs:
+   deploy_testpypi:
+     #if: true
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0 # Required for full commit history check
+       - name: Verify tag is on dev branch
+         run: |
+           TAG_NAME=${GITHUB_REF#refs/tags/}
+           COMMIT=$(git rev-parse $TAG_NAME)
+           if ! git branch --contains $COMMIT | grep -qw dev; then
+             echo "::error::Tag $TAG_NAME must be created from dev branch"
+             exit 1
+           fi
+       - name: Set up Python
+         uses: actions/setup-python@v4
+         with:
+           python-version: '3.x'
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           pip install build twine
+       - name: Build and publish
+         env:
+           #TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
+           TWINE_USERNAME: __token__
+           TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
+           #ACTIONS_STEP_DEBUG: true
+         run: |
+           # echo "TWINE_PASSWORD first 5 chars: ${TWINE_PASSWORD:0:184}"
+           # echo "TWINE_PASSWORD length: ${#TWINE_PASSWORD}"
+           python -m build
+           twine upload --verbose --repository-url https://test.pypi.org/legacy/ dist/*
+
+   test-colab:
+     needs: deploy_testpypi
+     runs-on: ubuntu-latest
+     # a Public "Colab-like" Image
+     container:
+       image: jupyter/minimal-notebook:latest
+       options: --user root # Run as root to avoid permission issues
+     permissions:
+       contents: write
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           sparse-checkout: |
+             tests/*
+             examples/data_factory_release_check.ipynb
+           sparse-checkout-cone-mode: false
+       - name: Update system packages
+         run: |
+           apt-get update
+           apt-get install -y libssl3 # Removed sudo since we're running as root
+       - name: Print Python and Jupyter versions
+         run: |
+           python --version
+           pip list | grep -E 'jupyter|ipykernel|nbconvert|notebook'
+       # Authenticate to GCP
+       # - name: Authenticate to GCP
+       #   uses: google-github-actions/auth@v1
+       #   with:
+       #     credentials_json: ${{ secrets.GCP_SA_KEY }}
+
+       # # Configure Docker to use GCR credentials
+       # - name: Configure Docker for GCR
+       #   uses: google-github-actions/docker-auth@v1
+
+       # # Now you can pull the image
+       # - name: Use Colab base image
+       #   run: docker pull gcr.io/colab-images/base:latest
+
+       # --no-prompt --no-input \ suppress the output
+       - name: Run Colab-style tests
+         run: |
+           if ! jupyter nbconvert --execute --to notebook --inplace \
+             --ExecutePreprocessor.kernel_name=python3 \
+             --ExecutePreprocessor.timeout=120 \
+             --no-prompt --no-input \
+             --stdout \
+             examples/data_factory_release_check.ipynb; then
+             echo "::error::Notebook execution failed"
+             exit 1
+           fi
+           echo "Notebook executed successfully. Summary:" && \
+           jupyter nbconvert --to markdown --stdout \
+             examples/data_factory_release_check.ipynb | \
+             grep -E '^#|^##' || true
+
+       # Add tag deletion step
+       - name: Delete triggering tag after successful test
+         if: startsWith(github.ref, 'refs/tags/test-v')
+         run: |
+           gh api -X DELETE /repos/$GITHUB_REPOSITORY/git/refs/tags/${GITHUB_REF#refs/tags/}
+         env:
+           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
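The `Run Colab-style tests` step drives the release-check notebook through `jupyter nbconvert --execute`. The same check can be expressed in Python with `nbformat`/`nbclient`; a rough, illustrative equivalent (not what the workflow actually runs):

```python
# Sketch of the nbconvert --execute step using nbclient directly (illustrative only).
import sys

import nbformat
from nbclient import NotebookClient

nb = nbformat.read("examples/data_factory_release_check.ipynb", as_version=4)
client = NotebookClient(nb, timeout=120, kernel_name="python3")

try:
    client.execute()
except Exception as exc:  # CellExecutionError, kernel timeouts, etc.
    print(f"::error::Notebook execution failed: {exc}")
    sys.exit(1)

print("Notebook executed successfully.")
```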
.gitignore ADDED
@@ -0,0 +1,190 @@
+ # Adhoc stuff
+ web/node_modules/
+ web/.next/
+ web/public/
+ web/dist/
+ web/build/
+ web/out/
+ web/coverage/
+ web/logs/
+ web/.local/
+ web/.env
+
+ .serena/
+ docs/
+ /vibe_coding/response.md
+ /dev/
+ todo
+ .local/
+ .vscode/
+ db/
+ .ruff_cache/
+ data_factory_output/
+ examples/test_jupyter.ipynb
+ # *.ipynb
+ # .ipynb_checkpoints
+ .cursor
+
+ .DS_Store
+ */.DS_Store
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
.gitmodules ADDED
@@ -0,0 +1,6 @@
+ [submodule "internal"]
+     path = internal
+     url = https://github.com/starfishdata/starfish_internal.git
+ [submodule "docs_mintlify"]
+     path = docs_mintlify
+     url = https://github.com/starfishdata/docs.git
.pre-commit-config.yaml ADDED
@@ -0,0 +1,25 @@
+ repos:
+   # - repo: local
+   #   hooks:
+   #     - id: pytest
+   #       name: Run pytest
+   #       entry: poetry run pytest tests/
+   #       language: system
+   #       types: [python]
+   #       pass_filenames: false
+   #       always_run: true
+
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     # Ruff version.
+     rev: v0.8.6
+     hooks:
+       # Run the linter.
+       # - id: ruff
+       #   #args: [ --fix ]
+       #   types: [python]
+       # Run the formatter.
+       - id: ruff-format
+         #args: [ --fix ]
+         #run even when no Python files are staged
+         #always_run: true
+         types: [python]
Dockerfile ADDED
@@ -0,0 +1,79 @@
+ # Multi-stage build for combined frontend + backend
+ FROM node:18-alpine AS frontend-builder
+
+ WORKDIR /app
+
+ # Copy package files
+ COPY web/package*.json ./
+
+ # Install dependencies
+ RUN npm ci
+
+ # Copy frontend code and build
+ COPY web/ ./
+
+ # Clean up unnecessary files
+ RUN rm -rf api/ || true
+ RUN rm -rf storage/ || true
+ RUN rm -rf .git/ || true
+ RUN rm -rf .next/ || true
+ RUN rm -rf .local/ || true
+
+ # Build frontend
+ RUN npm run build
+
+ # Backend stage
+ FROM python:3.11-slim
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     nginx \
+     supervisor \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install Node.js for combined container
+ RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
+     apt-get install -y nodejs
+
+ WORKDIR /app
+
+ # Copy pyproject.toml and poetry.lock
+ COPY pyproject.toml poetry.lock ./
+
+ # Install Poetry and basic dependencies (skip heavy ML packages for testing)
+ RUN pip install --no-cache-dir --upgrade pip \
+     && pip install --no-cache-dir poetry \
+     && poetry config virtualenvs.create false \
+     && poetry install --only=main --no-root || pip install fastapi uvicorn python-dotenv pydantic
+
+ # Copy starfish source code and README (needed by backend)
+ COPY src/ ./src/
+ COPY README.md ./
+
+ # Copy built frontend from previous stage
+ COPY --from=frontend-builder /app/.next ./web/.next
+ COPY --from=frontend-builder /app/public ./web/public
+ COPY --from=frontend-builder /app/package.json ./web/package.json
+ #COPY --from=frontend-builder /app/node_modules ./web/node_modules
+
+ # Copy backend API code
+ COPY web/api/ ./web/api/
+
+ # Copy configuration files
+ COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
+ COPY nginx.conf /etc/nginx/nginx.conf
+
+ # Create necessary directories and set permissions
+ RUN mkdir -p /var/log/supervisor /var/log/nginx /var/run \
+     && chmod +x /app/src/ || true
+
+ # Expose port 7860 (required for Hugging Face Spaces)
+ EXPOSE 7860
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+     CMD curl -f http://localhost:7860/health || exit 1
+
+ # Start supervisor which manages both nginx and the applications
+ CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
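The HEALTHCHECK above curls `http://localhost:7860/health`, which must be answered by the backend behind the nginx/Next.js proxy. The actual handler lives in `web/api/` (not shown in this commit); a minimal FastAPI sketch of such an endpoint, purely illustrative:

```python
# Hypothetical /health endpoint; the real handler is in web/api/ and may differ.
from fastapi import FastAPI
import uvicorn

app = FastAPI()


@app.get("/health")
async def health() -> dict:
    # Kept trivial so the Docker HEALTHCHECK (curl -f .../health) gets a fast 200.
    return {"status": "ok"}


if __name__ == "__main__":
    # In the combined container the backend listens on 8002 behind nginx/Next.js.
    uvicorn.run(app, host="0.0.0.0", port=8002)
```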
LICENSE ADDED
@@ -0,0 +1,202 @@
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright 2025 Starfish AI Inc.
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
Makefile ADDED
@@ -0,0 +1,28 @@
+ lint:
+ 	@echo "Running Linter (Ruff)..."
+ 	poetry run isort tests/ src/ examples --check-only || poetry run isort tests/ src/ examples
+ 	poetry run ruff check src examples --fix --unsafe-fixes --exit-zero
+ 	poetry run ruff format src examples --check || poetry run ruff format src examples
+ docstring:
+ 	ruff check --select D src/starfish/data_factory
+ test:
+ 	poetry run pytest tests/
+
+ install: install-extras
+
+ #poetry install --extras "code_execution vllm" --with dev
+ # Install with specific extras
+ #make install EXTRAS="pdf"
+ # Install all extras
+ #make install EXTRAS="all"
+ # Install without extras (default)
+ #make install
+ install-extras:
+ 	@echo "Installing dependencies with extras: $(EXTRAS)"
+ 	poetry install $(if $(EXTRAS),--extras "$(EXTRAS)",) --with dev
+
+ start-client_claude:
+ 	python src/starfish/data_mcp/client_claude.py src/starfish/data_mcp/server.py
+
+ start-client_openai:
+ 	python src/starfish/data_mcp/client_openai.py
README.HuggingFace.md ADDED
@@ -0,0 +1,177 @@
+ # Hugging Face Spaces Deployment
+
+ This guide explains how to deploy your combined FastAPI backend and Next.js frontend to Hugging Face Spaces.
+
+ > ✅ **Build Status**: Docker build is working successfully with resolved path alias issues!
+
+ ## Overview
+
+ The combined `Dockerfile` (originally `Dockerfile.huggingface`) creates a single container that runs:
+ - **FastAPI backend** on port 8002
+ - **Next.js frontend** on port 3000
+ - **Nginx reverse proxy** on port 7860 (required by Hugging Face Spaces)
+ - **Supervisor** to manage all processes
+
+ ## Files for Hugging Face Spaces
+
+ 1. **`Dockerfile`** - Combined Dockerfile for both services (multi-stage build)
+ 2. **`nginx.conf`** - Nginx configuration for routing
+ 3. **`supervisord.conf`** - Process manager configuration
+ 4. **`.dockerignore`** - Optimized to exclude only necessary files
+ 5. **`next.config.js`** - Enhanced with webpack path alias configuration
+ 6. **`tsconfig.json`** - Updated with explicit path mappings
+
+ ## Deployment Steps
+
+ ### 1. Prepare Your Repository
+
+ Your repository is already configured with the correct `Dockerfile` for Hugging Face Spaces deployment.
+
+ ### 2. Set Environment Variables in Hugging Face Spaces
+
+ In your Hugging Face Space settings, add these secrets:
+ - `GOOGLE_API_KEY` - Your Google API key
+ - `OPENAI_API_KEY` - Your OpenAI API key
+
+ ### 3. Configure Your Space
+
+ - **Space Type**: Docker
+ - **Visibility**: Public or Private (your choice)
+ - **Hardware**: CPU Basic (or upgrade if needed)
+
+ ### 4. Update API URLs in Frontend
+
+ Make sure your frontend points to the correct API endpoints:
+ ```typescript
+ // In your frontend code, use relative URLs:
+ const API_BASE_URL = "/api" // This goes to Next.js API routes in src/app/api/
+
+ // Next.js API routes will then proxy to FastAPI using:
+ // SERVER_BASE_URL=http://localhost:8002 (set in Dockerfile)
+ ```
+
+ ### 5. Deploy
+
+ 1. Push your code to the Hugging Face Space repository
+ 2. The space will automatically build and deploy
+
+ ## How It Works
+
+ ### Architecture
+ ```
+ External Request :7860
+         ↓
+ Nginx Proxy
+         ↓
+ Next.js :3000 (handles ALL routes)
+         ↓
+ /api/* → src/app/api/ routes
+         ↓
+ proxy.ts uses SERVER_BASE_URL
+         ↓
+ FastAPI Backend :8002
+ ```
+
+ ### Port Mapping
+ - **7860** - Main port (required by Hugging Face Spaces)
+ - **3000** - Next.js frontend (internal) - handles all routing
+ - **8002** - FastAPI backend (internal) - accessed via Next.js proxy
+
+ ### URL Routing
+ - `/` - Next.js frontend (all routes handled by Next.js)
+ - `/api/*` - Next.js API routes (in `src/app/api/`) that proxy to the FastAPI backend
+ - `/backend-docs` - Direct FastAPI documentation (for debugging)
+ - `/backend-openapi.json` - Direct FastAPI OpenAPI schema (for debugging)
+
+ ### Process Management
+ Supervisor manages three processes:
+ 1. **backend** - FastAPI server (port 8002)
+ 2. **frontend** - Next.js server (port 3000) - handles all routing and proxying
+ 3. **nginx** - Reverse proxy (port 7860) - routes all traffic to Next.js
+
+ ## Troubleshooting
+
+ ### Common Issues
+
+ 1. **Build fails with "Module not found: Can't resolve '@/lib/utils'"**
+    - **FIXED**: This was caused by `lib/` being excluded in `.dockerignore`
+    - The issue has been resolved by removing the `lib/` exclusion pattern
+
+ 2. **Build fails during npm install**
+    - Check that all package.json dependencies are valid
+    - Ensure Node.js version compatibility
+
+ 3. **FastAPI fails to start**
+    - Check that environment variables are set
+    - Verify the starfish package is properly configured
+    - Check logs in the Space's logs tab
+
+ 4. **Frontend can't reach backend**
+    - Ensure API calls use relative URLs (`/api/...`)
+    - Check that `SERVER_BASE_URL=http://localhost:8002` is set in the Dockerfile
+    - Verify Next.js API routes in `src/app/api/` are proxying correctly
+    - For direct FastAPI access, use `/backend-docs` instead of `/docs`
+
+ 5. **Space shows "Application starting" indefinitely**
+    - Check supervisor logs for errors
+    - Verify all services are starting properly
+
+ ### Viewing Logs
+
+ In your Hugging Face Space:
+ 1. Go to the "Logs" tab
+ 2. Look for errors from supervisor, nginx, backend, or frontend
+ 3. Logs are also written to `/var/log/` in the container
+
+ ### Local Testing
+
+ Test the Hugging Face build locally:
+ ```bash
+ # Build the image
+ docker build -t starfishai-web .
+
+ # Run with environment variables
+ docker run -p 7860:7860 -p 3000:3000 -p 8002:8002 \
+    -e GOOGLE_API_KEY=your_key \
+    -e OPENAI_API_KEY=your_key \
+    starfishai-web
+ ```
+
+ Then visit:
+ - http://localhost:7860 - Main application
+ - http://localhost:7860/backend-docs - Direct FastAPI documentation
+ - http://localhost:7860/backend-openapi.json - Direct FastAPI schema
+
+ ## Recent Fixes & Improvements
+
+ ### Path Alias Resolution Fixed
+ - **Issue**: Build was failing with `Module not found: Can't resolve '@/lib/utils'`
+ - **Root Cause**: The `.dockerignore` file was excluding the `lib/` directory
+ - **Solution**: Removed `lib/` from `.dockerignore` and enhanced path configuration
+ - **Files Updated**:
+   - `.dockerignore` - Removed generic `lib/` exclusion
+   - `next.config.js` - Added explicit webpack path aliases
+   - `tsconfig.json` - Enhanced path mappings
+
+ ### Docker Build Optimization
+ - **Multi-stage build** for optimal image size
+ - **Specific Python exclusions** in `.dockerignore` (e.g., `api/__pycache__/` instead of all `__pycache__/`)
+ - **Enhanced file copying strategy** during build
+
+ ## Performance Tips
+
+ 1. **Use CPU Basic** for development, upgrade for production
+ 2. **Optimize the Docker image** by removing unnecessary files
+ 3. **Use caching** for build dependencies
+ 4. **Monitor resource usage** in the Space dashboard
+
+ ## Security Notes
+
+ - Never commit API keys to your repository
+ - Use Hugging Face Spaces secrets for sensitive environment variables
+ - Consider making your Space private if it contains sensitive data
+ - Regularly update dependencies for security patches
+
+ docker run -d -p 7860:7860 --name starfish-app -v $(pwd)/nginx.conf:/etc/nginx/nginx.conf -v $(pwd)/supervisord.conf:/etc/supervisor/conf.d/supervisord.conf starfish-app
+
+ docker build -t starfish-app .
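The "Local Testing" section above lists the URLs to check by hand; a small Python sketch that smoke-tests the same endpoints with `requests`, assuming the container from `docker run` is already up and the routes behave as described:

```python
# Quick smoke test for a locally running container (illustrative; adjust as needed).
import requests

BASE = "http://localhost:7860"

for path in ("/", "/health", "/backend-docs", "/backend-openapi.json"):
    resp = requests.get(BASE + path, timeout=10)
    print(f"{path}: {resp.status_code}")
    resp.raise_for_status()  # fail loudly if any endpoint is unhealthy
```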
README.md ADDED
@@ -0,0 +1,193 @@
+ ---
+ title: Starfish - Synthetic Data Generation
+ emoji: 🌟
+ colorFrom: pink
+ colorTo: blue
+ sdk: docker
+ sdk_version: "4.36.0"
+ ---
+
+ <p align="center">
+   <img src="https://github.com/user-attachments/assets/744c666a-bb5c-418b-aab4-162072c0b8c8" alt="Starfish Logo" width="200"/>
+ </p>
+ <h1 align="center">Starfish</h1>
+ <h3 align="center" style="font-size: 20px; margin-bottom: 4px">Synthetic Data Generation Made Easy</h3>
+ <br/>
+
+ <div align="center">
+
+ [![Github](https://img.shields.io/badge/starfish-black?style=for-the-badge&logo=github&color=black)](https://github.com/starfishdata/starfish) [![X](https://img.shields.io/badge/starfishdata-black?style=for-the-badge&logo=x&color=black&link=https%3A%2F%2Fx.com%2Fstarfishdata)](https://x.com/starfishdata) [![Hugging Face](https://img.shields.io/badge/starfishdata-yellow?style=for-the-badge&logo=huggingface&labelColor=black&color=black)](https://huggingface.co/starfishdata) [![Discord](https://img.shields.io/badge/starfishdata-yellow?style=for-the-badge&logo=discord&logoColor=white&labelColor=%235865F2&color=%235865F2)](https://discord.gg/qWKmeUtb)
+ <br>
+ [![Website](https://img.shields.io/badge/starfishdata-yellow?style=for-the-badge&label=SITE&labelColor=%23DB2777&color=%23FDF2F8)](https://starfishdata.ai/)
+ [![Docs](https://img.shields.io/badge/docs-pink?style=for-the-badge&label=Deepwiki&labelColor=%23da2876&color=%23fdf2f8&link=https%3A%2F%2Fdeepwiki.com%2Fstarfishdata%2Fstarfish%2F1-overview)](https://deepwiki.com/starfishdata/starfish/1-overview)
+ </div>
+
+ ## Overview
+
+ Starfish is a Python library that helps you build synthetic data your way. We adapt to your workflow—not the other way around. By combining structured LLM outputs with efficient parallel processing, Starfish lets you define exactly how your data should look and scale seamlessly from experiments to production.
+
+ ⭐ Star us on GitHub if you find this project useful!
+
+ Key Features:
+ - **Structured Outputs**: First-class support for structured data through JSON schemas or Pydantic models.
+ - **Model Flexibility**: Use any LLM provider—local models, OpenAI, Anthropic, or your own implementation via LiteLLM.
+ - **Dynamic Prompts**: Template your prompts with built-in Jinja2 support.
+ - **Easy Scaling**: Transform any function to run in parallel across thousands of inputs with a single decorator.
+ - **Resilient Pipeline**: Automatic retries, error handling, and job resumption—pause and continue your data generation anytime.
+ - **Complete Control**: Share state across your pipeline and extend functionality with custom hooks.
+
+ **Official Website**: [starfishdata.ai](https://starfishdata.ai/) - We offer both self-service and managed solutions. Visit our website to explore our services or contact us for more options!
+
+ ## Installation
+
+ ```bash
+ pip install starfish-core
+ ```
+
+ ### Optional Dependencies
+
+ Starfish supports optional dependencies for specific file parsers. Install only what you need:
+
+ ```bash
+ # Install specific parsers
+ pip install "starfish-core[pdf]"     # PDF support
+ pip install "starfish-core[docx]"    # Word document support
+ pip install "starfish-core[ppt]"     # PowerPoint support
+ pip install "starfish-core[excel]"   # Excel support
+ pip install "starfish-core[youtube]" # YouTube support
+
+ # Install all parser dependencies
+ pip install "starfish-core[all]"
+ ```
+
+ ## Configuration
+
+ Starfish uses environment variables for configuration. We provide a `.env.template` file to help you get started quickly:
+
+ ```bash
+ # Copy the template to .env
+ cp .env.template .env
+
+ # Edit with your API keys and configuration
+ nano .env  # or use your preferred editor
+ ```
+
+ The template includes settings for API keys, model configurations, and other runtime parameters.
+
+ ## Quick Start
+
+ ### Structured LLM - Type-Safe Outputs from Any Model
+
+ ```python
+ # 1. Define structured outputs with a schema
+ from starfish import StructuredLLM
+ from pydantic import BaseModel
+
+ # Option A: Use Pydantic for type safety
+ class QnASchema(BaseModel):
+     question: str
+     answer: str
+
+ # Option B: Or use a simple JSON schema
+ json_schema = [
+     {'name': 'question', 'type': 'str'},
+     {'name': 'answer', 'type': 'str'},
+ ]
+
+ # 2. Create a structured LLM with your preferred output format
+ qna_llm = StructuredLLM(
+     model_name="openai/gpt-4o-mini",
+     prompt="Generate facts about {{city}}",
+     output_schema=QnASchema  # or json_schema
+ )
+
+ # 3. Get structured responses
+ response = await qna_llm.run(city="San Francisco")
+
+ # Access typed data
+ print(response.data)
+ # [{'question': 'What is the iconic symbol of San Francisco?',
+ #   'answer': 'The Golden Gate Bridge is the iconic symbol of San Francisco, completed in 1937.'}]
+
+ # Access the raw API response for complete flexibility
+ print(response.raw)  # Full API object with function calls, reasoning tokens, etc.
+ ```
+
+ ### Data Factory - Scale Any Workflow with One Decorator
+
+ ```python
+ # Turn any function into a scalable data pipeline
+ from starfish import data_factory
+
+ # Works with any function - simple or complex workflows
+ @data_factory(max_concurrency=50)
+ async def parallel_qna_llm(city):
+     # This could be any arbitrarily complex workflow:
+     # - Pre-processing
+     # - Multiple LLM calls
+     # - Post-processing
+     # - Error handling
+     response = await qna_llm.run(city=city)
+     return response.data
+
+ # Process 100 cities with 50 concurrent workers - finishes in seconds
+ cities = ["San Francisco", "New York", "Tokyo", "Paris", "London"] * 20
+ results = parallel_qna_llm.run(city=cities)
+
+ # Dry run to test the workflow and data
+ results = parallel_qna_llm.dry_run(city=cities)
+
+ # Resume a job, picking up from where it left off
+ results = parallel_qna_llm.resume()
+ ```
+
+ ### Examples
+
+ Check out our example notebooks for detailed walkthroughs:
+ - [Structured LLM Examples](examples/structured_llm.ipynb)
+ - [Data Factory Examples](examples/data_factory.ipynb)
+
+ ## Documentation
+
+ Comprehensive documentation is on the way!
+
+ ## Contributing
+
+ We'd love your help making Starfish better! Whether you're fixing bugs, adding features, or improving documentation, your contributions are welcome.
+
+ 1. Fork the repository
+ 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
+ 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
+ 5. Open a Pull Request
+
+ Contribution guidelines coming soon!
+
+ ## License
+
+ This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
+
+ ## Contact
+
+ If you have any questions or feedback, feel free to reach out to us at [[email protected]](mailto:[email protected]).
+
+ Want to discuss your use case directly? [Schedule a meeting with our team](https://calendly.com/d/crsb-ckq-fv2/chat-with-starfishdata-team).
+
+ ## Telemetry
+
+ Starfish collects minimal and anonymous telemetry data to help improve the library. Participation is optional and you can opt out by setting `TELEMETRY_ENABLED=false` in your environment variables.
+
+ ## Citation
+
+ If you use Starfish in your research, please consider citing us!
+
+ ```
+ @software{starfish,
+   author = {Wendao, John, Ayush},
+   title = {{Starfish: A Tool for Synthetic Data Generation}},
+   year = {2025},
+   url = {https://github.com/starfishdata/starfish},
+ }
+ ```
+
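The Quick Start snippets above use top-level `await` and are written for a notebook. A self-contained script version, based only on the README examples (the event-loop wiring and the smaller `max_concurrency` value are assumptions, not documented behavior):

```python
# Combined Quick Start sketch; requires OPENAI_API_KEY in the environment / .env.
import asyncio

from pydantic import BaseModel
from starfish import StructuredLLM, data_factory


class QnASchema(BaseModel):
    question: str
    answer: str


qna_llm = StructuredLLM(
    model_name="openai/gpt-4o-mini",
    prompt="Generate facts about {{city}}",
    output_schema=QnASchema,
)


@data_factory(max_concurrency=10)
async def parallel_qna_llm(city):
    # Any pre/post-processing could go here, as noted in the README.
    response = await qna_llm.run(city=city)
    return response.data


if __name__ == "__main__":
    # Single structured call, driven by an explicit event loop in script form.
    single = asyncio.run(qna_llm.run(city="San Francisco"))
    print(single.data)

    # Scaled run across several inputs; data_factory manages its own event loop,
    # so .run() is called synchronously as in the README example.
    results = parallel_qna_llm.run(city=["Tokyo", "Paris", "London"])
    print(f"{len(results)} records generated")
```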
docs_mintlify ADDED
@@ -0,0 +1 @@
+ Subproject commit 6ad0ad5eda1fc3637fde8d0da24f0d3fd4263453
examples/__init__.py ADDED
File without changes
examples/data_factory.ipynb ADDED
@@ -0,0 +1,681 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Google Colab Version: [Open this notebook in Google Colab](https://colab.research.google.com/github/starfishdata/starfish/blob/main/examples/data_factory.ipynb)"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "#### Dependencies "
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 11,
20
+ "metadata": {},
21
+ "outputs": [
22
+ {
23
+ "name": "stdout",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "Requirement already satisfied: starfish-core in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (0.1.0)\n",
27
+ "Requirement already satisfied: aiofiles<25.0.0,>=24.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (24.1.0)\n",
28
+ "Requirement already satisfied: aiosqlite<0.22.0,>=0.21.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.21.0)\n",
29
+ "Requirement already satisfied: cachetools<6.0.0,>=5.5.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (5.5.2)\n",
30
+ "Requirement already satisfied: litellm<2.0.0,>=1.65.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (1.65.1)\n",
31
+ "Requirement already satisfied: loguru<0.8.0,>=0.7.3 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.7.3)\n",
32
+ "Requirement already satisfied: ollama<0.5.0,>=0.4.7 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.4.7)\n",
33
+ "Requirement already satisfied: platformdirs<5.0.0,>=4.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (4.3.7)\n",
34
+ "Requirement already satisfied: psutil<8.0.0,>=7.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (7.0.0)\n",
35
+ "Requirement already satisfied: python-dotenv<2.0.0,>=1.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (1.1.0)\n",
36
+ "Requirement already satisfied: typing-extensions<5.0.0,>=4.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (4.13.0)\n",
37
+ "Requirement already satisfied: aiohttp in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.11.16)\n",
38
+ "Requirement already satisfied: click in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.1.8)\n",
39
+ "Requirement already satisfied: httpx>=0.23.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.28.1)\n",
40
+ "Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.6.1)\n",
41
+ "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.1.6)\n",
42
+ "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (4.23.0)\n",
43
+ "Requirement already satisfied: openai>=1.68.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (1.70.0)\n",
44
+ "Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (2.11.1)\n",
45
+ "Requirement already satisfied: tiktoken>=0.7.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n",
46
+ "Requirement already satisfied: tokenizers in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.21.1)\n",
47
+ "Requirement already satisfied: anyio in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (4.9.0)\n",
48
+ "Requirement already satisfied: certifi in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (2025.1.31)\n",
49
+ "Requirement already satisfied: httpcore==1.* in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (1.0.7)\n",
50
+ "Requirement already satisfied: idna in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.10)\n",
51
+ "Requirement already satisfied: h11<0.15,>=0.13 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.14.0)\n",
52
+ "Requirement already satisfied: zipp>=3.20 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from importlib-metadata>=6.8.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.21.0)\n",
53
+ "Requirement already satisfied: MarkupSafe>=2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jinja2<4.0.0,>=3.1.2->litellm<2.0.0,>=1.65.1->starfish-core) (3.0.2)\n",
54
+ "Requirement already satisfied: attrs>=22.2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (25.3.0)\n",
55
+ "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (2024.10.1)\n",
56
+ "Requirement already satisfied: referencing>=0.28.4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.36.2)\n",
57
+ "Requirement already satisfied: rpds-py>=0.7.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.24.0)\n",
58
+ "Requirement already satisfied: distro<2,>=1.7.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (1.9.0)\n",
59
+ "Requirement already satisfied: jiter<1,>=0.4.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n",
60
+ "Requirement already satisfied: sniffio in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (1.3.1)\n",
61
+ "Requirement already satisfied: tqdm>4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (4.67.1)\n",
62
+ "Requirement already satisfied: annotated-types>=0.6.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.7.0)\n",
63
+ "Requirement already satisfied: pydantic-core==2.33.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.33.0)\n",
64
+ "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.4.0)\n",
65
+ "Requirement already satisfied: regex>=2022.1.18 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2024.11.6)\n",
66
+ "Requirement already satisfied: requests>=2.26.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.32.3)\n",
67
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (2.6.1)\n",
68
+ "Requirement already satisfied: aiosignal>=1.1.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.3.2)\n",
69
+ "Requirement already satisfied: frozenlist>=1.1.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.5.0)\n",
70
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (6.3.1)\n",
71
+ "Requirement already satisfied: propcache>=0.2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (0.3.1)\n",
72
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.18.3)\n",
73
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (0.30.1)\n",
74
+ "Requirement already satisfied: filelock in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (3.18.0)\n",
75
+ "Requirement already satisfied: fsspec>=2023.5.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (2025.3.2)\n",
76
+ "Requirement already satisfied: packaging>=20.9 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (24.2)\n",
77
+ "Requirement already satisfied: pyyaml>=5.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (6.0.2)\n",
78
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.4.1)\n",
79
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.3.0)\n",
80
+ "Note: you may need to restart the kernel to use updated packages.\n"
81
+ ]
82
+ }
83
+ ],
84
+ "source": [
85
+ "%pip install starfish-core"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 1,
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "## Fix for Jupyter Notebook only — do NOT use in production\n",
95
+ "## Enables async code execution in notebooks, but may cause issues with sync/async issues\n",
96
+ "## For production, please run in standard .py files without this workaround\n",
97
+ "## See: https://github.com/erdewit/nest_asyncio for more details\n",
98
+ "import nest_asyncio\n",
99
+ "nest_asyncio.apply()\n",
100
+ "\n",
101
+ "from starfish import StructuredLLM, data_factory\n",
102
+ "from starfish.llm.utils import merge_structured_outputs\n",
103
+ "\n",
104
+ "from starfish.common.env_loader import load_env_file ## Load environment variables from .env file\n",
105
+ "load_env_file()"
106
+ ]
107
+ },
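+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a minimal sketch of the production-style alternative mentioned above (not part of the original notebook, and assuming the same `StructuredLLM` usage shown later in these examples), a standalone `.py` script can drive the async code with `asyncio.run`, with no `nest_asyncio` workaround needed:\n",
+ "\n",
+ "```python\n",
+ "# generate.py (hypothetical script name)\n",
+ "import asyncio\n",
+ "\n",
+ "from starfish import StructuredLLM\n",
+ "from starfish.common.env_loader import load_env_file\n",
+ "\n",
+ "async def main():\n",
+ "    load_env_file()  # load API keys from .env, as in the notebook\n",
+ "    llm = StructuredLLM(\n",
+ "        model_name=\"openai/gpt-4o-mini\",\n",
+ "        prompt=\"Funny facts about city {{city_name}}.\",\n",
+ "        output_schema=[{\"name\": \"fact\", \"type\": \"str\"}],\n",
+ "    )\n",
+ "    response = await llm.run(city_name=\"New York\")\n",
+ "    print(response.data)\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ "    asyncio.run(main())\n",
+ "```"
+ ]
+ },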
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 2,
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "# setup your openai api key if not already set\n",
115
+ "# import os\n",
116
+ "# os.environ[\"OPENAI_API_KEY\"] = \"your_key_here\"\n",
117
+ "\n",
118
+ "# If you dont have any API key, please navigate to local model section"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": 3,
124
+ "metadata": {},
125
+ "outputs": [],
126
+ "source": [
127
+ "## Helper function mock llm call\n",
128
+ "# When developing data pipelines with LLMs, making thousands of real API calls\n",
129
+ "# can be expensive. Using mock LLM calls lets you test your pipeline's reliability,\n",
130
+ "# failure handling, and recovery without spending money on API calls.\n",
131
+ "from starfish.data_factory.utils.mock import mock_llm_call"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "markdown",
136
+ "metadata": {},
137
+ "source": [
138
+ "#### 1. Your First Data Factory: Simple Scaling\n",
139
+ "\n",
140
+ "The @data_factory decorator transforms any async function into a scalable data processing pipeline.\n",
141
+ "It handles:\n",
142
+ "- Parallel execution \n",
143
+ "- Automatic batching\n",
144
+ "- Error handling & retries\n",
145
+ "- Progress tracking\n",
146
+ "\n",
147
+ "Let's start with a single LLM call and then show how easy it is to scale it.\n"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": 4,
153
+ "metadata": {},
154
+ "outputs": [
155
+ {
156
+ "data": {
157
+ "text/plain": [
158
+ "[{'fact': 'New Yorkers consume around 1,000,000 slices of pizza every day, which means if you laid them all in a line, they would stretch from the Statue of Liberty to the Eiffel Tower... and back!'}]"
159
+ ]
160
+ },
161
+ "execution_count": 4,
162
+ "metadata": {},
163
+ "output_type": "execute_result"
164
+ }
165
+ ],
166
+ "source": [
167
+ "# First, create a StructuredLLM instance for generating facts about cities\n",
168
+ "json_llm = StructuredLLM(\n",
169
+ " model_name = \"openai/gpt-4o-mini\",\n",
170
+ " prompt = \"Funny facts about city {{city_name}}.\",\n",
171
+ " output_schema = [{'name': 'fact', 'type': 'str'}],\n",
172
+ " model_kwargs = {\"temperature\": 0.7},\n",
173
+ ")\n",
174
+ "\n",
175
+ "json_llm_response = await json_llm.run(city_name='New York')\n",
176
+ "json_llm_response.data"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": 5,
182
+ "metadata": {},
183
+ "outputs": [
184
+ {
185
+ "name": "stdout",
186
+ "output_type": "stream",
187
+ "text": [
188
+ "\u001b[32m2025-04-25 10:16:32\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 8c926411-63e7-4dc6-98c9-861c3489fb8b\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
189
+ "\u001b[32m2025-04-25 10:16:32\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
190
+ "Processing New York at 2025-04-25 10:16:32.524033\n",
191
+ "Processing London at 2025-04-25 10:16:32.524286\n",
192
+ "Processing Tokyo at 2025-04-25 10:16:32.524979\n",
193
+ "Processing Paris at 2025-04-25 10:16:32.525535\n",
194
+ "Processing Sydney at 2025-04-25 10:16:32.526729\n",
195
+ "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
196
+ "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
197
+ ]
198
+ },
199
+ {
200
+ "data": {
201
+ "text/plain": [
202
+ "[{'fact': \"In Tokyo, there's a train station called 'Shinjuku' that handles more passengers each day than the entire population of the United States!\"},\n",
203
+ " {'fact': \"London has a 'secret' underground city known as the 'London Stone', which is said to have magical powers, making it one of the city's most famous and quirky legends!\"},\n",
204
+ " {'fact': 'In Paris, you can legally marry a dead person! This quirky law allows for posthumous marriages, as long as you can prove that the deceased had intended to marry you before their untimely demise.'},\n",
205
+ " {'fact': 'In New York City, there are more than 25,000 licensed taxis, but only about 1,200 of them are actually yellow. The rest are a rainbow of colors, including pink, blue, and even animal print!'},\n",
206
+ " {'fact': 'Sydney has a beach where you can surf, swim, and even watch a film – all in one day! Just don’t forget your sunscreen and popcorn!'}]"
207
+ ]
208
+ },
209
+ "execution_count": 5,
210
+ "metadata": {},
211
+ "output_type": "execute_result"
212
+ }
213
+ ],
214
+ "source": [
215
+ "# Now, scale to multiple cities using data_factory\n",
216
+ "# Just add the @data_factory decorator to process many cities in parallel\n",
217
+ "\n",
218
+ "from datetime import datetime\n",
219
+ "@data_factory(max_concurrency=10)\n",
220
+ "async def process_json_llm(city_name: str):\n",
221
+ " ## Adding a print statement to indicate the start of the processing\n",
222
+ " print(f\"Processing {city_name} at {datetime.now()}\")\n",
223
+ " json_llm_response = await json_llm.run(city_name=city_name)\n",
224
+ " return json_llm_response.data\n",
225
+ "\n",
226
+ "# This is all it takes to scale from one city to many cities!\n",
227
+ "process_json_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"])"
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "markdown",
232
+ "metadata": {},
233
+ "source": [
234
+ "#### 2. Works with any aysnc function\n",
235
+ "\n",
236
+ "Data Factory works with any async function, not just LLM calls, you can build complex pipelines involving multiple LLMs, data processing, etc.\n",
237
+ "\n",
238
+ "Here is example of two chained structured llm"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": 6,
244
+ "metadata": {},
245
+ "outputs": [
246
+ {
247
+ "name": "stdout",
248
+ "output_type": "stream",
249
+ "text": [
250
+ "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 466fca03-85a2-46de-b135-629cd76738f7\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
251
+ "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
252
+ "\u001b[32m2025-04-25 10:16:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
253
+ "\u001b[32m2025-04-25 10:16:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
254
+ "\u001b[32m2025-04-25 10:16:43\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 2/3\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 2\u001b[0m (\u001b[32mCompleted: 2\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
255
+ "\u001b[32m2025-04-25 10:16:44\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
256
+ "\u001b[32m2025-04-25 10:16:44\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 3/3\u001b[0m | \u001b[33mAttempted: 3\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
257
+ ]
258
+ }
259
+ ],
260
+ "source": [
261
+ "# Example of a more complex function that chains multiple LLM calls\n",
262
+ "# This was grabbed from structured llm examples \n",
263
+ "\n",
264
+ "@data_factory(max_concurrency=10)\n",
265
+ "async def complex_process_cities(topic: str):\n",
266
+ " ## topic → generator_llm → rating_llm → merged results\n",
267
+ " # First LLM to generate question/answer pairs\n",
268
+ " generator_llm = StructuredLLM(\n",
269
+ " model_name=\"openai/gpt-4o-mini\",\n",
270
+ " prompt=\"Generate question/answer pairs about {{topic}}.\",\n",
271
+ " output_schema=[\n",
272
+ " {\"name\": \"question\", \"type\": \"str\"},\n",
273
+ " {\"name\": \"answer\", \"type\": \"str\"}\n",
274
+ " ],\n",
275
+ " )\n",
276
+ "\n",
277
+ " # Second LLM to rate the generated pairs\n",
278
+ " rater_llm = StructuredLLM(\n",
279
+ " model_name=\"openai/gpt-4o-mini\",\n",
280
+ " prompt='''Rate the following Q&A pairs based on accuracy and clarity (1-10).\n",
281
+ " Pairs: {{generated_pairs}}''',\n",
282
+ " output_schema=[\n",
283
+ " {\"name\": \"accuracy_rating\", \"type\": \"int\"},\n",
284
+ " {\"name\": \"clarity_rating\", \"type\": \"int\"}\n",
285
+ " ],\n",
286
+ " model_kwargs={\"temperature\": 0.5}\n",
287
+ ")\n",
288
+ "\n",
289
+ " generation_response = await generator_llm.run(topic=topic, num_records=5)\n",
290
+ " rating_response = await rater_llm.run(generated_pairs=generation_response.data)\n",
291
+ " \n",
292
+ " # Merge the results\n",
293
+ " return merge_structured_outputs(generation_response.data, rating_response.data)\n",
294
+ "\n",
295
+ "\n",
296
+ "### To save on token here we only use 3 topics as example\n",
297
+ "complex_process_cities_data = complex_process_cities.run(topic=['Science', 'History', 'Technology'])"
298
+ ]
299
+ },
300
+ {
301
+ "cell_type": "code",
302
+ "execution_count": 7,
303
+ "metadata": {},
304
+ "outputs": [
305
+ {
306
+ "name": "stdout",
307
+ "output_type": "stream",
308
+ "text": [
309
+ "15\n",
310
+ "[{'question': 'What is the primary function of a CPU in a computer?', 'answer': 'The CPU, or Central Processing Unit, is responsible for executing instructions and processing data in a computer system.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What does IoT stand for and what is its significance?', 'answer': 'IoT stands for Internet of Things, which refers to the interconnection of everyday devices to the internet, allowing them to send and receive data, thereby enhancing efficiency and convenience.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the difference between RAM and ROM?', 'answer': 'RAM (Random Access Memory) is volatile memory that temporarily stores data and applications currently in use, while ROM (Read-Only Memory) is non-volatile memory that permanently stores firmware and system software.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is cloud computing?', 'answer': 'Cloud computing is the delivery of computing services over the internet, enabling users to access and store data and applications on remote servers rather than on local computers.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What are the benefits of using artificial intelligence in business?', 'answer': 'Artificial intelligence can enhance efficiency, improve decision-making, personalize customer experiences, automate repetitive tasks, and generate insights from data analytics in business operations.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the chemical formula for water?', 'answer': 'The chemical formula for water is H2O.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the process by which plants make their own food?', 'answer': 'The process by which plants make their own food is called photosynthesis.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the speed of light in a vacuum?', 'answer': 'The speed of light in a vacuum is approximately 299,792 kilometers per second (or about 186,282 miles per second).', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the smallest unit of life?', 'answer': 'The smallest unit of life is the cell.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What gas do living organisms need for respiration?', 'answer': 'Living organisms need oxygen for respiration.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What was the primary cause of World War I?', 'answer': 'The primary cause of World War I was the complex system of alliances, militarism, imperialism, and nationalism, which escalated tensions following the assassination of Archduke Franz Ferdinand of Austria in 1914.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'Who was the first President of the United States?', 'answer': 'George Washington was the first President of the United States, serving from April 30, 1789, to March 4, 1797.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What year did the Berlin Wall fall?', 'answer': 'The Berlin Wall fell on November 9, 1989, symbolizing the end of the Cold War and the division between East and West Germany.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'Which ancient civilization is known for creating the first known writing system?', 'answer': 'The Sumerians, who inhabited ancient Mesopotamia around 3500 BCE, are known for creating the first known writing system called cuneiform.', 'accuracy_rating': 10, 'clarity_rating': 10}, 
{'question': 'What was the significance of the Magna Carta?', 'answer': 'The Magna Carta, signed in 1215, was significant because it limited the power of the monarchy and established the principle that everyone, including the king, was subject to the law.', 'accuracy_rating': 10, 'clarity_rating': 10}]\n"
311
+ ]
312
+ }
313
+ ],
314
+ "source": [
315
+ "### Each topic has 5 question/answer pairs so 3 topics has 15 pairs!\n",
316
+ "print(len(complex_process_cities_data))\n",
317
+ "print(complex_process_cities_data)"
318
+ ]
319
+ },
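+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here is the promised non-LLM sketch (not part of the original examples, and assuming only that the decorated function returns a list of dicts, as every example above does). Any async workload can be scaled the same way, for instance an HTTP fetch with `aiohttp`; the URLs are placeholders:\n",
+ "\n",
+ "```python\n",
+ "import aiohttp\n",
+ "\n",
+ "@data_factory(max_concurrency=10)\n",
+ "async def fetch_status(url: str):\n",
+ "    # Fetch each URL and return a list of dicts, matching the shape of the LLM examples.\n",
+ "    async with aiohttp.ClientSession() as session:\n",
+ "        async with session.get(url) as resp:\n",
+ "            return [{\"url\": url, \"status\": resp.status}]\n",
+ "\n",
+ "# fetch_status.run(url=[\"https://example.com\", \"https://example.org\"])\n",
+ "```"
+ ]
+ },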
320
+ {
321
+ "cell_type": "markdown",
322
+ "metadata": {},
323
+ "source": [
324
+ "#### 3. Working with Different Input Formats\n",
325
+ "\n",
326
+ "\n",
327
+ "Data Factory is flexible with how you provide inputs. Let's demonstrate different ways to pass parameters to data_factory functions.\n",
328
+ "\n",
329
+ "'data' is a reserved keyword expecting list(dict) or tuple(dict) - this design make it super easy to pass large data and support HuggingFace and Pandas dataframe very easily"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "execution_count": 8,
335
+ "metadata": {},
336
+ "outputs": [
337
+ {
338
+ "data": {
339
+ "text/plain": [
340
+ "[{'answer': 'New York_5'}, {'answer': 'New York_2'}, {'answer': 'New York_3'}]"
341
+ ]
342
+ },
343
+ "execution_count": 8,
344
+ "metadata": {},
345
+ "output_type": "execute_result"
346
+ }
347
+ ],
348
+ "source": [
349
+ "## We will be using mock llm call for rest of example to save on token\n",
350
+ "## Mock LLM call is a function that simulates an LLM API call with random delays (controlled by sleep_time) and occasional failures (controlled by fail_rate)\n",
351
+ "await mock_llm_call(city_name=\"New York\", num_records_per_city=3)"
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": 9,
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": [
360
+ "@data_factory(max_concurrency=100)\n",
361
+ "async def input_format_mock_llm(city_name: str, num_records_per_city: int):\n",
362
+ " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.01)"
363
+ ]
364
+ },
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": 10,
368
+ "metadata": {},
369
+ "outputs": [
370
+ {
371
+ "name": "stdout",
372
+ "output_type": "stream",
373
+ "text": [
374
+ "\u001b[32m2025-04-25 10:16:49\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 05c84608-fec3-4010-8876-e59eed12bb6a\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
375
+ "\u001b[32m2025-04-25 10:16:49\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
376
+ "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
377
+ "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
378
+ ]
379
+ }
380
+ ],
381
+ "source": [
382
+ "# Format 1: Multiple lists that get zipped together\n",
383
+ "input_format_data1 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=[2, 1, 1, 1, 1])"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 11,
389
+ "metadata": {},
390
+ "outputs": [
391
+ {
392
+ "name": "stdout",
393
+ "output_type": "stream",
394
+ "text": [
395
+ "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: fedb98e5-c408-4bc8-9479-6087f4a298b7\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
396
+ "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
397
+ "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
398
+ "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
399
+ ]
400
+ }
401
+ ],
402
+ "source": [
403
+ "# Format 2: List + single value (single value gets broadcasted)\n",
404
+ "input_format_data2 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=1)"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "code",
409
+ "execution_count": 12,
410
+ "metadata": {},
411
+ "outputs": [
412
+ {
413
+ "name": "stdout",
414
+ "output_type": "stream",
415
+ "text": [
416
+ "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 2f5cb7cc-83c9-4b7e-9ebb-386cd66bdd42\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
417
+ "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
418
+ "\u001b[32m2025-04-25 10:16:52\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
419
+ "\u001b[32m2025-04-25 10:16:52\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
420
+ ]
421
+ }
422
+ ],
423
+ "source": [
424
+ "# Format 3: Special 'data' parameter\n",
425
+ "# 'data' is a reserved keyword expecting list(dict) or tuple(dict)\n",
426
+ "# Makes integration with various data sources easier\n",
427
+ "input_format_data3 = input_format_mock_llm.run(data=[{\"city_name\": \"New York\", \"num_records_per_city\": 2}, {\"city_name\": \"London\", \"num_records_per_city\": 1}, {\"city_name\": \"Tokyo\", \"num_records_per_city\": 1}, {\"city_name\": \"Paris\", \"num_records_per_city\": 1}, {\"city_name\": \"Sydney\", \"num_records_per_city\": 1}])"
428
+ ]
429
+ },
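+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Because 'data' accepts a list of dicts, tabular sources plug in directly. A minimal sketch (not from the original notebook; the DataFrame contents are made up for illustration) using Pandas, where each row becomes one input dict; a Hugging Face `Dataset` can be converted the same way, e.g. via `to_pandas()`:\n",
+ "\n",
+ "```python\n",
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.DataFrame(\n",
+ "    {\"city_name\": [\"New York\", \"London\"], \"num_records_per_city\": [2, 1]}\n",
+ ")\n",
+ "\n",
+ "# to_dict(orient=\"records\") yields a list of dicts, exactly what `data=` expects.\n",
+ "# input_format_mock_llm.run(data=df.to_dict(orient=\"records\"))\n",
+ "```"
+ ]
+ },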
430
+ {
431
+ "cell_type": "markdown",
432
+ "metadata": {},
433
+ "source": [
434
+ "#### 4. Resilient error retry\n",
435
+ "Data Factory automatically handles errors and retries, making your pipelines robust.\n",
436
+ "\n",
437
+ "Let's demonstrate with a high failure rate example."
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": 13,
443
+ "metadata": {},
444
+ "outputs": [
445
+ {
446
+ "name": "stdout",
447
+ "output_type": "stream",
448
+ "text": [
449
+ "\u001b[32m2025-04-25 10:16:56\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 38c50ab6-f24b-4cba-a2c5-070130ab420e\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
450
+ "\u001b[32m2025-04-25 10:16:56\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/25\u001b[0m | \u001b[33mRunning: 25\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
451
+ "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 16/25\u001b[0m | \u001b[33mRunning: 9\u001b[0m | \u001b[36mAttempted: 16\u001b[0m (\u001b[32mCompleted: 16\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
452
+ "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Tokyo\u001b[0m\n",
453
+ "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: London\u001b[0m\n",
454
+ "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
455
+ "\u001b[32m2025-04-25 10:17:02\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 23/25\u001b[0m | \u001b[33mRunning: 2\u001b[0m | \u001b[36mAttempted: 26\u001b[0m (\u001b[32mCompleted: 23\u001b[0m, \u001b[31mFailed: 3\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
456
+ "\u001b[32m2025-04-25 10:17:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
457
+ "\u001b[32m2025-04-25 10:17:05\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
458
+ "\u001b[32m2025-04-25 10:17:05\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 25/25\u001b[0m | \u001b[33mAttempted: 29\u001b[0m (Failed: 4, Filtered: 0, Duplicate: 0)\u001b[0m\n",
459
+ "\n",
460
+ "Successfully completed 25 out of 25 tasks\n",
461
+ "Data Factory automatically handled the failures and continued processing\n",
462
+ "The results only include successful tasks\n"
463
+ ]
464
+ }
465
+ ],
466
+ "source": [
467
+ "@data_factory(max_concurrency=100)\n",
468
+ "async def high_error_rate_mock_llm(city_name: str, num_records_per_city: int):\n",
469
+ " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3) # Hardcode to 30% chance of failure\n",
470
+ "\n",
471
+ "# Process all cities - some will fail, but data_factory keeps going\n",
472
+ "cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 5 # 25 cities\n",
473
+ "high_error_rate_mock_lllm_data = high_error_rate_mock_llm.run(city_name=cities, num_records_per_city=1)\n",
474
+ "\n",
475
+ "print(f\"\\nSuccessfully completed {len(high_error_rate_mock_lllm_data)} out of {len(cities)} tasks\")\n",
476
+ "print(\"Data Factory automatically handled the failures and continued processing\")\n",
477
+ "print(\"The results only include successful tasks\")"
478
+ ]
479
+ },
480
+ {
481
+ "cell_type": "markdown",
482
+ "metadata": {},
483
+ "source": [
484
+ "#### 5. Resume\n",
485
+ "\n",
486
+ "This is essential for long-running jobs with thousands of tasks.\n",
487
+ "\n",
488
+ "If a job is interrupted, you can pick up where you left off using one of two resume methods:\n",
489
+ "\n",
490
+ "\n",
491
+ "1. **Same Session Resume**: If you're still in the same session where the job was interrupted, simply call - Same instance with .resume()\n",
492
+ "\n",
493
+ "2. **Cross-Session Resume**: If you've closed your notebook or lost your session, you can resume using the job ID:\n",
494
+ " ```python\n",
495
+ " from starfish import DataFactory\n",
496
+ " # Resume using the master job ID from a previous run\n",
497
+ " data_factory = DataFactory.resume_from_checkpoint(job_id=\"your_job_id\")\n",
498
+ " ```\n",
499
+ "\n",
500
+ "The key difference:\n",
501
+ "- `resume()` uses the same DataFactory instance you defined\n",
502
+ "- `resume_from_checkpoint()` reconstructs your DataFactory from persistent storage where tasks and progress are saved\n",
503
+ "\n",
504
+ "> **Note**: Google Colab users may experience issues with `resume_from_checkpoint()` due to how Colab works\n",
505
+ "\n",
506
+ "We're simulating an interruption here. In a real scenario, this might happen if your notebook errors out, is manually interrupted with a keyboard command, encounters API rate limits, or experiences any other issues that halt execution."
507
+ ]
508
+ },
509
+ {
510
+ "cell_type": "code",
511
+ "execution_count": 14,
512
+ "metadata": {},
513
+ "outputs": [
514
+ {
515
+ "name": "stdout",
516
+ "output_type": "stream",
517
+ "text": [
518
+ "\u001b[32m2025-04-25 10:17:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: b2a400b3-32e7-45ee-b8e8-c2bc7afe9f11\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
519
+ "\u001b[32m2025-04-25 10:17:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
520
+ "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 17/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 17\u001b[0m (\u001b[32mCompleted: 17\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
521
+ "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
522
+ "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError occurred: KeyboardInterrupt\u001b[0m\n",
523
+ "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\u001b[0m\n",
524
+ "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 20/100\u001b[0m | \u001b[33mAttempted: 20\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
525
+ ]
526
+ }
527
+ ],
528
+ "source": [
529
+ "@data_factory(max_concurrency=10)\n",
530
+ "async def re_run_mock_llm(city_name: str, num_records_per_city: int):\n",
531
+ " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n",
532
+ "\n",
533
+ "cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 20 # 100 cities\n",
534
+ "re_run_mock_llm_data_1 = re_run_mock_llm.run(city_name=cities, num_records_per_city=1)"
535
+ ]
536
+ },
537
+ {
538
+ "cell_type": "code",
539
+ "execution_count": 15,
540
+ "metadata": {},
541
+ "outputs": [
542
+ {
543
+ "name": "stdout",
544
+ "output_type": "stream",
545
+ "text": [
546
+ "When a job is interrupted, you'll see a message like:\n",
547
+ "[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\n",
548
+ "\n",
549
+ "To resume an interrupted job, simply call:\n",
550
+ "interrupted_job_mock_llm.resume()\n",
551
+ "\n",
552
+ "For this example we have 20/100 data generated and not finished yet!\n"
553
+ ]
554
+ }
555
+ ],
556
+ "source": [
557
+ "print(\"When a job is interrupted, you'll see a message like:\")\n",
558
+ "print(\"[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\")\n",
559
+ "\n",
560
+ "print(\"\\nTo resume an interrupted job, simply call:\")\n",
561
+ "print(\"interrupted_job_mock_llm.resume()\")\n",
562
+ "print('')\n",
563
+ "print(f\"For this example we have {len(re_run_mock_llm_data_1)}/{len(cities)} data generated and not finished yet!\")"
564
+ ]
565
+ },
566
+ {
567
+ "cell_type": "code",
568
+ "execution_count": 17,
569
+ "metadata": {},
570
+ "outputs": [
571
+ {
572
+ "name": "stdout",
573
+ "output_type": "stream",
574
+ "text": [
575
+ "\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB RESUME START]\u001b[0m \u001b[33mPICKING UP FROM WHERE THE JOB WAS LEFT OFF...\u001b[0m\n",
576
+ "\u001b[0m\n",
577
+ "\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[RESUME PROGRESS] STATUS AT THE TIME OF RESUME:\u001b[0m \u001b[32mCompleted: 20 / 100\u001b[0m | \u001b[31mFailed: 0\u001b[0m | \u001b[31mDuplicate: 0\u001b[0m | \u001b[33mFiltered: 0\u001b[0m\u001b[0m\n",
578
+ "\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 20/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 20\u001b[0m (\u001b[32mCompleted: 20\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
579
+ "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 32/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 32\u001b[0m (\u001b[32mCompleted: 32\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
580
+ "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n",
581
+ "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n",
582
+ "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n",
583
+ "\u001b[32m2025-04-25 10:18:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 56/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 59\u001b[0m (\u001b[32mCompleted: 56\u001b[0m, \u001b[31mFailed: 3\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
584
+ "\u001b[32m2025-04-25 10:18:08\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n",
585
+ "\u001b[32m2025-04-25 10:18:08\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: London\u001b[0m\n",
586
+ "\u001b[32m2025-04-25 10:18:09\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 69/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 74\u001b[0m (\u001b[32mCompleted: 69\u001b[0m, \u001b[31mFailed: 5\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
587
+ "\u001b[32m2025-04-25 10:18:09\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
588
+ "\u001b[32m2025-04-25 10:18:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 89/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 95\u001b[0m (\u001b[32mCompleted: 89\u001b[0m, \u001b[31mFailed: 6\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
589
+ "\u001b[32m2025-04-25 10:18:12\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n",
590
+ "\u001b[32m2025-04-25 10:18:13\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
591
+ "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
592
+ "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
593
+ "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 100/100\u001b[0m | \u001b[33mAttempted: 109\u001b[0m (Failed: 9, Filtered: 0, Duplicate: 0)\u001b[0m\n"
594
+ ]
595
+ }
596
+ ],
597
+ "source": [
598
+ "## Lets keep continue the rest of run by resume_from_checkpoint \n",
599
+ "re_run_mock_llm_data_2 = re_run_mock_llm.resume()"
600
+ ]
601
+ },
602
+ {
603
+ "cell_type": "code",
604
+ "execution_count": 18,
605
+ "metadata": {},
606
+ "outputs": [
607
+ {
608
+ "name": "stdout",
609
+ "output_type": "stream",
610
+ "text": [
611
+ "Now we still able to finished with what is left!! 100 data generated!\n"
612
+ ]
613
+ }
614
+ ],
615
+ "source": [
616
+ "print(f\"Now we still able to finished with what is left!! {len(re_run_mock_llm_data_2)} data generated!\")"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "markdown",
621
+ "metadata": {},
622
+ "source": [
623
+ "#### 6. Dry run\n",
624
+ "Before running a large job, you can do a \"dry run\" to test your pipeline. This only processes a single item and doesn't save state to the database."
625
+ ]
626
+ },
627
+ {
628
+ "cell_type": "code",
629
+ "execution_count": 19,
630
+ "metadata": {},
631
+ "outputs": [
632
+ {
633
+ "name": "stdout",
634
+ "output_type": "stream",
635
+ "text": [
636
+ "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: None\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
637
+ "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
638
+ "\u001b[32m2025-04-25 10:18:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
639
+ "\u001b[32m2025-04-25 10:18:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/0\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
640
+ ]
641
+ }
642
+ ],
643
+ "source": [
644
+ "@data_factory(max_concurrency=10)\n",
645
+ "async def dry_run_mock_llm(city_name: str, num_records_per_city: int):\n",
646
+ " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n",
647
+ "\n",
648
+ "dry_run_mock_llm_data = dry_run_mock_llm.dry_run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"]*20, num_records_per_city=1)"
649
+ ]
650
+ },
651
+ {
652
+ "cell_type": "markdown",
653
+ "metadata": {},
654
+ "source": [
655
+ "#### 8. Advanced Usage\n",
656
+ "Data Factory offers more advanced capabilities for complete pipeline customization, including hooks that execute at key stages and shareable state to coordinate between tasks. These powerful features enable complex workflows and fine-grained control. Our dedicated examples for advanced data_factory usage will be coming soon!"
657
+ ]
658
+ }
659
+ ],
660
+ "metadata": {
661
+ "kernelspec": {
662
+ "display_name": ".venv",
663
+ "language": "python",
664
+ "name": "python3"
665
+ },
666
+ "language_info": {
667
+ "codemirror_mode": {
668
+ "name": "ipython",
669
+ "version": 3
670
+ },
671
+ "file_extension": ".py",
672
+ "mimetype": "text/x-python",
673
+ "name": "python",
674
+ "nbconvert_exporter": "python",
675
+ "pygments_lexer": "ipython3",
676
+ "version": "3.11.4"
677
+ }
678
+ },
679
+ "nbformat": 4,
680
+ "nbformat_minor": 2
681
+ }
examples/data_factory_release_check.ipynb ADDED
@@ -0,0 +1,494 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Google Colab Version: [Open this notebook in Google Colab](https://colab.research.google.com/github/starfishdata/starfish/blob/main/examples/data_factory.ipynb)"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "#### Dependencies "
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 23,
20
+ "metadata": {},
21
+ "outputs": [
22
+ {
23
+ "name": "stdout",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "Looking in indexes: https://test.pypi.org/simple/, https://pypi.org/simple\n",
27
+ "Requirement already satisfied: starfish-core in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (0.1.2)\n",
28
+ "Requirement already satisfied: aiofiles<25.0.0,>=24.1.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (24.1.0)\n",
29
+ "Requirement already satisfied: aiosqlite<0.22.0,>=0.21.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (0.21.0)\n",
30
+ "Requirement already satisfied: cachetools<6.0.0,>=5.5.2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (5.5.2)\n",
31
+ "Requirement already satisfied: cloudpickle<3.0.0,>=2.2.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (2.2.1)\n",
32
+ "Requirement already satisfied: cryptography>=44.0.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (44.0.3)\n",
33
+ "Requirement already satisfied: docstring_parser<0.17.0,>=0.16.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (0.16)\n",
34
+ "Requirement already satisfied: litellm<2.0.0,>=1.65.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (1.69.3)\n",
35
+ "Requirement already satisfied: loguru<0.8.0,>=0.7.3 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (0.7.3)\n",
36
+ "Requirement already satisfied: mcp<2.0.0,>=1.8.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (1.9.0)\n",
37
+ "Requirement already satisfied: nest_asyncio<2.0.0,>=1.6.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (1.6.0)\n",
38
+ "Requirement already satisfied: ollama<0.5.0,>=0.4.7 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (0.4.8)\n",
39
+ "Requirement already satisfied: posthog<4.0.0,>=3.11.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (3.25.0)\n",
40
+ "Requirement already satisfied: psutil<8.0.0,>=7.0.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (7.0.0)\n",
41
+ "Requirement already satisfied: python-dotenv<2.0.0,>=1.1.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (1.1.0)\n",
42
+ "Requirement already satisfied: typing-extensions<5.0.0,>=4.0.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from starfish-core) (4.13.2)\n",
43
+ "Requirement already satisfied: cffi>=1.12 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from cryptography>=44.0.1->starfish-core) (1.17.1)\n",
44
+ "Requirement already satisfied: aiohttp in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.11.18)\n",
45
+ "Requirement already satisfied: click in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.2.0)\n",
46
+ "Requirement already satisfied: httpx>=0.23.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.28.1)\n",
47
+ "Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.7.0)\n",
48
+ "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.1.6)\n",
49
+ "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (4.23.0)\n",
50
+ "Requirement already satisfied: openai<1.76.0,>=1.68.2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (1.75.0)\n",
51
+ "Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (2.11.4)\n",
52
+ "Requirement already satisfied: tiktoken>=0.7.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n",
53
+ "Requirement already satisfied: tokenizers in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.21.1)\n",
54
+ "Requirement already satisfied: anyio>=4.5 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (4.9.0)\n",
55
+ "Requirement already satisfied: httpx-sse>=0.4 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (0.4.0)\n",
56
+ "Requirement already satisfied: pydantic-settings>=2.5.2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (2.9.1)\n",
57
+ "Requirement already satisfied: python-multipart>=0.0.9 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (0.0.20)\n",
58
+ "Requirement already satisfied: sse-starlette>=1.6.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (2.3.5)\n",
59
+ "Requirement already satisfied: starlette>=0.27 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (0.46.2)\n",
60
+ "Requirement already satisfied: uvicorn>=0.23.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from mcp<2.0.0,>=1.8.1->starfish-core) (0.34.2)\n",
61
+ "Requirement already satisfied: requests<3.0,>=2.7 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from posthog<4.0.0,>=3.11.0->starfish-core) (2.32.3)\n",
62
+ "Requirement already satisfied: six>=1.5 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from posthog<4.0.0,>=3.11.0->starfish-core) (1.17.0)\n",
63
+ "Requirement already satisfied: monotonic>=1.5 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from posthog<4.0.0,>=3.11.0->starfish-core) (1.6)\n",
64
+ "Requirement already satisfied: backoff>=1.10.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from posthog<4.0.0,>=3.11.0->starfish-core) (2.2.1)\n",
65
+ "Requirement already satisfied: python-dateutil>2.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from posthog<4.0.0,>=3.11.0->starfish-core) (2.9.0.post0)\n",
66
+ "Requirement already satisfied: distro>=1.5.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from posthog<4.0.0,>=3.11.0->starfish-core) (1.9.0)\n",
67
+ "Requirement already satisfied: idna>=2.8 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from anyio>=4.5->mcp<2.0.0,>=1.8.1->starfish-core) (3.10)\n",
68
+ "Requirement already satisfied: sniffio>=1.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from anyio>=4.5->mcp<2.0.0,>=1.8.1->starfish-core) (1.3.1)\n",
69
+ "Requirement already satisfied: pycparser in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from cffi>=1.12->cryptography>=44.0.1->starfish-core) (2.22)\n",
70
+ "Requirement already satisfied: certifi in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (2025.4.26)\n",
71
+ "Requirement already satisfied: httpcore==1.* in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (1.0.9)\n",
72
+ "Requirement already satisfied: h11>=0.16 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.16.0)\n",
73
+ "Requirement already satisfied: zipp>=3.20 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from importlib-metadata>=6.8.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.21.0)\n",
74
+ "Requirement already satisfied: MarkupSafe>=2.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from jinja2<4.0.0,>=3.1.2->litellm<2.0.0,>=1.65.1->starfish-core) (3.0.2)\n",
75
+ "Requirement already satisfied: attrs>=22.2.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (25.3.0)\n",
76
+ "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (2025.4.1)\n",
77
+ "Requirement already satisfied: referencing>=0.28.4 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.36.2)\n",
78
+ "Requirement already satisfied: rpds-py>=0.7.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.25.0)\n",
79
+ "Requirement already satisfied: jiter<1,>=0.4.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from openai<1.76.0,>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n",
80
+ "Requirement already satisfied: tqdm>4 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from openai<1.76.0,>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (4.67.1)\n",
81
+ "Requirement already satisfied: annotated-types>=0.6.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.7.0)\n",
82
+ "Requirement already satisfied: pydantic-core==2.33.2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.33.2)\n",
83
+ "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.4.0)\n",
84
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from requests<3.0,>=2.7->posthog<4.0.0,>=3.11.0->starfish-core) (3.4.2)\n",
85
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from requests<3.0,>=2.7->posthog<4.0.0,>=3.11.0->starfish-core) (2.4.0)\n",
86
+ "Requirement already satisfied: regex>=2022.1.18 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2024.11.6)\n",
87
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (2.6.1)\n",
88
+ "Requirement already satisfied: aiosignal>=1.1.2 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.3.2)\n",
89
+ "Requirement already satisfied: frozenlist>=1.1.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.6.0)\n",
90
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (6.4.3)\n",
91
+ "Requirement already satisfied: propcache>=0.2.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (0.3.1)\n",
92
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.20.0)\n",
93
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (0.31.2)\n",
94
+ "Requirement already satisfied: filelock in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (3.18.0)\n",
95
+ "Requirement already satisfied: fsspec>=2023.5.0 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (2025.3.2)\n",
96
+ "Requirement already satisfied: packaging>=20.9 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (25.0)\n",
97
+ "Requirement already satisfied: pyyaml>=5.1 in /Users/john/Documents/projects/aa/python/starfish/starfish/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (6.0.2)\n",
98
+ "\n",
99
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n",
100
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
101
+ "Note: you may need to restart the kernel to use updated packages.\n"
102
+ ]
103
+ }
104
+ ],
105
+ "source": [
106
+ "#%pip install starfish-core\n",
107
+ "%pip install --index-url https://test.pypi.org/simple/ \\\n",
108
+ " --extra-index-url https://pypi.org/simple \\\n",
109
+ " starfish-core"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 24,
115
+ "metadata": {},
116
+ "outputs": [
117
+ {
118
+ "name": "stdout",
119
+ "output_type": "stream",
120
+ "text": [
121
+ "\u001b[32m2025-05-23 22:50:10\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mFailed to load environment variables from /Users/john/Documents/projects/aa/python/starfish/starfish/.env\u001b[0m\n"
122
+ ]
123
+ }
124
+ ],
125
+ "source": [
126
+ "## Fix for Jupyter Notebook only — do NOT use in production\n",
127
+ "## Enables async code execution in notebooks, but may cause issues with sync/async issues\n",
128
+ "## For production, please run in standard .py files without this workaround\n",
129
+ "## See: https://github.com/erdewit/nest_asyncio for more details\n",
130
+ "import nest_asyncio\n",
131
+ "nest_asyncio.apply()\n",
132
+ "\n",
133
+ "from starfish import StructuredLLM, data_factory\n",
134
+ "from starfish.llm.utils import merge_structured_outputs\n",
135
+ "\n",
136
+ "from starfish.common.env_loader import load_env_file ## Load environment variables from .env file\n",
137
+ "load_env_file()"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": 25,
143
+ "metadata": {},
144
+ "outputs": [],
145
+ "source": [
146
+ "## Helper function mock llm call\n",
147
+ "# When developing data pipelines with LLMs, making thousands of real API calls\n",
148
+ "# can be expensive. Using mock LLM calls lets you test your pipeline's reliability,\n",
149
+ "# failure handling, and recovery without spending money on API calls.\n",
150
+ "from starfish.data_factory.utils.mock import mock_llm_call"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "markdown",
155
+ "metadata": {},
156
+ "source": [
157
+ "#### 3. Working with Different Input Formats\n",
158
+ "\n",
159
+ "\n",
160
+ "Data Factory is flexible with how you provide inputs. Let's demonstrate different ways to pass parameters to data_factory functions.\n",
161
+ "\n",
162
+ "'data' is a reserved keyword expecting list(dict) or tuple(dict) - this design make it super easy to pass large data and support HuggingFace and Pandas dataframe very easily"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": 26,
168
+ "metadata": {},
169
+ "outputs": [
170
+ {
171
+ "data": {
172
+ "text/plain": [
173
+ "[{'answer': 'New York_3'}, {'answer': 'New York_1'}, {'answer': 'New York_5'}]"
174
+ ]
175
+ },
176
+ "execution_count": 26,
177
+ "metadata": {},
178
+ "output_type": "execute_result"
179
+ }
180
+ ],
181
+ "source": [
182
+ "## We will be using mock llm call for rest of example to save on token\n",
183
+ "## Mock LLM call is a function that simulates an LLM API call with random delays (controlled by sleep_time) and occasional failures (controlled by fail_rate)\n",
184
+ "await mock_llm_call(city_name=\"New York\", num_records_per_city=3)"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": 27,
190
+ "metadata": {},
191
+ "outputs": [],
192
+ "source": [
193
+ "@data_factory(max_concurrency=100)\n",
194
+ "async def input_format_mock_llm(city_name: str, num_records_per_city: int):\n",
195
+ " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.01)"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": 28,
201
+ "metadata": {},
202
+ "outputs": [
203
+ {
204
+ "name": "stdout",
205
+ "output_type": "stream",
206
+ "text": [
207
+ "\u001b[32m2025-05-23 22:50:10\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 4da82fc7-4112-4e05-b58c-53cf470747ad\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
208
+ "\u001b[32m2025-05-23 22:50:10\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
209
+ "\u001b[32m2025-05-23 22:50:11\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n"
210
+ ]
211
+ }
212
+ ],
213
+ "source": [
214
+ "# Format 1: Multiple lists that get zipped together\n",
215
+ "input_format_data1 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=[2, 1, 1, 1, 1])"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": 29,
221
+ "metadata": {},
222
+ "outputs": [
223
+ {
224
+ "name": "stdout",
225
+ "output_type": "stream",
226
+ "text": [
227
+ "\u001b[32m2025-05-23 22:50:11\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 73973449-6069-485e-ac8c-b1b3a6b3f1a4\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
228
+ "\u001b[32m2025-05-23 22:50:11\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
229
+ "\u001b[32m2025-05-23 22:50:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n"
230
+ ]
231
+ }
232
+ ],
233
+ "source": [
234
+ "# Format 2: List + single value (single value gets broadcasted)\n",
235
+ "input_format_data2 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=1)"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": 30,
241
+ "metadata": {},
242
+ "outputs": [
243
+ {
244
+ "name": "stdout",
245
+ "output_type": "stream",
246
+ "text": [
247
+ "\u001b[32m2025-05-23 22:50:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: aa9954f9-fc18-4b42-959e-fb2a897987c7\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
248
+ "\u001b[32m2025-05-23 22:50:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
249
+ "\u001b[32m2025-05-23 22:50:13\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n"
250
+ ]
251
+ }
252
+ ],
253
+ "source": [
254
+ "# Format 3: Special 'data' parameter\n",
255
+ "# 'data' is a reserved keyword expecting list(dict) or tuple(dict)\n",
256
+ "# Makes integration with various data sources easier\n",
257
+ "input_format_data3 = input_format_mock_llm.run(data=[{\"city_name\": \"New York\", \"num_records_per_city\": 2}, {\"city_name\": \"London\", \"num_records_per_city\": 1}, {\"city_name\": \"Tokyo\", \"num_records_per_city\": 1}, {\"city_name\": \"Paris\", \"num_records_per_city\": 1}, {\"city_name\": \"Sydney\", \"num_records_per_city\": 1}])"
258
+ ]
259
+ },
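+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal sketch (illustration only, not executed in this notebook) of how Format 3 pairs with Pandas: a DataFrame whose columns match the function arguments converts straight into the list-of-dicts shape that 'data' expects.\n",
+ "\n",
+ "```python\n",
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.DataFrame({\"city_name\": [\"New York\", \"London\"], \"num_records_per_city\": [2, 1]})\n",
+ "rows = df.to_dict(orient=\"records\")  # one dict per DataFrame row\n",
+ "input_format_data_df = input_format_mock_llm.run(data=rows)\n",
+ "```"
+ ]
+ },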
260
+ {
261
+ "cell_type": "markdown",
262
+ "metadata": {},
263
+ "source": [
264
+ "#### 4. Resilient error retry\n",
265
+ "Data Factory automatically handles errors and retries, making your pipelines robust.\n",
266
+ "\n",
267
+ "Let's demonstrate with a high failure rate example."
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": 31,
273
+ "metadata": {},
274
+ "outputs": [
275
+ {
276
+ "name": "stdout",
277
+ "output_type": "stream",
278
+ "text": [
279
+ "\u001b[32m2025-05-23 22:50:13\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 730b766d-3c23-419a-a3dd-271d683818b1\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
280
+ "\u001b[32m2025-05-23 22:50:13\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/25\u001b[0m | \u001b[33mRunning: 25\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
281
+ "\u001b[32m2025-05-23 22:50:15\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Tokyo\u001b[0m\n",
282
+ "\u001b[32m2025-05-23 22:50:15\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
283
+ "\u001b[32m2025-05-23 22:50:16\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 23/25\u001b[0m | \u001b[33mRunning: 0\u001b[0m | \u001b[36mAttempted: 25\u001b[0m (\u001b[32mCompleted: 23\u001b[0m, \u001b[31mFailed: 2\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
284
+ "\u001b[32m2025-05-23 22:50:19\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 25/25\u001b[0m | \u001b[33mAttempted: 27\u001b[0m (Failed: 2, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
285
+ "\n",
286
+ "Successfully completed 25 out of 25 tasks\n",
287
+ "Data Factory automatically handled the failures and continued processing\n",
288
+ "The results only include successful tasks\n"
289
+ ]
290
+ }
291
+ ],
292
+ "source": [
293
+ "@data_factory(max_concurrency=100)\n",
294
+ "async def high_error_rate_mock_llm(city_name: str, num_records_per_city: int):\n",
295
+ " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3) # Hardcode to 30% chance of failure\n",
296
+ "\n",
297
+ "# Process all cities - some will fail, but data_factory keeps going\n",
298
+ "cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 5 # 25 cities\n",
299
+ "high_error_rate_mock_lllm_data = high_error_rate_mock_llm.run(city_name=cities, num_records_per_city=1)\n",
300
+ "\n",
301
+ "print(f\"\\nSuccessfully completed {len(high_error_rate_mock_lllm_data)} out of {len(cities)} tasks\")\n",
302
+ "print(\"Data Factory automatically handled the failures and continued processing\")\n",
303
+ "print(\"The results only include successful tasks\")"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "markdown",
308
+ "metadata": {},
309
+ "source": [
310
+ "#### 5. Resume\n",
311
+ "\n",
312
+ "This is essential for long-running jobs with thousands of tasks.\n",
313
+ "\n",
314
+ "If a job is interrupted, you can pick up where you left off using one of two resume methods:\n",
315
+ "\n",
316
+ "\n",
317
+ "1. **Same Session Resume**: If you're still in the same session where the job was interrupted, simply call - Same instance with .resume()\n",
318
+ "\n",
319
+ "2. **Cross-Session Resume**: If you've closed your notebook or lost your session, you can resume using the job ID:\n",
320
+ " ```python\n",
321
+ " from starfish import DataFactory\n",
322
+ " # Resume using the master job ID from a previous run\n",
323
+ " data_factory = DataFactory.resume_from_checkpoint(job_id=\"your_job_id\")\n",
324
+ " ```\n",
325
+ "\n",
326
+ "The key difference:\n",
327
+ "- `resume()` uses the same DataFactory instance you defined\n",
328
+ "- `resume_from_checkpoint()` reconstructs your DataFactory from persistent storage where tasks and progress are saved\n",
329
+ "\n",
330
+ "> **Note**: Google Colab users may experience issues with `resume_from_checkpoint()` due to how Colab works\n",
331
+ "\n",
332
+ "We're simulating an interruption here. In a real scenario, this might happen if your notebook errors out, is manually interrupted with a keyboard command, encounters API rate limits, or experiences any other issues that halt execution."
333
+ ]
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "execution_count": 32,
338
+ "metadata": {},
339
+ "outputs": [
340
+ {
341
+ "name": "stdout",
342
+ "output_type": "stream",
343
+ "text": [
344
+ "\u001b[32m2025-05-23 22:50:19\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 6829de29-0b83-4a64-835b-cc79cbad5e3a\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
345
+ "\u001b[32m2025-05-23 22:50:19\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
346
+ "\u001b[32m2025-05-23 22:50:21\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n",
347
+ "\u001b[32m2025-05-23 22:50:21\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n",
348
+ "\u001b[32m2025-05-23 22:50:21\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
349
+ "\u001b[32m2025-05-23 22:50:21\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mconsecutive_not_completed: in 3 times, stopping this job; please adjust factory config and input data then resume_from_checkpoint(6829de29-0b83-4a64-835b-cc79cbad5e3a)\u001b[0m\n",
350
+ "\u001b[32m2025-05-23 22:50:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 17/100\u001b[0m | \u001b[33mAttempted: 20\u001b[0m (Failed: 3, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n"
351
+ ]
352
+ }
353
+ ],
354
+ "source": [
355
+ "@data_factory(max_concurrency=10)\n",
356
+ "async def re_run_mock_llm(city_name: str, num_records_per_city: int):\n",
357
+ " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n",
358
+ "\n",
359
+ "cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 20 # 100 cities\n",
360
+ "re_run_mock_llm_data_1 = re_run_mock_llm.run(city_name=cities, num_records_per_city=1)"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": 33,
366
+ "metadata": {},
367
+ "outputs": [
368
+ {
369
+ "name": "stdout",
370
+ "output_type": "stream",
371
+ "text": [
372
+ "When a job is interrupted, you'll see a message like:\n",
373
+ "[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\n",
374
+ "\n",
375
+ "To resume an interrupted job, simply call:\n",
376
+ "interrupted_job_mock_llm.resume()\n",
377
+ "\n",
378
+ "For this example we have 17/100 data generated and not finished yet!\n"
379
+ ]
380
+ }
381
+ ],
382
+ "source": [
383
+ "print(\"When a job is interrupted, you'll see a message like:\")\n",
384
+ "print(\"[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\")\n",
385
+ "\n",
386
+ "print(\"\\nTo resume an interrupted job, simply call:\")\n",
387
+ "print(\"interrupted_job_mock_llm.resume()\")\n",
388
+ "print('')\n",
389
+ "print(f\"For this example we have {len(re_run_mock_llm_data_1)}/{len(cities)} data generated and not finished yet!\")"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "code",
394
+ "execution_count": 34,
395
+ "metadata": {},
396
+ "outputs": [
397
+ {
398
+ "name": "stdout",
399
+ "output_type": "stream",
400
+ "text": [
401
+ "\u001b[32m2025-05-23 22:50:22\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB RESUME START]\u001b[0m \u001b[33mPICKING UP FROM WHERE THE JOB WAS LEFT OFF...\u001b[0m\n",
402
+ "\u001b[0m\n",
403
+ "\u001b[32m2025-05-23 22:50:22\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[RESUME PROGRESS] STATUS AT THE TIME OF RESUME:\u001b[0m \u001b[32mCompleted: 17 / 100\u001b[0m | \u001b[31mFailed: 3\u001b[0m | \u001b[31mDuplicate: 0\u001b[0m | \u001b[33mFiltered: 0\u001b[0m\u001b[0m\n",
404
+ "\u001b[32m2025-05-23 22:50:22\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 17/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 20\u001b[0m (\u001b[32mCompleted: 17\u001b[0m, \u001b[31mFailed: 3\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
405
+ "\u001b[32m2025-05-23 22:50:24\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n",
406
+ "\u001b[32m2025-05-23 22:50:24\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mconsecutive_not_completed: in 3 times, stopping this job; please adjust factory config and input data then resume_from_checkpoint(6829de29-0b83-4a64-835b-cc79cbad5e3a)\u001b[0m\n",
407
+ "\u001b[32m2025-05-23 22:50:24\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 30/100\u001b[0m | \u001b[33mAttempted: 34\u001b[0m (Failed: 4, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n"
408
+ ]
409
+ }
410
+ ],
411
+ "source": [
412
+ "## Lets keep continue the rest of run by resume_from_checkpoint \n",
413
+ "re_run_mock_llm_data_2 = re_run_mock_llm.resume()"
414
+ ]
415
+ },
416
+ {
417
+ "cell_type": "code",
418
+ "execution_count": 35,
419
+ "metadata": {},
420
+ "outputs": [
421
+ {
422
+ "name": "stdout",
423
+ "output_type": "stream",
424
+ "text": [
425
+ "Now we still able to finished with what is left!! 30 data generated!\n"
426
+ ]
427
+ }
428
+ ],
429
+ "source": [
430
+ "print(f\"Now we still able to finished with what is left!! {len(re_run_mock_llm_data_2)} data generated!\")"
431
+ ]
432
+ },
433
+ {
434
+ "cell_type": "markdown",
435
+ "metadata": {},
436
+ "source": [
437
+ "#### 6. Dry run\n",
438
+ "Before running a large job, you can do a \"dry run\" to test your pipeline. This only processes a single item and doesn't save state to the database."
439
+ ]
440
+ },
441
+ {
442
+ "cell_type": "code",
443
+ "execution_count": 36,
444
+ "metadata": {},
445
+ "outputs": [
446
+ {
447
+ "name": "stdout",
448
+ "output_type": "stream",
449
+ "text": [
450
+ "\u001b[32m2025-05-23 22:50:24\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: None\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
451
+ "\u001b[32m2025-05-23 22:50:24\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
452
+ "\u001b[32m2025-05-23 22:50:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/0\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n"
453
+ ]
454
+ }
455
+ ],
456
+ "source": [
457
+ "@data_factory(max_concurrency=10)\n",
458
+ "async def dry_run_mock_llm(city_name: str, num_records_per_city: int):\n",
459
+ " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n",
460
+ "\n",
461
+ "dry_run_mock_llm_data = dry_run_mock_llm.dry_run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"]*20, num_records_per_city=1)"
462
+ ]
463
+ },
464
+ {
465
+ "cell_type": "markdown",
466
+ "metadata": {},
467
+ "source": [
468
+ "#### 8. Advanced Usage\n",
469
+ "Data Factory offers more advanced capabilities for complete pipeline customization, including hooks that execute at key stages and shareable state to coordinate between tasks. These powerful features enable complex workflows and fine-grained control. Our dedicated examples for advanced data_factory usage will be coming soon!"
470
+ ]
471
+ }
472
+ ],
473
+ "metadata": {
474
+ "kernelspec": {
475
+ "display_name": ".venv",
476
+ "language": "python",
477
+ "name": "python3"
478
+ },
479
+ "language_info": {
480
+ "codemirror_mode": {
481
+ "name": "ipython",
482
+ "version": 3
483
+ },
484
+ "file_extension": ".py",
485
+ "mimetype": "text/x-python",
486
+ "name": "python",
487
+ "nbconvert_exporter": "python",
488
+ "pygments_lexer": "ipython3",
489
+ "version": "3.11.4"
490
+ }
491
+ },
492
+ "nbformat": 4,
493
+ "nbformat_minor": 2
494
+ }
examples/embedding_usage_example.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example: Using Starfish Embeddings for Data Generation
2
+
3
+ This example demonstrates how to use FAISS and SentenceTransformers
4
+ for embedding-enhanced data generation and deduplication.
5
+ """
6
+
7
+ import asyncio
8
+ import sys
9
+ import os
10
+
11
+ # Add the project root to the Python path
12
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
13
+
14
+ from starfish.embedding import EmbeddingManager, SimilarityChecker, DataDeduplicator
15
+ from starfish.data_gen_template.core import data_gen_template
16
+
17
+
18
+ async def basic_embedding_example():
19
+ """Basic example of using the embedding system."""
20
+ print("🔮 Basic Embedding Example")
21
+ print("=" * 50)
22
+
23
+ # Initialize embedding manager
24
+ embedding_manager = EmbeddingManager(model_name="all-MiniLM-L6-v2", similarity_threshold=0.85)
25
+
26
+ # Sample texts to embed
27
+ texts = [
28
+ "What is machine learning?",
29
+ "How does artificial intelligence work?",
30
+ "What are neural networks?",
31
+ "Explain deep learning concepts",
32
+ "What is supervised learning?",
33
+ "What is machine learning?", # Duplicate
34
+ "How do neural networks function?", # Similar to "What are neural networks?"
35
+ ]
36
+
37
+ print(f"📝 Processing {len(texts)} sample texts...")
38
+
39
+ # Add texts to the index
40
+ indices = embedding_manager.add_texts(texts)
41
+ print(f"✅ Added {len(indices)} texts to the embedding index")
42
+
43
+ # Search for similar texts
44
+ query = "Tell me about AI and ML"
45
+ similar_items = embedding_manager.search_similar(query, k=3)
46
+
47
+ print(f"\n🔍 Search results for: '{query}'")
48
+ for item in similar_items:
49
+ print(f" Similarity: {item['similarity']:.3f} | Text: {item['text']}")
50
+
51
+ # Find duplicates
52
+ duplicate_groups = embedding_manager.find_duplicates(texts)
53
+ print(f"\n🔄 Found {len(duplicate_groups)} groups of duplicates:")
54
+ for i, group in enumerate(duplicate_groups):
55
+ print(f" Group {i+1}: {[texts[idx] for idx in group]}")
56
+
57
+ print(f"\n📊 Index Stats: {embedding_manager.get_stats()}")
58
+
59
+
60
+ async def similarity_checker_example():
61
+ """Example of using the similarity checker."""
62
+ print("\n🎯 Similarity Checker Example")
63
+ print("=" * 50)
64
+
65
+ similarity_checker = SimilarityChecker(similarity_threshold=0.8)
66
+
67
+ # Sample data items
68
+ data_items = [
69
+ {"question": "What is Python?", "answer": "Python is a programming language"},
70
+ {"question": "How to learn coding?", "answer": "Start with basic concepts"},
71
+ {"question": "What is programming?", "answer": "Programming is writing code"},
72
+ {"question": "What is Python programming?", "answer": "Python is a popular language"}, # Similar to first
73
+ ]
74
+
75
+ print(f"📝 Analyzing {len(data_items)} data items...")
76
+
77
+ # Filter similar items
78
+ filtered_items, duplicate_groups = similarity_checker.filter_similar_items(data_items)
79
+ print(f"✅ Filtered to {len(filtered_items)} unique items")
80
+
81
+ # Check diversity metrics
82
+ diversity_metrics = similarity_checker.check_diversity_batch(data_items)
83
+ print(f"📈 Diversity Score: {diversity_metrics['diversity_score']:.3f}")
84
+ print(f"🔄 Average Similarity: {diversity_metrics['avg_similarity']:.3f}")
85
+
86
+ # Suggest diverse subset
87
+ diverse_subset = similarity_checker.suggest_diverse_subset(data_items, target_size=2)
88
+ print(f"\n🎲 Diverse subset (2 items):")
89
+ for item in diverse_subset:
90
+ print(f" Q: {item['question']}")
91
+
92
+
93
+ async def deduplicator_example():
94
+ """Example of using the data deduplicator."""
95
+ print("\n🔧 Data Deduplicator Example")
96
+ print("=" * 50)
97
+
98
+ deduplicator = DataDeduplicator(similarity_threshold=0.9)
99
+
100
+ # Sample dataset with duplicates
101
+ dataset = [
102
+ {"id": "1", "text": "Machine learning is a subset of AI", "quality_score": 0.8},
103
+ {"id": "2", "text": "Deep learning uses neural networks", "quality_score": 0.9},
104
+ {"id": "1", "text": "Machine learning is a subset of AI", "quality_score": 0.7}, # Exact duplicate
105
+ {"id": "3", "text": "ML is part of artificial intelligence", "quality_score": 0.95}, # Semantic duplicate
106
+ {"id": "4", "text": "Natural language processing handles text", "quality_score": 0.85},
107
+ ]
108
+
109
+ print(f"📝 Analyzing dataset with {len(dataset)} items...")
110
+
111
+ # Analyze duplicates without removing
112
+ analysis = deduplicator.analyze_duplicates(dataset)
113
+ print(f"🔍 Analysis Results:")
114
+ print(f" Exact duplicates: {analysis['exact_duplicates']['count']}")
115
+ print(f" Semantic duplicates: {analysis['semantic_duplicates']['count']}")
116
+ print(f" Diversity score: {analysis['diversity_metrics']['diversity_score']:.3f}")
117
+
118
+ # Perform comprehensive deduplication
119
+ clean_dataset, report = deduplicator.deduplicate_comprehensive(dataset)
120
+ print(f"\n✨ Deduplication Results:")
121
+ print(f" Original: {report['original_count']} items")
122
+ print(f" Final: {report['final_count']} items")
123
+ print(f" Reduction: {report['reduction_percentage']:.1f}%")
124
+
125
+ print("\n📋 Clean dataset:")
126
+ for item in clean_dataset:
127
+ print(f" ID: {item['id']} | Score: {item.get('quality_score', 'N/A')} | Text: {item['text'][:50]}...")
128
+
129
+
130
+ async def template_usage_example():
131
+ """Example of using the embedding-enhanced template."""
132
+ print("\n🚀 Embedding-Enhanced Template Example")
133
+ print("=" * 50)
134
+
135
+ try:
136
+ # Get the embedding template
137
+ print(data_gen_template.list())
138
+ template = data_gen_template.get("starfish/generate_with_embeddings")
139
+
140
+ # Configuration for generation
141
+ config = {
142
+ "num_records": 5, # Small number for demo
143
+ "user_instruction": "Generate educational Q&A about data science",
144
+ "topics": ["statistics", "data visualization", "machine learning"],
145
+ "generation_model_name": "openai/gpt-4o-mini",
146
+ "embedding_config": {
147
+ "model_name": "all-MiniLM-L6-v2",
148
+ "similarity_threshold": 0.8,
149
+ "enable_deduplication": True,
150
+ "enable_diversity_check": True,
151
+ "min_diversity_score": 0.2,
152
+ },
153
+ }
154
+
155
+ print("⚙️ Generating diverse dataset with embedding quality control...")
156
+ results = await template.run(**config)
157
+
158
+ print(f"\n✅ Generated {len(results)} high-quality items:")
159
+ for i, item in enumerate(results[:3]): # Show first 3
160
+ print(f"\n Item {i+1}:")
161
+ print(f" Q: {item.get('question', 'N/A')}")
162
+ print(f" A: {item.get('answer', 'N/A')[:100]}...")
163
+ if "_metadata" in item:
164
+ print(f" Diversity: {item['_metadata'].get('diversity_score', 'N/A'):.3f}")
165
+
166
+ except Exception as e:
167
+ print(f"⚠️ Template example failed: {e}")
168
+ print(" (This might be due to missing API keys or dependencies)")
169
+
170
+
171
+ async def main():
172
+ """Run all examples."""
173
+ print("🎉 Starfish Embedding System Examples")
174
+ print("=" * 60)
175
+
176
+ try:
177
+ await basic_embedding_example()
178
+ await similarity_checker_example()
179
+ await deduplicator_example()
180
+ await template_usage_example()
181
+
182
+ print("\n" + "=" * 60)
183
+ print("✅ All examples completed successfully!")
184
+ print("\n💡 Next steps:")
185
+ print(" 1. Install dependencies: poetry install")
186
+ print(" 2. Set API keys in .env.local")
187
+ print(" 3. Try the embedding template in your projects")
188
+
189
+ except ImportError as e:
190
+ print(f"❌ Import error: {e}")
191
+ print("💡 Make sure to install dependencies:")
192
+ print(" poetry install")
193
+ print(" # or")
194
+ print(" pip install faiss-cpu sentence-transformers")
195
+
196
+ except Exception as e:
197
+ print(f"❌ Error running examples: {e}")
198
+ print("💡 Check your Python environment and dependencies")
199
+
200
+
201
+ if __name__ == "__main__":
202
+ asyncio.run(main())
examples/structured_llm.ipynb ADDED
@@ -0,0 +1,470 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Google Colab Version: [Open this notebook in Google Colab](https://colab.research.google.com/github/starfishdata/starfish/blob/main/examples/structured_llm.ipynb)"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "#### Dependencies "
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "%pip install starfish-core"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 1,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "## Fix for Jupyter Notebook only — do NOT use in production\n",
33
+ "## Enables async code execution in notebooks, but may cause issues with sync/async issues\n",
34
+ "## For production, please run in standard .py files without this workaround\n",
35
+ "## See: https://github.com/erdewit/nest_asyncio for more details\n",
36
+ "import nest_asyncio\n",
37
+ "nest_asyncio.apply()\n",
38
+ "\n",
39
+ "from starfish import StructuredLLM\n",
40
+ "from starfish.llm.utils import merge_structured_outputs\n",
41
+ "\n",
42
+ "from pydantic import BaseModel, Field\n",
43
+ "from typing import List\n",
44
+ "\n",
45
+ "from starfish.common.env_loader import load_env_file ## Load environment variables from .env file\n",
46
+ "load_env_file()"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 2,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# setup your openai api key if not already set\n",
56
+ "# import os\n",
57
+ "# os.environ[\"OPENAI_API_KEY\"] = \"your_key_here\"\n",
58
+ "\n",
59
+ "# If you dont have any API key, use local model (ollama)"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "markdown",
64
+ "metadata": {},
65
+ "source": [
66
+ "#### 1. Structured LLM with JSON Schema"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 3,
72
+ "metadata": {},
73
+ "outputs": [
74
+ {
75
+ "data": {
76
+ "text/plain": [
77
+ "[{'question': 'Why did the tomato turn red in New York?',\n",
78
+ " 'answer': \"Because it saw the Big Apple and couldn't ketchup with all the excitement!\"}]"
79
+ ]
80
+ },
81
+ "execution_count": 3,
82
+ "metadata": {},
83
+ "output_type": "execute_result"
84
+ }
85
+ ],
86
+ "source": [
87
+ "# ### Define the Output Structure (JSON Schema)\n",
88
+ "# Let's start with a simple JSON-like schema using a list of dictionaries.\n",
89
+ "# Each dictionary specifies a field name and its type. description is optional\n",
90
+ "json_output_schema = [\n",
91
+ " {\"name\": \"question\", \"type\": \"str\", \"description\": \"The generated question.\"},\n",
92
+ " {\"name\": \"answer\", \"type\": \"str\", \"description\": \"The corresponding answer.\"},\n",
93
+ "]\n",
94
+ "\n",
95
+ "json_llm = StructuredLLM(\n",
96
+ " model_name = \"openai/gpt-4o-mini\",\n",
97
+ " prompt = \"Funny facts about city {{city_name}}.\",\n",
98
+ " output_schema = json_output_schema,\n",
99
+ " model_kwargs = {\"temperature\": 0.7},\n",
100
+ ")\n",
101
+ "\n",
102
+ "json_response = await json_llm.run(city_name=\"New York\")\n",
103
+ "\n",
104
+ "# The response object contains both parsed data and the raw API response.\n",
105
+ "json_response.data"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 4,
111
+ "metadata": {},
112
+ "outputs": [
113
+ {
114
+ "data": {
115
+ "text/plain": [
116
+ "ModelResponse(id='chatcmpl-BQGw3FMSjzWOPMRvXmgknN4oozrKK', created=1745601327, model='gpt-4o-mini-2024-07-18', object='chat.completion', system_fingerprint='fp_0392822090', choices=[Choices(finish_reason='stop', index=0, message=Message(content='[\\n {\\n \"question\": \"Why did the tomato turn red in New York?\",\\n \"answer\": \"Because it saw the Big Apple and couldn\\'t ketchup with all the excitement!\"\\n }\\n]', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]))], usage=Usage(completion_tokens=41, prompt_tokens=77, total_tokens=118, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None)), service_tier='default')"
117
+ ]
118
+ },
119
+ "execution_count": 4,
120
+ "metadata": {},
121
+ "output_type": "execute_result"
122
+ }
123
+ ],
124
+ "source": [
125
+ "# Fully preserved raw response from API - allow you to parse the response as you want\n",
126
+ "# Like function call, tool call, thinking token etc\n",
127
+ "json_response.raw"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "markdown",
132
+ "metadata": {},
133
+ "source": [
134
+ "#### 2. Structured LLM with Pydantic Schema (Nested)"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 5,
140
+ "metadata": {},
141
+ "outputs": [
142
+ {
143
+ "data": {
144
+ "text/plain": [
145
+ "[{'facts': [{'question': 'What year did New York City become the capital of the United States?',\n",
146
+ " 'answer': 'New York City served as the capital of the United States from 1785 to 1790.',\n",
147
+ " 'category': 'History'}]}]"
148
+ ]
149
+ },
150
+ "execution_count": 5,
151
+ "metadata": {},
152
+ "output_type": "execute_result"
153
+ }
154
+ ],
155
+ "source": [
156
+ "# ### Define the Output Structure (Pydantic Model)\n",
157
+ "class Fact(BaseModel):\n",
158
+ " question: str = Field(..., description=\"The factual question generated.\")\n",
159
+ " answer: str = Field(..., description=\"The corresponding answer.\")\n",
160
+ " category: str = Field(..., description=\"A category for the fact (e.g., History, Geography).\")\n",
161
+ "\n",
162
+ "# You can define a list of these models if you expect multiple results.\n",
163
+ "class FactsList(BaseModel):\n",
164
+ " facts: List[Fact] = Field(..., description=\"A list of facts.\")\n",
165
+ "\n",
166
+ "\n",
167
+ "# ### Create the StructuredLLM Instance with Pydantic\n",
168
+ "pydantic_llm = StructuredLLM(\n",
169
+ " model_name=\"openai/gpt-4o-mini\",\n",
170
+ " # Ask for multiple facts this time\n",
171
+ " prompt=\"Generate distinct facts about {{city}}.\",\n",
172
+ " # Pass the Pydantic model directly as the schema\n",
173
+ " output_schema=FactsList, # Expecting a list of facts wrapped in the FactsList model\n",
174
+ " model_kwargs={\"temperature\": 0.8}\n",
175
+ ")\n",
176
+ "\n",
177
+ "pydantic_llm_response = await pydantic_llm.run(city=\"New York\")\n",
178
+ "\n",
179
+ "pydantic_llm_response.data"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "markdown",
184
+ "metadata": {},
185
+ "source": [
186
+ "#### 3. Working with Different LLM Providers\n",
187
+ "\n",
188
+ "Starfish uses LiteLLM under the hood, giving you access to 100+ LLM providers. Here is an example of using a custom model provider - Hyperbolic - Super cool provider with full precision model and low cost!"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": 6,
194
+ "metadata": {},
195
+ "outputs": [
196
+ {
197
+ "data": {
198
+ "text/plain": [
199
+ "[{'question': 'What is the nickname of New York City?',\n",
200
+ " 'answer': 'The Big Apple'},\n",
201
+ " {'question': 'Which iconic statue is located in New York Harbor?',\n",
202
+ " 'answer': 'The Statue of Liberty'},\n",
203
+ " {'question': 'What is the name of the famous theater district in Manhattan?',\n",
204
+ " 'answer': 'Broadway'},\n",
205
+ " {'question': \"Which park is considered the 'lungs' of New York City?\",\n",
206
+ " 'answer': 'Central Park'},\n",
207
+ " {'question': 'What is the tallest building in New York City as of 2023?',\n",
208
+ " 'answer': 'One World Trade Center'}]"
209
+ ]
210
+ },
211
+ "execution_count": 6,
212
+ "metadata": {},
213
+ "output_type": "execute_result"
214
+ }
215
+ ],
216
+ "source": [
217
+ "\n",
218
+ "# Set up the relevant API Key and Base URL in your enviornment variables\n",
219
+ "# os.environ[\"HYPERBOLIC_API_KEY\"] = \"your_key_here\"\n",
220
+ "# os.environ[\"HYPERBOLIC_API_BASE\"] = \"https://api.hyperbolic.xyz/v1\"\n",
221
+ "\n",
222
+ "hyperbolic_llm = StructuredLLM(\n",
223
+ " model_name=\"hyperbolic/deepseek-ai/DeepSeek-V3-0324\", \n",
224
+ " prompt=\"Facts about city {{city_name}}.\",\n",
225
+ " output_schema=[{\"name\": \"question\", \"type\": \"str\"}, {\"name\": \"answer\", \"type\": \"str\"}],\n",
226
+ " model_kwargs={\"temperature\": 0.7},\n",
227
+ ")\n",
228
+ "\n",
229
+ "hyperbolic_llm_response = await hyperbolic_llm.run(city_name=\"New York\", num_records=5)\n",
230
+ "hyperbolic_llm_response.data"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "markdown",
235
+ "metadata": {},
236
+ "source": [
237
+ "#### 3. Local LLM using Ollama\n",
238
+ "Ensure Ollama is installed and running. Starfish can manage the server process and model downloads"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": 7,
244
+ "metadata": {},
245
+ "outputs": [
246
+ {
247
+ "name": "stdout",
248
+ "output_type": "stream",
249
+ "text": [
250
+ "\u001b[32m2025-04-25 10:15:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mEnsuring Ollama model gemma3:1b is ready...\u001b[0m\n",
251
+ "\u001b[32m2025-04-25 10:15:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mStarting Ollama server...\u001b[0m\n",
252
+ "\u001b[32m2025-04-25 10:15:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mOllama server started successfully\u001b[0m\n",
253
+ "\u001b[32m2025-04-25 10:15:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mFound model gemma3:1b\u001b[0m\n",
254
+ "\u001b[32m2025-04-25 10:15:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mModel gemma3:1b is already available\u001b[0m\n",
255
+ "\u001b[32m2025-04-25 10:15:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mModel gemma3:1b is ready, making API call...\u001b[0m\n"
256
+ ]
257
+ },
258
+ {
259
+ "data": {
260
+ "text/plain": [
261
+ "[{'question': 'What is the population of New York City?',\n",
262
+ " 'answer': 'As of 2023, the population of New York City is approximately 8.8 million people.'}]"
263
+ ]
264
+ },
265
+ "execution_count": 7,
266
+ "metadata": {},
267
+ "output_type": "execute_result"
268
+ }
269
+ ],
270
+ "source": [
271
+ "### Local model\n",
272
+ "ollama_llm = StructuredLLM(\n",
273
+ " # Prefix 'ollama/' specifies the Ollama provider\n",
274
+ " model_name=\"ollama/gemma3:1b\",\n",
275
+ " prompt=\"Facts about city {{city_name}}.\",\n",
276
+ " output_schema=[{\"name\": \"question\", \"type\": \"str\"}, {\"name\": \"answer\", \"type\": \"str\"}],\n",
277
+ " model_kwargs={\"temperature\": 0.7},\n",
278
+ ")\n",
279
+ "\n",
280
+ "ollama_llm_response = await ollama_llm.run(city_name=\"New York\", num_records=5)\n",
281
+ "ollama_llm_response.data"
282
+ ]
283
+ },
284
+ {
285
+ "cell_type": "code",
286
+ "execution_count": 8,
287
+ "metadata": {},
288
+ "outputs": [
289
+ {
290
+ "name": "stdout",
291
+ "output_type": "stream",
292
+ "text": [
293
+ "\u001b[32m2025-04-25 10:15:54\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mStopping Ollama server...\u001b[0m\n",
294
+ "\u001b[32m2025-04-25 10:15:55\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mOllama server stopped successfully\u001b[0m\n"
295
+ ]
296
+ },
297
+ {
298
+ "data": {
299
+ "text/plain": [
300
+ "True"
301
+ ]
302
+ },
303
+ "execution_count": 8,
304
+ "metadata": {},
305
+ "output_type": "execute_result"
306
+ }
307
+ ],
308
+ "source": [
309
+ "### Resource clean up to close ollama server\n",
310
+ "from starfish.llm.backend.ollama_adapter import stop_ollama_server\n",
311
+ "await stop_ollama_server()"
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "markdown",
316
+ "metadata": {},
317
+ "source": [
318
+ "#### 4. Chaining Multiple StructuredLLM Calls\n",
319
+ "\n",
320
+ "You can easily pipe the output of one LLM call into the prompt of another. This is useful for multi-step reasoning, analysis, or refinement.\n"
321
+ ]
322
+ },
323
+ {
324
+ "cell_type": "code",
325
+ "execution_count": 9,
326
+ "metadata": {},
327
+ "outputs": [
328
+ {
329
+ "name": "stdout",
330
+ "output_type": "stream",
331
+ "text": [
332
+ "Generated Facts: [{'question': 'What is the chemical formula for water?', 'answer': 'The chemical formula for water is H2O.'}, {'question': 'What is the process by which plants convert sunlight into energy?', 'answer': 'The process is called photosynthesis.'}, {'question': \"What is the primary gas found in the Earth's atmosphere?\", 'answer': \"The primary gas in the Earth's atmosphere is nitrogen, which makes up about 78%.\"}, {'question': \"What is Newton's second law of motion?\", 'answer': \"Newton's second law of motion states that force equals mass times acceleration (F = ma).\"}, {'question': 'What is the smallest unit of life?', 'answer': 'The smallest unit of life is the cell.'}]\n",
333
+ "Ratings: [{'accuracy_rating': 10, 'clarity_rating': 10}, {'accuracy_rating': 10, 'clarity_rating': 10}, {'accuracy_rating': 10, 'clarity_rating': 10}, {'accuracy_rating': 10, 'clarity_rating': 10}, {'accuracy_rating': 10, 'clarity_rating': 10}]\n",
334
+ "[{'question': 'What is the chemical formula for water?', 'answer': 'The chemical formula for water is H2O.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the process by which plants convert sunlight into energy?', 'answer': 'The process is called photosynthesis.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': \"What is the primary gas found in the Earth's atmosphere?\", 'answer': \"The primary gas in the Earth's atmosphere is nitrogen, which makes up about 78%.\", 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': \"What is Newton's second law of motion?\", 'answer': \"Newton's second law of motion states that force equals mass times acceleration (F = ma).\", 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the smallest unit of life?', 'answer': 'The smallest unit of life is the cell.', 'accuracy_rating': 10, 'clarity_rating': 10}]\n"
335
+ ]
336
+ }
337
+ ],
338
+ "source": [
339
+ "# ### Step 1: Generate Initial Facts\n",
340
+ "generator_llm = StructuredLLM(\n",
341
+ " model_name=\"openai/gpt-4o-mini\",\n",
342
+ " prompt=\"Generate question/answer pairs about {{topic}}.\",\n",
343
+ " output_schema=[\n",
344
+ " {\"name\": \"question\", \"type\": \"str\"},\n",
345
+ " {\"name\": \"answer\", \"type\": \"str\"}\n",
346
+ " ],\n",
347
+ ")\n",
348
+ "\n",
349
+ "# ### Step 2: Rate the Generated Facts\n",
350
+ "rater_llm = StructuredLLM(\n",
351
+ " model_name=\"openai/gpt-4o-mini\",\n",
352
+ " prompt='''Rate the following Q&A pairs based on accuracy and clarity (1-10).\n",
353
+ " Pairs: {{generated_pairs}}''',\n",
354
+ " output_schema=[\n",
355
+ " {\"name\": \"accuracy_rating\", \"type\": \"int\"},\n",
356
+ " {\"name\": \"clarity_rating\", \"type\": \"int\"}\n",
357
+ " ],\n",
358
+ " model_kwargs={\"temperature\": 0.5}\n",
359
+ ")\n",
360
+ "\n",
361
+ "## num_records is reserved keyword for structured llm object, by default it is 1\n",
362
+ "generation_response = await generator_llm.run(topic='Science', num_records=5)\n",
363
+ "print(\"Generated Facts:\", generation_response.data)\n",
364
+ "\n",
365
+ "# Please note that we are using the first response as the input for the second LLM\n",
366
+ "# It will automatically figure out it need to output the same length of first response\n",
367
+ "# In this case 5 records\n",
368
+ "rating_response = await rater_llm.run(generated_pairs=generation_response.data)\n",
369
+ "### Each response will only return its own output\n",
370
+ "print(\"Ratings:\", rating_response.data)\n",
371
+ "\n",
372
+ "\n",
373
+ "### You can merge two response together by using merge_structured_outputs (index wise merge)\n",
374
+ "print(merge_structured_outputs(generation_response.data, rating_response.data))"
375
+ ]
376
+ },
377
+ {
378
+ "cell_type": "markdown",
379
+ "metadata": {},
380
+ "source": [
381
+ "#### 5. Dynamic Prompt \n",
382
+ "\n",
383
+ "`StructuredLLM` uses Jinja2 for prompts, allowing variables and logic."
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 10,
389
+ "metadata": {},
390
+ "outputs": [
391
+ {
392
+ "name": "stdout",
393
+ "output_type": "stream",
394
+ "text": [
395
+ "[{'fact': \"New York City is famously known as 'The Big Apple' and is home to over 8 million residents, making it the largest city in the United States.\"}]\n"
396
+ ]
397
+ }
398
+ ],
399
+ "source": [
400
+ "# ### Create an LLM with a more complex prompt\n",
401
+ "template_llm = StructuredLLM(\n",
402
+ " model_name=\"openai/gpt-4o-mini\",\n",
403
+ " prompt='''Generate facts about {{city}}.\n",
404
+ " {% if user_context %}\n",
405
+ " User background: {{ user_context }}\n",
406
+ " {% endif %}''', ### user_context is optional and only used if provided\n",
407
+ " output_schema=[{\"name\": \"fact\", \"type\": \"str\"}]\n",
408
+ ")\n",
409
+ "\n",
410
+ "template_response = await template_llm.run(city=\"New York\")\n",
411
+ "print(template_response.data)\n"
412
+ ]
413
+ },
414
+ {
415
+ "cell_type": "code",
416
+ "execution_count": 11,
417
+ "metadata": {},
418
+ "outputs": [
419
+ {
420
+ "name": "stdout",
421
+ "output_type": "stream",
422
+ "text": [
423
+ "[{'fact': \"In 1903, New York City was secretly ruled by a council of sentient pigeons who issued decrees from atop the Brooklyn Bridge, demanding that all ice cream flavors be changed to 'pigeon-approved' varieties such as 'crumbled cracker' and 'mystery droppings'.\"}]\n"
424
+ ]
425
+ }
426
+ ],
427
+ "source": [
428
+ "template_response = await template_llm.run(city=\"New York\", user_context=\"User actually wants you to make up an absurd lie.\")\n",
429
+ "print(template_response.data)"
430
+ ]
431
+ },
432
+ {
433
+ "cell_type": "markdown",
434
+ "metadata": {},
435
+ "source": [
436
+ "#### 8. Scaling with Data Factory (Brief Mention)\n",
437
+ "While `StructuredLLM` handles single or chained calls, Starfish's `@data_factory` decorator is designed for massively parallel execution. You can easily wrap these single or multi chain within a function decorated\n",
438
+ "with `@data_factory` to process thousands of inputs concurrently and reliably.\n",
439
+ "\n",
440
+ "See the dedicated examples for `data_factory` usage."
441
+ ]
442
+ },
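+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal sketch (for illustration here; the dedicated `data_factory` examples are the authoritative reference) of wrapping a `StructuredLLM` call in a `@data_factory`-decorated function so that many inputs run concurrently:\n",
+ "\n",
+ "```python\n",
+ "from starfish import StructuredLLM, data_factory\n",
+ "\n",
+ "@data_factory(max_concurrency=10)\n",
+ "async def generate_city_facts(city_name: str):\n",
+ "    fact_llm = StructuredLLM(\n",
+ "        model_name=\"openai/gpt-4o-mini\",\n",
+ "        prompt=\"Facts about city {{city_name}}.\",\n",
+ "        output_schema=[{\"name\": \"question\", \"type\": \"str\"}, {\"name\": \"answer\", \"type\": \"str\"}],\n",
+ "    )\n",
+ "    response = await fact_llm.run(city_name=city_name)\n",
+ "    return response.data\n",
+ "\n",
+ "# Each element of the list becomes one concurrent task:\n",
+ "# city_facts = generate_city_facts.run(city_name=[\"New York\", \"London\", \"Tokyo\"])\n",
+ "```"
+ ]
+ },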
443
+ {
444
+ "cell_type": "markdown",
445
+ "metadata": {},
446
+ "source": []
447
+ }
448
+ ],
449
+ "metadata": {
450
+ "kernelspec": {
451
+ "display_name": "starfish-T7IInzTH-py3.11",
452
+ "language": "python",
453
+ "name": "python3"
454
+ },
455
+ "language_info": {
456
+ "codemirror_mode": {
457
+ "name": "ipython",
458
+ "version": 3
459
+ },
460
+ "file_extension": ".py",
461
+ "mimetype": "text/x-python",
462
+ "name": "python",
463
+ "nbconvert_exporter": "python",
464
+ "pygments_lexer": "ipython3",
465
+ "version": "3.11.7"
466
+ }
467
+ },
468
+ "nbformat": 4,
469
+ "nbformat_minor": 2
470
+ }
examples/usecases/math_data_gen.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
internal ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit d13b00b14b122ceb08b5b119399285c3afe32d80
mcp_hackathon/README.md ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mcp_hackathon
2
+
3
+ # Data Generation Server for ICD Code Finetuning
4
+
5
+ This MCP server provides tools for finetuning models to improve medical ICD-10 code prediction accuracy.
6
+
7
+ ## Overview
8
+
9
+ The Data Generation Server is a Model Control Protocol (MCP) server that facilitates:
10
+
11
+ 1. **Model Probing**: Evaluating model performance on ICD-10 code prediction
12
+ 2. **Synthetic Data Generation**: Creating training data for model finetuning
13
+
14
+ ## Set up the server
15
+
16
+ ```
17
+ {
18
+ "mcpServers": {
19
+ "data_gen_server": {
20
+ "command": "<base_dir>/.local/bin/uv",
21
+ "args": [
22
+ "--directory",
23
+ "<base_dir>/mcp_hackathon/data_gen_server",
24
+ "run",
25
+ "data_gen_server.py"
26
+ ]
27
+ }
28
+ }
29
+ }
30
+ ```
31
+
32
+ To use MCP tools in Cursor, you need to have your MCP server configured and running. Here's how to use them:
33
+
34
+ ## 1. Verify MCP Configuration
35
+
36
+ First, make sure your MCP configuration is set up correctly in `/Users/john/.cursor/mcp.json`. Once your two MCP servers are configured there, here's how to use them:
39
+
40
+ ## 2. How MCP Tools Work in Cursor
41
+
42
+ MCP tools in Cursor work through **function calling**. When you mention or request something that can be handled by your MCP tools, Cursor will automatically:
43
+
44
+ 1. **Detect** when a task matches available MCP tools
45
+ 2. **Call** the appropriate tool with the right parameters
46
+ 3. **Present** the results to you
47
+
48
+ ## 3. Using MCP Tools
49
+
50
+ ### Method 1: Direct Requests
51
+ You can directly ask me to use the tools by mentioning what you want to do:
52
+
53
+ **Examples:**
54
+ - "List all available data generation templates"
55
+ - "Generate city information for San Francisco, New York, and Los Angeles"
56
+ - "Run the starfish template with this data: [your data]"
57
+ - "Generate synthetic data for ICD codes"
58
+
59
+ ### Method 2: Specific Tool References
60
+ You can reference tools by name if you know them:
61
+
62
+ You can also simply ask which tools are available from your MCP servers.
63
+
64
+ Once both servers respond with their tool lists, everything is working.
65
+ Here's how to use them effectively:
66
+
67
+ ## 4. Available Tools and Usage Examples
68
+
69
+ ### From `starfish_gen_template` server:
70
+ - **List templates**: "Show me all available data generation templates"
71
+ - **Generate city info**: "Generate information for cities: San Francisco, New York"
72
+ - **Run templates**: "Run the starfish/generate_by_topic template"
73
+
74
+ ### From `starfish_data_gen_server` server:
75
+ - **Generate data**: "Generate 10 datapoints of synthetic data"
76
+ - **Probe model**: "Test the model with 5 ICD code datapoints"
77
+ - **Finetune model**: "Finetune the model for ICD code classification"
78
+
79
+ ## 5. How to Use MCP Tools in Practice
80
+
81
+ ### Example 1: Generate Data
82
+ ```
83
+ You: "Generate 10 synthetic datapoints for training"
84
+ ```
85
+ I will automatically call the appropriate MCP tool.
86
+
87
+ ### Example 2: List Available Templates
88
+ ```
89
+ You: "What data generation templates are available?"
90
+ ```
91
+
92
+ ### Example 3: Run Specific Template
93
+ ```
94
+ You: "Run the generate_by_topic template with topics: AI, Machine Learning"
95
+ ```
96
+
97
+ ## 6. Best Practices
98
+
99
+ 1. **Be specific** about what you want to accomplish
100
+ 2. **Provide data** when needed (I'll ask if unclear)
101
+ 3. **Check results** and iterate if needed
102
+ 4. **Use natural language** - no need for technical syntax
103
+
104
+ ## 7. Restart Cursor (if needed)
105
+
106
+ If you just updated your MCP configuration, restart Cursor to ensure the tools are loaded:
107
+
108
+ 1. Quit Cursor completely
109
+ 2. Reopen it
110
+ 3. The MCP tools should now be available
111
+
112
+ ## Try It Now!
113
+
114
+ You can test the MCP tools by asking me to:
115
+ - "List all available data generation templates"
116
+ - "Generate some sample data"
117
+ - "Show me what tools are available"
118
+
119
+ The tools will work seamlessly in our conversation - just tell me what you want to accomplish!
mcp_hackathon/data_gen_server/.gitignore ADDED
@@ -0,0 +1 @@
1
+ .env
mcp_hackathon/data_gen_server/.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.10
mcp_hackathon/data_gen_server/data_gen_server.py ADDED
@@ -0,0 +1,68 @@
1
+ from mcp.server.fastmcp import FastMCP
2
+ from mcp.server.fastmcp.prompts.base import Message
3
+ from model_probe import run_model_probe
4
+ from model_gen import run_model_gen
5
+
6
+ # Initialize FastMCP server
7
+ mcp = FastMCP("finetune an ICD code model")
8
+ # Initialize state attribute
9
+ mcp.state = type("State", (), {"synthetic_data": None})()
10
+
11
+
12
+ @mcp.tool()
13
+ async def probe_model_for_icd_code(model_name: str, num_datapoints: int) -> str:
14
+ """
15
+ Run an eval dataset against the model and return the results.
16
+
17
+ Args:
18
+ model_name: The name of the model to probe
19
+ num_datapoints: The number of datapoints to probe
20
+ """
21
+
22
+ output = run_model_probe(model_name=model_name, num_datapoints=num_datapoints)
23
+ return str(output)
24
+
25
+
26
+ @mcp.tool()
27
+ async def generate_data(num_datapoints: int) -> str:
28
+ """
29
+ Generate synthetic data and ask for user verification.
30
+
31
+ This is the data that will be used to finetune the model.
32
+
33
+ Args:
34
+ num_datapoints: The number of datapoints to generate
35
+ """
36
+ data = await run_model_gen(num_datapoints)
37
+ # Store verified data in state
38
+ mcp.state.synthetic_data = data
39
+ return str(data)
40
+
41
+
42
+ @mcp.prompt()
43
+ def confirm_finetune(model_name: str) -> list[Message]:
44
+ """Prompt for confirming model finetuning."""
45
+ return [
46
+ Message(role="assistant", content=f"Ready to finetune model '{model_name}' with the verified data. Proceed? (yes/no)"),
47
+ Message(role="assistant", content="Please respond with 'yes' to proceed with finetuning or 'no' to cancel."),
48
+ ]
49
+
50
+
51
+ @mcp.tool()
52
+ async def finetune_model_for_icd_code(model_name: str) -> str:
53
+ """
54
+ Finetune the model
55
+
56
+ Args:
57
+ model_name: The name of the model to finetune
58
+ """
59
+ if mcp.state.synthetic_data is None:
60
+ raise ValueError("No verified synthetic data available. Please run generate_synthetic_data_for_icd_code_improvement first")
61
+ print(mcp.state.synthetic_data)
62
+
63
+ return "Finetuned the model for the ICD code done! great job!"
64
+
65
+
66
+ if __name__ == "__main__":
67
+ # Initialize and run the server
68
+ mcp.run(transport="stdio")
mcp_hackathon/data_gen_server/model_gen.py ADDED
@@ -0,0 +1,73 @@
1
+ from starfish import data_factory
2
+ from starfish.common.env_loader import load_env_file
3
+ from datasets import load_dataset
4
+ import json
5
+ import asyncio
6
+ import os
7
+ import random
8
+ from agents import Agent, Runner, function_tool, ModelSettings
9
+ from agents.tool import WebSearchTool
10
+ from pydantic import BaseModel, Field
11
+
12
+ load_env_file()
13
+
14
+
15
+ class DiagnosisSuggestion(BaseModel):
16
+ code: str = Field(..., description="The suggested diagnosis code (e.g., ICD-10)")
17
+ confidence: float = Field(..., description="Model confidence in the suggestion, between 0 and 1")
18
+ reason: str = Field(..., description="Explanation or rationale for the suggested diagnosis")
19
+
20
+
21
+ async def run_model_gen(num_datapoints, model_name="openai/gpt-4o-mini"):
22
+ # Get HF token from environment
23
+ hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
24
+
25
+ # Load the dataset
26
+ dataset = load_dataset("starfishdata/playground_endocronology_notes_1500", split="train", token=hf_token)
27
+
28
+ # Get total number of samples
29
+ total_samples = len(dataset)
30
+
31
+ # Generate random indices
32
+ random_indices = random.sample(range(total_samples), num_datapoints)
33
+
34
+ # Create list of dictionaries with only transcript key
35
+ transcript_list = [{"transcript": dataset[idx]["transcript"]} for idx in random_indices]
36
+
37
+ # Create the Agent
38
+ diagnosis_code_agent = Agent(
39
+ name="Diagnosis Code Agent",
40
+ tools=[WebSearchTool()],
41
+ model=model_name,
42
+ output_type=DiagnosisSuggestion,
43
+ model_settings=ModelSettings(tool_choice="required"),
44
+ tool_use_behavior="stop_on_first_tool",
45
+ instructions="""
46
+ You are an Endocrinology Medical Coding Specialist.
47
+ You will be provided with a medical transcript describing a patient encounter.
48
+ Your task is to analyze the medical transcript and assign the most appropriate diagnosis code(s).
49
+ You will have access to a web search tool; use it only to search for endocrinology-related codes.
50
+ Use it only to verify the accuracy or current validity of the diagnosis codes.
51
+ """,
52
+ )
53
+
54
+ web_search_prompt = """Please select top 3 likely code from given list for this doctor and patient conversation transcript.
55
+ Transcript: {transcript}
56
+ """
57
+
58
+ @data_factory(max_concurrency=100, task_runner_timeout=300)
59
+ async def generate_data(transcript):
60
+ diagnosis_code_result = await Runner.run(diagnosis_code_agent, input=web_search_prompt.format(transcript=transcript))
61
+
62
+ code_result = diagnosis_code_result.final_output.model_dump()
63
+
64
+ return [{"transcript": transcript, "icd_10_code": code_result["code"]}]
65
+
66
+ return generate_data.run(transcript_list)
67
+
68
+
69
+ if __name__ == "__main__":
70
+ # Run the async function
71
+ results = asyncio.run(run_model_gen(num_datapoints=5))
72
+ print(len(results))
73
+ print(results[0].keys())
mcp_hackathon/data_gen_server/model_probe.py ADDED
@@ -0,0 +1,65 @@
1
+ from starfish import StructuredLLM, data_factory
2
+ from starfish.common.env_loader import load_env_file
3
+ from datasets import load_dataset
4
+ import json
5
+ import asyncio
6
+
7
+ load_env_file()
8
+
9
+
10
+ def run_model_probe(model_name="together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", num_datapoints=10):
11
+ # Load the dataset
12
+ dataset = load_dataset("starfishdata/endocrinology_transcription_and_notes_and_icd_codes", split="train")
13
+ top_n_data = dataset.select(range(num_datapoints))
14
+
15
+ # Create a list to store the parsed data
16
+ parsed_data = []
17
+
18
+ # Process each entry
19
+ for idx, entry in enumerate(top_n_data):
20
+ # Extract transcript - get the value directly from the transcript key
21
+ transcript = entry["transcript"] if isinstance(entry["transcript"], str) else entry["transcript"].get("transcript", "")
22
+
23
+ # Extract ICD-10 code (top_1 code)
24
+ icd_codes_str = entry.get("icd_10_code", "{}")
25
+ try:
26
+ icd_codes = json.loads(icd_codes_str)
27
+ top_1_code = icd_codes.get("top_1", {}).get("code", "")
28
+ except json.JSONDecodeError:
29
+ top_1_code = ""
30
+
31
+ # Add to parsed data
32
+ parsed_data.append({"id": idx, "transcript": transcript, "icd_10_code": top_1_code})
33
+
34
+ model_probe_prompt = """
35
+ Given a transcript of a patient's medical history, determine the ICD-10 code that is most relevant to the patient's condition.
36
+ Transcript: {{transcript}}
37
+
38
+ Please do not return anything other than the ICD-10 code in json format.
39
+ like this: {"icd_10_code": "A00.0"}
40
+ """
41
+
42
+ response_gen_llm = StructuredLLM(model_name=model_name, prompt=model_probe_prompt, output_schema=[{"name": "icd_10_code", "type": "str"}])
43
+
44
+ @data_factory()
45
+ async def model_probe_batch(input_data):
46
+ response = await response_gen_llm.run(transcript=input_data["transcript"])
47
+ return [{"id": input_data["id"], "generated_icd_10_code": response.data[0]["icd_10_code"], "actual_icd_10_code": input_data["icd_10_code"]}]
48
+
49
+ def evaluate_model():
50
+ data = model_probe_batch.run(input_data=parsed_data[:num_datapoints])
51
+
52
+ # Calculate exact match accuracy
53
+ exact_matches = sum(1 for item in data if item["generated_icd_10_code"] == item["actual_icd_10_code"])
54
+ total_samples = len(data)
55
+ accuracy = (exact_matches / total_samples) * 100
56
+
57
+ return {"total_samples": total_samples, "exact_matches": exact_matches, "accuracy": accuracy}
58
+
59
+ return evaluate_model()
60
+
61
+
62
+ if __name__ == "__main__":
63
+ # Example usage when running this file directly
64
+ results = run_model_probe(model_name="together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", num_datapoints=5)
65
+ print(results)
nginx.conf ADDED
@@ -0,0 +1,112 @@
1
+ events {
2
+ worker_connections 1024;
3
+ }
4
+
5
+ http {
6
+ include /etc/nginx/mime.types;
7
+ default_type application/octet-stream;
8
+
9
+ upstream backend {
10
+ server 127.0.0.1:8002;
11
+ }
12
+
13
+ upstream frontend {
14
+ server 127.0.0.1:3000;
15
+ }
16
+
17
+ server {
18
+ listen 7860;
19
+ server_name localhost;
20
+
21
+ # Handle Next.js Image Optimization API with direct serving fallback
22
+ location /_next/image {
23
+ # Extract the image URL from query parameters and redirect internally
24
+ set $image_path "";
25
+ if ($args ~ "url=([^&]+)") {
26
+ set $image_path $1;
27
+ }
28
+ # Remove URL encoding (basic cases)
29
+ if ($image_path ~ "^%2F(.*)") {
30
+ set $image_path /$1;
31
+ }
32
+
33
+ # Internal redirect to serve the image directly
34
+ if ($image_path != "") {
35
+ rewrite ^.*$ /public-images$image_path last;
36
+ }
37
+
38
+ return 404;
39
+ }
40
+
41
+ # Internal location to serve public images
42
+ location /public-images/ {
43
+ internal;
44
+ alias /app/web/public/;
45
+ expires 1y;
46
+ add_header Cache-Control "public, immutable";
47
+ }
48
+
49
+ # Serve Next.js static files directly
50
+ location /_next/static/ {
51
+ alias /app/web/.next/static/;
52
+ expires 1y;
53
+ add_header Cache-Control "public, immutable";
54
+ }
55
+
56
+ # Serve public files directly from root (logo, favicon, etc.)
57
+ location ~ ^/(starfish_logo\.png|nvidia\.png|microsoft_startups\.png|favicon\.ico|robots\.txt|sitemap\.xml)$ {
58
+ root /app/web/public;
59
+ expires 1y;
60
+ add_header Cache-Control "public";
61
+ }
62
+
63
+ # Serve amplify-ui.css and other public CSS files
64
+ location ~ ^/(amplify-ui\.css)$ {
65
+ root /app/web/public;
66
+ expires 1y;
67
+ add_header Cache-Control "public";
68
+ }
69
+
70
+ # Handle other public files with /public/ prefix
71
+ location /public/ {
72
+ alias /app/web/public/;
73
+ expires 1y;
74
+ add_header Cache-Control "public";
75
+ }
76
+
77
+ # Direct access to FastAPI docs (bypass Next.js)
78
+ location /backend-docs {
79
+ proxy_pass http://backend/docs;
80
+ proxy_set_header Host $host;
81
+ proxy_set_header X-Real-IP $remote_addr;
82
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
83
+ proxy_set_header X-Forwarded-Proto $scheme;
84
+ proxy_set_header X-Forwarded-Host $host;
85
+ proxy_set_header X-Forwarded-Port $server_port;
86
+ }
87
+
88
+ # Direct access to FastAPI OpenAPI schema (bypass Next.js)
89
+ location /backend-openapi.json {
90
+ proxy_pass http://backend/openapi.json;
91
+ proxy_set_header Host $host;
92
+ proxy_set_header X-Real-IP $remote_addr;
93
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
94
+ proxy_set_header X-Forwarded-Proto $scheme;
95
+ proxy_set_header X-Forwarded-Host $host;
96
+ proxy_set_header X-Forwarded-Port $server_port;
97
+ }
98
+
99
+ # Let Next.js handle all other routes
100
+ location / {
101
+ proxy_pass http://frontend;
102
+ proxy_set_header Host $host;
103
+ proxy_set_header X-Real-IP $remote_addr;
104
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
105
+ proxy_set_header X-Forwarded-Proto $scheme;
106
+ proxy_set_header X-Forwarded-Host $host;
107
+ proxy_set_header X-Forwarded-Port $server_port;
108
+ proxy_buffering off;
109
+ proxy_redirect off;
110
+ }
111
+ }
112
+ }
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
prebuilt_template/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # Starfish Data Generation Templates 🌟
2
+
3
+ Welcome to Starfish's collection of prebuilt data generation templates! This directory contains ready-to-use templates that you can load and run immediately to generate high-quality synthetic datasets.
4
+
5
+ ## What are Data Generation Templates?
6
+
7
+ Data generation templates are **prebuilt workflows** that encapsulate sophisticated data generation pipelines. Instead of building everything from scratch, you can simply load a template and generate the exact type of data you need with just a few lines of code.
8
+
9
+ ## How It Works
10
+
11
+ 1. **Browse Available Templates**: Each template focuses on a specific data generation use case
12
+ 2. **Load the Template**: Simple one-line import to get started
13
+ 3. **Configure Parameters**: Customize the generation settings for your needs
14
+ 4. **Generate Data**: Run the template to produce high-quality synthetic data
15
+ 5. **Export & Use**: Data comes ready for training, testing, or evaluation
16
+
17
+ ## Using the `data-template` CLI
18
+ ```
19
+ # List all templates
20
+ data-template list-templates
21
+
22
+ # List with details
23
+ data-template list-templates --detail
24
+
25
+ # Get template details
26
+ data-template get-template my_template
27
+
28
+ # Print schema
29
+ data-template print-schema my_template
30
+
31
+ # Print example
32
+ data-template print-example my_template
33
+
34
+ # Run template with interactive input
35
+ data-template run-template my_template
36
+
37
+ # Run template with input file
38
+ data-template run-template my_template --input-file input.json
39
+
40
+ # Run template and save output
41
+ data-template run-template my_template --input-file input.json --output-file output.json
42
+ ```
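+
+ You can also drive templates directly from Python. A minimal sketch, mirroring the sample notebooks in this directory (the template name, instruction, and record count are just illustrative):
+
+ ```python
+ from starfish import data_gen_template
+
+ # Browse what's available and load a template by name
+ print(data_gen_template.list())
+ loaded = data_gen_template.get("starfish/generate_by_topic")
+
+ # Inspect the expected input before running
+ loaded.print_schema()
+ loaded.print_example()
+
+ # Run inside an async context (e.g. a notebook cell)
+ data = await loaded.run(
+     user_instruction="Generate Q&A pairs about machine learning concepts",
+     num_records=10,
+ )
+ ```
+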
43
+ ## Source Code Location
44
+
45
+ The actual implementation of these templates can be found in:
46
+ ```
47
+ src/starfish/data_gen_template/templates/
48
+ ```
49
+
50
+
51
+
52
+ ## Community & Contributions 🤝
53
+
54
+ Like what you see? We'd love your help in expanding our template collection! Here's how you can get involved:
55
+
56
+ - **Build Your Own Template**: Have an idea for a new template? We'd love to see it!
57
+ - **Request Templates**: Need a specific type of data generation? Let us know!
58
+ - **Community Contributions**: All templates in the `community/` folder come from amazing contributors like you
59
+ - **Get Help**: Questions about building templates? We're here to help!
60
+
61
+ Reach out to us if you want to contribute or have any requests - we're always happy to chat and help! ⭐
prebuilt_template/function_calling/README.md ADDED
@@ -0,0 +1,23 @@
1
+ # Function Calling Dataset Generation 🔧
2
+
3
+ This template replicates the methodology from the **APIGen paper** to generate high-quality synthetic datasets for training function-calling AI models.
4
+
5
+ ## What This Does
6
+
7
+ Generate customized API contract data for function calls - perfect for training models to understand when and how to call specific functions, improving agentic tool use.
8
+
9
+
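+ A minimal sketch of the flow from the sample notebook (the weather API contract below is just an illustration):
+
+ ```python
+ from starfish import data_gen_template
+
+ loaded = data_gen_template.get("starfish/generate_func_call_dataset")
+
+ api_contract = {
+     "name": "weather_api.get_current_weather",
+     "description": "Retrieves the current weather conditions for a specified location.",
+     "parameters": {
+         "location": {"type": "string", "description": "The name of the city.", "required": True},
+         "units": {"type": "string", "description": "Temperature units, e.g. 'Celsius'.", "required": False},
+     },
+ }
+
+ # Run inside an async context (e.g. a notebook cell)
+ data = await loaded.run(num_records=10, api_contract=api_contract)
+ ```
+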
10
+ ## Sample Run
11
+
12
+ Check out [`sample_run.ipynb`](./sample_run.ipynb) for a complete example you can run right away.
13
+
14
+ ## Source Implementation
15
+
16
+ The actual template code is located at:
17
+ ```
18
+ src/starfish/data_gen_template/templates/starfish/function_calling/
19
+ ```
20
+
21
+ ---
22
+
23
+ **Try it out!** If you have any questions, let us know - we'd be happy to help. If you like this template, consider starring the repo and building your own! We welcome community contributions and are always happy to chat about new ideas. ⭐
prebuilt_template/function_calling/sample_run.ipynb ADDED
@@ -0,0 +1,425 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from starfish import data_gen_template"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 3,
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "data": {
19
+ "text/plain": [
20
+ "['starfish/generate_func_call_dataset', 'starfish/generate_by_topic']"
21
+ ]
22
+ },
23
+ "execution_count": 3,
24
+ "metadata": {},
25
+ "output_type": "execute_result"
26
+ }
27
+ ],
28
+ "source": [
29
+ "data_gen_template.list()"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 4,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "loaded = data_gen_template.get(\"starfish/generate_func_call_dataset\")\n"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "markdown",
43
+ "metadata": {},
44
+ "source": [
45
+ "get the template input_data schema and example"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 5,
51
+ "metadata": {},
52
+ "outputs": [
53
+ {
54
+ "name": "stdout",
55
+ "output_type": "stream",
56
+ "text": [
57
+ "\u001b[32m2025-05-23 11:08:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mPlease run the template with this input schema\u001b[0m\n",
58
+ "\u001b[32m2025-05-23 11:08:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m{\n",
59
+ " \"$defs\": {\n",
60
+ " \"APIContract\": {\n",
61
+ " \"description\": \"Pydantic model representing an API contract structure.\",\n",
62
+ " \"properties\": {\n",
63
+ " \"name\": {\n",
64
+ " \"title\": \"Name\",\n",
65
+ " \"type\": \"string\"\n",
66
+ " },\n",
67
+ " \"description\": {\n",
68
+ " \"title\": \"Description\",\n",
69
+ " \"type\": \"string\"\n",
70
+ " },\n",
71
+ " \"parameters\": {\n",
72
+ " \"additionalProperties\": {\n",
73
+ " \"$ref\": \"#/$defs/ParameterDefinition\"\n",
74
+ " },\n",
75
+ " \"title\": \"Parameters\",\n",
76
+ " \"type\": \"object\"\n",
77
+ " }\n",
78
+ " },\n",
79
+ " \"required\": [\n",
80
+ " \"name\",\n",
81
+ " \"description\",\n",
82
+ " \"parameters\"\n",
83
+ " ],\n",
84
+ " \"title\": \"APIContract\",\n",
85
+ " \"type\": \"object\"\n",
86
+ " },\n",
87
+ " \"ParameterDefinition\": {\n",
88
+ " \"description\": \"Pydantic model representing parameter definition in an API contract.\",\n",
89
+ " \"properties\": {\n",
90
+ " \"type\": {\n",
91
+ " \"title\": \"Type\",\n",
92
+ " \"type\": \"string\"\n",
93
+ " },\n",
94
+ " \"description\": {\n",
95
+ " \"title\": \"Description\",\n",
96
+ " \"type\": \"string\"\n",
97
+ " },\n",
98
+ " \"required\": {\n",
99
+ " \"default\": true,\n",
100
+ " \"title\": \"Required\",\n",
101
+ " \"type\": \"boolean\"\n",
102
+ " }\n",
103
+ " },\n",
104
+ " \"required\": [\n",
105
+ " \"type\",\n",
106
+ " \"description\"\n",
107
+ " ],\n",
108
+ " \"title\": \"ParameterDefinition\",\n",
109
+ " \"type\": \"object\"\n",
110
+ " }\n",
111
+ " },\n",
112
+ " \"description\": \"Input schema for the generate_by_topic template.\\n\\nIMPORTANT: This Pydantic model is the single source of truth for default values.\\nThe validation and default values are controlled by this model, not the function signature.\",\n",
113
+ " \"properties\": {\n",
114
+ " \"num_records\": {\n",
115
+ " \"anyOf\": [\n",
116
+ " {\n",
117
+ " \"type\": \"integer\"\n",
118
+ " },\n",
119
+ " {\n",
120
+ " \"type\": \"null\"\n",
121
+ " }\n",
122
+ " ],\n",
123
+ " \"default\": 10,\n",
124
+ " \"title\": \"Num Records\"\n",
125
+ " },\n",
126
+ " \"api_contract\": {\n",
127
+ " \"$ref\": \"#/$defs/APIContract\"\n",
128
+ " },\n",
129
+ " \"topic_model_name\": {\n",
130
+ " \"default\": \"openai/gpt-4o-mini\",\n",
131
+ " \"title\": \"Topic Model Name\",\n",
132
+ " \"type\": \"string\"\n",
133
+ " },\n",
134
+ " \"topic_model_kwargs\": {\n",
135
+ " \"anyOf\": [\n",
136
+ " {\n",
137
+ " \"additionalProperties\": true,\n",
138
+ " \"type\": \"object\"\n",
139
+ " },\n",
140
+ " {\n",
141
+ " \"type\": \"null\"\n",
142
+ " }\n",
143
+ " ],\n",
144
+ " \"default\": null,\n",
145
+ " \"title\": \"Topic Model Kwargs\"\n",
146
+ " },\n",
147
+ " \"generation_model_name\": {\n",
148
+ " \"default\": \"openai/gpt-4o-mini\",\n",
149
+ " \"title\": \"Generation Model Name\",\n",
150
+ " \"type\": \"string\"\n",
151
+ " },\n",
152
+ " \"generation_model_kwargs\": {\n",
153
+ " \"anyOf\": [\n",
154
+ " {\n",
155
+ " \"additionalProperties\": true,\n",
156
+ " \"type\": \"object\"\n",
157
+ " },\n",
158
+ " {\n",
159
+ " \"type\": \"null\"\n",
160
+ " }\n",
161
+ " ],\n",
162
+ " \"default\": null,\n",
163
+ " \"title\": \"Generation Model Kwargs\"\n",
164
+ " },\n",
165
+ " \"data_factory_config\": {\n",
166
+ " \"anyOf\": [\n",
167
+ " {\n",
168
+ " \"additionalProperties\": true,\n",
169
+ " \"type\": \"object\"\n",
170
+ " },\n",
171
+ " {\n",
172
+ " \"type\": \"null\"\n",
173
+ " }\n",
174
+ " ],\n",
175
+ " \"default\": {},\n",
176
+ " \"title\": \"Data Factory Config\"\n",
177
+ " }\n",
178
+ " },\n",
179
+ " \"required\": [\n",
180
+ " \"api_contract\"\n",
181
+ " ],\n",
182
+ " \"title\": \"GenerateFuncCallDataSet\",\n",
183
+ " \"type\": \"object\"\n",
184
+ "}\u001b[0m\n"
185
+ ]
186
+ }
187
+ ],
188
+ "source": [
189
+ "loaded.print_schema()"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 6,
195
+ "metadata": {},
196
+ "outputs": [
197
+ {
198
+ "name": "stdout",
199
+ "output_type": "stream",
200
+ "text": [
201
+ "\u001b[32m2025-05-23 11:09:02\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mHere is an example with api_contract.name as weather_api.get_current_weather\u001b[0m\n",
202
+ "\u001b[32m2025-05-23 11:09:02\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m{\n",
203
+ " \"num_records\": 4,\n",
204
+ " \"api_contract\": {\n",
205
+ " \"name\": \"weather_api.get_current_weather\",\n",
206
+ " \"description\": \"Retrieves the current weather conditions for a specified location .\",\n",
207
+ " \"parameters\": {\n",
208
+ " \"location\": {\n",
209
+ " \"type\": \"string\",\n",
210
+ " \"description\": \"The name of the city or geographic location .\",\n",
211
+ " \"required\": true\n",
212
+ " },\n",
213
+ " \"units\": {\n",
214
+ " \"type\": \"string\",\n",
215
+ " \"description\": \"The units for temperature measurement( e.g., 'Celsius', 'Fahrenheit') .\",\n",
216
+ " \"required\": false\n",
217
+ " }\n",
218
+ " }\n",
219
+ " },\n",
220
+ " \"topic_model_name\": \"openai/gpt-4\",\n",
221
+ " \"topic_model_kwargs\": {\n",
222
+ " \"temperature\": 0.7\n",
223
+ " },\n",
224
+ " \"generation_model_name\": \"openai/gpt-4o-mini\",\n",
225
+ " \"generation_model_kwargs\": {\n",
226
+ " \"temperature\": 0.8,\n",
227
+ " \"max_tokens\": 200\n",
228
+ " },\n",
229
+ " \"data_factory_config\": {\n",
230
+ " \"max_concurrency\": 24,\n",
231
+ " \"task_runner_timeout\": 120\n",
232
+ " }\n",
233
+ "}\u001b[0m\n"
234
+ ]
235
+ }
236
+ ],
237
+ "source": [
238
+ "loaded.print_example()"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": 5,
244
+ "metadata": {},
245
+ "outputs": [
246
+ {
247
+ "name": "stdout",
248
+ "output_type": "stream",
249
+ "text": [
250
+ "🌟 Function Calling Dataset Generation Pipeline\n",
251
+ "============================================================\n",
252
+ "📋 Process Overview:\n",
253
+ " 1. Calculate optimal data distribution\n",
254
+ " 2. Generate diverse topics\n",
255
+ " 3. Create subtopics for each topic\n",
256
+ " 4. Generate query-answer pairs\n",
257
+ " 5. Verify and validate generated data\n",
258
+ " 6. Regenerate failed cases\n",
259
+ "============================================================\n",
260
+ "📊 Data Distribution Plan:\n",
261
+ " • Requested: 10 records\n",
262
+ " • Distribution: 1 topics × 1 subtopics × 10 records\n",
263
+ " • Total generation: 10 records\n",
264
+ " • API calls needed: 3\n",
265
+ "\n",
266
+ "🎯 Step 1: Generating diverse topics...\n",
267
+ " ✅ Generated 1 topics\n",
268
+ "\n",
269
+ "🌿 Step 2: Creating subtopics for each topic...\n",
270
+ "\u001b[32m2025-05-23 00:27:04\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: e6763e50-6438-4df5-81a9-5a68ce3f8468\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
271
+ "\u001b[32m2025-05-23 00:27:04\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
272
+ "\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
273
+ " ✅ Generated 1 subtopics total\n",
274
+ "\n",
275
+ "💬 Step 3: Generating query-answer pairs...\n",
276
+ "\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 1931c5c8-c1f3-4268-98b7-1a5295b8abf2\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
277
+ "\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
278
+ "\u001b[32m2025-05-23 00:27:09\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
279
+ "\u001b[32m2025-05-23 00:27:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
280
+ "\u001b[32m2025-05-23 00:27:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
281
+ "\u001b[32m2025-05-23 00:27:18\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
282
+ "\u001b[32m2025-05-23 00:27:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
283
+ "\u001b[32m2025-05-23 00:27:24\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
284
+ "\u001b[32m2025-05-23 00:27:27\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
285
+ "\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
286
+ " ✅ Generated 10 initial query-answer pairs\n",
287
+ "\n",
288
+ "🔍 Step 4: Verifying data quality...\n",
289
+ "\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: f036c07c-1cd2-4690-be92-bac359e45544\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
290
+ "\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/10\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
291
+ "\u001b[32m2025-05-23 00:27:31\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/10\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
292
+ "\u001b[32m2025-05-23 00:27:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 9/10\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 9\u001b[0m (\u001b[32mCompleted: 9\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
293
+ "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 10/10\u001b[0m | \u001b[33mAttempted: 10\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
294
+ "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mCannot serialize function for resume due to unsupported type: cannot pickle '_hashlib.HMAC' object\u001b[0m\n",
295
+ " ✅ Quality check complete: 9 passed, 1 failed\n",
296
+ "\n",
297
+ "🔄 Step 5: Regenerating failed cases...\n",
298
+ "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 3d6183a2-e465-4807-9e18-cbb84dc0d28f\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
299
+ "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
300
+ "\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
301
+ "\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 8754bec6-25e3-40bd-9743-f2763fc1091f\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
302
+ "\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
303
+ "\u001b[32m2025-05-23 00:27:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
304
+ "\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
305
+ "\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mCannot serialize function for resume due to unsupported type: cannot pickle '_hashlib.HMAC' object\u001b[0m\n",
306
+ " ✅ Regenerated 1 pairs, 1 still failing\n",
307
+ "\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mSome data still failing after regeneration - prompts may need improvement\u001b[0m\n",
308
+ "🎯 Perfect! Generated exactly 10 records as requested\n",
309
+ "\n",
310
+ "🎉 Generation Complete!\n",
311
+ "============================================================\n",
312
+ "📈 Final Results:\n",
313
+ " • Records generated: 10\n",
314
+ " • Success rate: 10/10 (100.0%)\n",
315
+ " • Distribution used: 1T × 1S × 10R\n",
316
+ "\n",
317
+ "⭐ If you found this helpful, please consider starring our repo!\n",
318
+ " Your support means the world to us! 🌟\n",
319
+ "============================================================\n"
320
+ ]
321
+ }
322
+ ],
323
+ "source": [
324
+ "api_contract = {\n",
325
+ " \"name\": \"weather_api.get_current_weather\",\n",
326
+ " \"description\": \"Retrieves the current weather conditions for a specified location .\",\n",
327
+ " \"parameters\": {\n",
328
+ " \"location\": {\"type\": \"string\", \"description\": \"The name of the city or geographic location .\", \"required\": True},\n",
329
+ " \"units\": {\"type\": \"string\", \"description\": \"The units for temperature measurement( e.g., 'Celsius', 'Fahrenheit') .\", \"required\": False},\n",
330
+ " },\n",
331
+ " }\n",
332
+ "\n",
333
+ "data = await loaded.run(num_records=10, api_contract=api_contract)"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": 6,
339
+ "metadata": {},
340
+ "outputs": [
341
+ {
342
+ "data": {
343
+ "text/plain": [
344
+ "[{'query': 'Can you check the current weather in Toronto and Rome? Use Fahrenheit for both locations.',\n",
345
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
346
+ " 'arguments': {'location': 'Toronto', 'units': 'Fahrenheit'}},\n",
347
+ " {'name': 'weather_api.get_current_weather',\n",
348
+ " 'arguments': {'location': 'Rome', 'units': 'Fahrenheit'}}]},\n",
349
+ " {'query': 'Get me the current weather in Mumbai and also in Johannesburg, please use Fahrenheit for both.',\n",
350
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
351
+ " 'arguments': {'location': 'Mumbai', 'units': 'Fahrenheit'}},\n",
352
+ " {'name': 'weather_api.get_current_weather',\n",
353
+ " 'arguments': {'location': 'Johannesburg', 'units': 'Fahrenheit'}}]},\n",
354
+ " {'query': 'I need the current weather for Sydney and London. What are the temperatures in Celsius?',\n",
355
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
356
+ " 'arguments': {'location': 'Sydney', 'units': 'Celsius'}},\n",
357
+ " {'name': 'weather_api.get_current_weather',\n",
358
+ " 'arguments': {'location': 'London', 'units': 'Celsius'}}]},\n",
359
+ " {'query': 'Please find the current weather in Buenos Aires and Cape Town, using Celsius for Buenos Aires.',\n",
360
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
361
+ " 'arguments': {'location': 'Buenos Aires', 'units': 'Celsius'}},\n",
362
+ " {'name': 'weather_api.get_current_weather',\n",
363
+ " 'arguments': {'location': 'Cape Town'}}]},\n",
364
+ " {'query': 'What’s the weather like in Moscow? Also, can you get the current conditions in Beijing?',\n",
365
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
366
+ " 'arguments': {'location': 'Moscow'}},\n",
367
+ " {'name': 'weather_api.get_current_weather',\n",
368
+ " 'arguments': {'location': 'Beijing'}}]},\n",
369
+ " {'query': 'Can you tell me the current weather in Tokyo and in Los Angeles? Please provide both in Fahrenheit.',\n",
370
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
371
+ " 'arguments': {'location': 'Tokyo', 'units': 'Fahrenheit'}},\n",
372
+ " {'name': 'weather_api.get_current_weather',\n",
373
+ " 'arguments': {'location': 'Los Angeles', 'units': 'Fahrenheit'}}]},\n",
374
+ " {'query': 'Please provide the current weather for Berlin and Cairo, using Celsius for Berlin and no specific unit for Cairo.',\n",
375
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
376
+ " 'arguments': {'location': 'Berlin', 'units': 'Celsius'}},\n",
377
+ " {'name': 'weather_api.get_current_weather',\n",
378
+ " 'arguments': {'location': 'Cairo'}}]},\n",
379
+ " {'query': 'I need the current weather in Seattle and in Santiago. Use Fahrenheit for Seattle and Celsius for Santiago.',\n",
380
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
381
+ " 'arguments': {'location': 'Seattle', 'units': 'Fahrenheit'}},\n",
382
+ " {'name': 'weather_api.get_current_weather',\n",
383
+ " 'arguments': {'location': 'Santiago', 'units': 'Celsius'}}]},\n",
384
+ " {'query': \"What's the current temperature in San Francisco? Can you also check the weather in Paris?\",\n",
385
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
386
+ " 'arguments': {'location': 'San Francisco'}},\n",
387
+ " {'name': 'weather_api.get_current_weather',\n",
388
+ " 'arguments': {'location': 'Paris'}}]},\n",
389
+ " {'query': 'What is the current weather in New York City? And can you also provide the temperature in Celsius?',\n",
390
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
391
+ " 'arguments': {'location': 'New York City', 'units': 'Celsius'}}]}]"
392
+ ]
393
+ },
394
+ "execution_count": 6,
395
+ "metadata": {},
396
+ "output_type": "execute_result"
397
+ }
398
+ ],
399
+ "source": [
400
+ "data"
401
+ ]
402
+ }
403
+ ],
404
+ "metadata": {
405
+ "kernelspec": {
406
+ "display_name": ".venv",
407
+ "language": "python",
408
+ "name": "python3"
409
+ },
410
+ "language_info": {
411
+ "codemirror_mode": {
412
+ "name": "ipython",
413
+ "version": 3
414
+ },
415
+ "file_extension": ".py",
416
+ "mimetype": "text/x-python",
417
+ "name": "python",
418
+ "nbconvert_exporter": "python",
419
+ "pygments_lexer": "ipython3",
420
+ "version": "3.11.4"
421
+ }
422
+ },
423
+ "nbformat": 4,
424
+ "nbformat_minor": 2
425
+ }
prebuilt_template/generate_by_topic/README.md ADDED
@@ -0,0 +1,102 @@
1
+ # Generate by Topic Template
2
+ ## Overview
3
+ The `generate_by_topic` template is designed to create diverse synthetic data across multiple topics based on user instructions. It can automatically generate relevant topics if not provided and handles deduplication across generated content.
4
+
5
+ ## Key Features
6
+ - Automatic topic generation based on user instructions
7
+ - Customizable number of records and records per topic
8
+ - Built-in deduplication mechanism
9
+ - Flexible output schema configuration
10
+ - Parallel data generation with configurable concurrency
11
+
12
+ ## Input Schema
13
+ ```python
14
+ class GenerateByTopicInput(BaseModel):
15
+ user_instruction: Optional[str] = None
16
+ num_records: Optional[int] = 10
17
+ records_per_topic: int = 10
18
+ topics: Optional[List[Union[str, Dict[str, int]]]] = None
19
+ topic_model_name: str = "openai/gpt-4o-mini"
20
+ topic_model_kwargs: Optional[Dict[str, Any]] = None
21
+ generation_model_name: str = "openai/gpt-4o-mini"
22
+ generation_model_kwargs: Optional[Dict[str, Any]] = None
23
+ output_schema: Optional[Union[List[Dict[str, Any]], Dict[str, Any], type]] = [
24
+ {"name": "question", "type": "str"},
25
+ {"name": "answer", "type": "str"}
26
+ ]
27
+ data_factory_config: Optional[Dict[str, Any]] = {}
28
+ ```
29
+
30
+ ## Parameters
31
+ | Parameter | Type | Description | Default |
32
+ |-----------|------|-------------|---------|
33
+ | `user_instruction` | str | Instruction for data generation | None |
34
+ | `num_records` | int | Total number of records to generate | 10 |
35
+ | `records_per_topic` | int | Number of records per topic | 10 |
36
+ | `topics` | List[Union[str, Dict[str, int]]] | List of topics or topic with specific record count | None |
37
+ | `topic_model_name` | str | Model name for topic generation | "openai/gpt-4o-mini" |
38
+ | `topic_model_kwargs` | Dict[str, Any] | Additional parameters for topic model | None |
39
+ | `generation_model_name` | str | Model name for data generation | "openai/gpt-4o-mini" |
40
+ | `generation_model_kwargs` | Dict[str, Any] | Additional parameters for generation model | None |
41
+ | `output_schema` | Union[List[Dict[str, Any]], Dict[str, Any], type] | Schema for generated data | [{"name": "question", "type": "str"}, {"name": "answer", "type": "str"}] |
42
+ | `data_factory_config` | Dict[str, Any] | Configuration for data generation process | {} |
43
+
44
+ ## Example Usage
45
+ ```python
46
+ {
47
+ "user_instruction": "Generate Q&A pairs about machine learning concepts",
48
+ "num_records": 100,
49
+ "records_per_topic": 5,
50
+ "topics": [
51
+ "supervised learning",
52
+ "unsupervised learning",
53
+ {"reinforcement learning": 3},
54
+ "neural networks",
55
+ ],
56
+ "topic_model_name": "openai/gpt-4",
57
+ "topic_model_kwargs": {"temperature": 0.7},
58
+ "generation_model_name": "openai/gpt-4",
59
+ "generation_model_kwargs": {"temperature": 0.8, "max_tokens": 200},
60
+ "output_schema": [
61
+ {"name": "question", "type": "str"},
62
+ {"name": "answer", "type": "str"},
63
+ {"name": "difficulty", "type": "str"},
64
+ ],
65
+ "data_factory_config": {"max_concurrency": 4, "task_runner_timeout": 60 * 2},
66
+ }
67
+ ```
68
+
69
+ ## Workflow
70
+ 1. Topic Preparation:
71
+ - If topics are not provided, generates relevant topics based on user instruction
72
+ - Shuffles topics for better distribution and deduplication
73
+
74
+ 2. Data Generation:
75
+ - Generates data for each topic using the specified model
76
+ - Implements deduplication by tracking previously generated examples
77
+ - Adds topic information to each generated record
78
+
79
+ ## Output
80
+ The generated data will include:
81
+ - Fields specified in the output schema
82
+ - An additional `topic` field indicating the topic of each record
83
+
84
+ ## Dependencies
85
+ - `starfish` framework
86
+ - `pydantic` for input validation
87
+
88
+
89
+ ## Sample Run
90
+
91
+ Check out [`sample_run.ipynb`](./sample_run.ipynb) for a complete example you can run right away.
92
+
93
+ ## Source Implementation
94
+
95
+ The actual template code is located at:
96
+ ```
97
+ src/starfish/data_gen_template/templates/starfish/generate_by_topic/
98
+ ```
99
+
100
+ ---
101
+
102
+ **Try it out!** If you have any questions, let us know - we'd be happy to help. If you like this template, consider starring the repo and building your own! We welcome community contributions and are always happy to chat about new ideas. ⭐
prebuilt_template/generate_by_topic/sample_run.ipynb ADDED
@@ -0,0 +1,438 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from starfish import data_gen_template"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "data": {
19
+ "text/plain": [
20
+ "['starfish/generate_func_call_dataset', 'starfish/generate_by_topic']"
21
+ ]
22
+ },
23
+ "execution_count": 2,
24
+ "metadata": {},
25
+ "output_type": "execute_result"
26
+ }
27
+ ],
28
+ "source": [
29
+ "data_gen_template.list()"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 3,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "loaded = data_gen_template.get(\"starfish/generate_by_topic\")\n"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "markdown",
43
+ "metadata": {},
44
+ "source": [
45
+ "get the template input_data schema and example"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 4,
51
+ "metadata": {},
52
+ "outputs": [
53
+ {
54
+ "name": "stdout",
55
+ "output_type": "stream",
56
+ "text": [
57
+ "\u001b[32m2025-05-23 11:23:57\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mPlease run the template with this input schema\u001b[0m\n",
58
+ "\u001b[32m2025-05-23 11:23:57\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m{\n",
59
+ " \"description\": \"Input schema for the generate_by_topic template.\\n\\nIMPORTANT: This Pydantic model is the single source of truth for default values.\\nThe validation and default values are controlled by this model, not the function signature.\",\n",
60
+ " \"properties\": {\n",
61
+ " \"user_instruction\": {\n",
62
+ " \"anyOf\": [\n",
63
+ " {\n",
64
+ " \"type\": \"string\"\n",
65
+ " },\n",
66
+ " {\n",
67
+ " \"type\": \"null\"\n",
68
+ " }\n",
69
+ " ],\n",
70
+ " \"default\": null,\n",
71
+ " \"title\": \"User Instruction\"\n",
72
+ " },\n",
73
+ " \"num_records\": {\n",
74
+ " \"anyOf\": [\n",
75
+ " {\n",
76
+ " \"type\": \"integer\"\n",
77
+ " },\n",
78
+ " {\n",
79
+ " \"type\": \"null\"\n",
80
+ " }\n",
81
+ " ],\n",
82
+ " \"default\": 10,\n",
83
+ " \"title\": \"Num Records\"\n",
84
+ " },\n",
85
+ " \"records_per_topic\": {\n",
86
+ " \"default\": 10,\n",
87
+ " \"title\": \"Records Per Topic\",\n",
88
+ " \"type\": \"integer\"\n",
89
+ " },\n",
90
+ " \"topics\": {\n",
91
+ " \"anyOf\": [\n",
92
+ " {\n",
93
+ " \"items\": {\n",
94
+ " \"anyOf\": [\n",
95
+ " {\n",
96
+ " \"type\": \"string\"\n",
97
+ " },\n",
98
+ " {\n",
99
+ " \"additionalProperties\": {\n",
100
+ " \"type\": \"integer\"\n",
101
+ " },\n",
102
+ " \"type\": \"object\"\n",
103
+ " }\n",
104
+ " ]\n",
105
+ " },\n",
106
+ " \"type\": \"array\"\n",
107
+ " },\n",
108
+ " {\n",
109
+ " \"type\": \"null\"\n",
110
+ " }\n",
111
+ " ],\n",
112
+ " \"default\": null,\n",
113
+ " \"title\": \"Topics\"\n",
114
+ " },\n",
115
+ " \"topic_model_name\": {\n",
116
+ " \"default\": \"openai/gpt-4o-mini\",\n",
117
+ " \"title\": \"Topic Model Name\",\n",
118
+ " \"type\": \"string\"\n",
119
+ " },\n",
120
+ " \"topic_model_kwargs\": {\n",
121
+ " \"anyOf\": [\n",
122
+ " {\n",
123
+ " \"additionalProperties\": true,\n",
124
+ " \"type\": \"object\"\n",
125
+ " },\n",
126
+ " {\n",
127
+ " \"type\": \"null\"\n",
128
+ " }\n",
129
+ " ],\n",
130
+ " \"default\": null,\n",
131
+ " \"title\": \"Topic Model Kwargs\"\n",
132
+ " },\n",
133
+ " \"generation_model_name\": {\n",
134
+ " \"default\": \"openai/gpt-4o-mini\",\n",
135
+ " \"title\": \"Generation Model Name\",\n",
136
+ " \"type\": \"string\"\n",
137
+ " },\n",
138
+ " \"generation_model_kwargs\": {\n",
139
+ " \"anyOf\": [\n",
140
+ " {\n",
141
+ " \"additionalProperties\": true,\n",
142
+ " \"type\": \"object\"\n",
143
+ " },\n",
144
+ " {\n",
145
+ " \"type\": \"null\"\n",
146
+ " }\n",
147
+ " ],\n",
148
+ " \"default\": null,\n",
149
+ " \"title\": \"Generation Model Kwargs\"\n",
150
+ " },\n",
151
+ " \"output_schema\": {\n",
152
+ " \"anyOf\": [\n",
153
+ " {\n",
154
+ " \"items\": {\n",
155
+ " \"additionalProperties\": true,\n",
156
+ " \"type\": \"object\"\n",
157
+ " },\n",
158
+ " \"type\": \"array\"\n",
159
+ " },\n",
160
+ " {\n",
161
+ " \"additionalProperties\": true,\n",
162
+ " \"type\": \"object\"\n",
163
+ " },\n",
164
+ " {\n",
165
+ " \"type\": \"null\"\n",
166
+ " }\n",
167
+ " ],\n",
168
+ " \"default\": [\n",
169
+ " {\n",
170
+ " \"name\": \"question\",\n",
171
+ " \"type\": \"str\"\n",
172
+ " },\n",
173
+ " {\n",
174
+ " \"name\": \"answer\",\n",
175
+ " \"type\": \"str\"\n",
176
+ " }\n",
177
+ " ],\n",
178
+ " \"title\": \"Output Schema\"\n",
179
+ " },\n",
180
+ " \"data_factory_config\": {\n",
181
+ " \"anyOf\": [\n",
182
+ " {\n",
183
+ " \"additionalProperties\": true,\n",
184
+ " \"type\": \"object\"\n",
185
+ " },\n",
186
+ " {\n",
187
+ " \"type\": \"null\"\n",
188
+ " }\n",
189
+ " ],\n",
190
+ " \"default\": {},\n",
191
+ " \"title\": \"Data Factory Config\"\n",
192
+ " }\n",
193
+ " },\n",
194
+ " \"title\": \"GenerateByTopicInput\",\n",
195
+ " \"type\": \"object\"\n",
196
+ "}\u001b[0m\n"
197
+ ]
198
+ }
199
+ ],
200
+ "source": [
201
+ "loaded.print_schema()"
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": 5,
207
+ "metadata": {},
208
+ "outputs": [
209
+ {
210
+ "name": "stdout",
211
+ "output_type": "stream",
212
+ "text": [
213
+ "\u001b[32m2025-05-23 11:24:01\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mHere is an example with api_contract.name as weather_api.get_current_weather\u001b[0m\n",
214
+ "\u001b[32m2025-05-23 11:24:01\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m{\n",
215
+ " \"user_instruction\": \"Generate Q&A pairs about machine learning concepts\",\n",
216
+ " \"num_records\": 100,\n",
217
+ " \"records_per_topic\": 5,\n",
218
+ " \"topics\": [\n",
219
+ " \"supervised learning\",\n",
220
+ " \"unsupervised learning\",\n",
221
+ " {\"reinforcement learning\": 3}, # This means generate 3 records for this topic\n",
222
+ " \"neural networks\",\n",
223
+ " ],\n",
224
+ " \"topic_model_name\": \"openai/gpt-4\",\n",
225
+ " \"topic_model_kwargs\": {\"temperature\": 0.7},\n",
226
+ " \"generation_model_name\": \"openai/gpt-4\",\n",
227
+ " \"generation_model_kwargs\": {\"temperature\": 0.8, \"max_tokens\": 200},\n",
228
+ " \"output_schema\": [\n",
229
+ " {\"name\": \"question\", \"type\": \"str\"},\n",
230
+ " {\"name\": \"answer\", \"type\": \"str\"},\n",
231
+ " {\"name\": \"difficulty\", \"type\": \"str\"}, # Added an additional field\n",
232
+ " ],\n",
233
+ " \"data_factory_config\": {\"max_concurrency\": 4, \"task_runner_timeout\": 60 * 2},\n",
234
+ " }\u001b[0m\n"
235
+ ]
236
+ }
237
+ ],
238
+ "source": [
239
+ "loaded.print_example()"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": 5,
245
+ "metadata": {},
246
+ "outputs": [
247
+ {
248
+ "name": "stdout",
249
+ "output_type": "stream",
250
+ "text": [
251
+ "🌟 Function Calling Dataset Generation Pipeline\n",
252
+ "============================================================\n",
253
+ "📋 Process Overview:\n",
254
+ " 1. Calculate optimal data distribution\n",
255
+ " 2. Generate diverse topics\n",
256
+ " 3. Create subtopics for each topic\n",
257
+ " 4. Generate query-answer pairs\n",
258
+ " 5. Verify and validate generated data\n",
259
+ " 6. Regenerate failed cases\n",
260
+ "============================================================\n",
261
+ "📊 Data Distribution Plan:\n",
262
+ " • Requested: 10 records\n",
263
+ " • Distribution: 1 topics × 1 subtopics × 10 records\n",
264
+ " • Total generation: 10 records\n",
265
+ " • API calls needed: 3\n",
266
+ "\n",
267
+ "🎯 Step 1: Generating diverse topics...\n",
268
+ " ✅ Generated 1 topics\n",
269
+ "\n",
270
+ "🌿 Step 2: Creating subtopics for each topic...\n",
271
+ "\u001b[32m2025-05-23 00:27:04\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: e6763e50-6438-4df5-81a9-5a68ce3f8468\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
272
+ "\u001b[32m2025-05-23 00:27:04\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
273
+ "\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
274
+ " ✅ Generated 1 subtopics total\n",
275
+ "\n",
276
+ "💬 Step 3: Generating query-answer pairs...\n",
277
+ "\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 1931c5c8-c1f3-4268-98b7-1a5295b8abf2\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
278
+ "\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
279
+ "\u001b[32m2025-05-23 00:27:09\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
280
+ "\u001b[32m2025-05-23 00:27:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
281
+ "\u001b[32m2025-05-23 00:27:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
282
+ "\u001b[32m2025-05-23 00:27:18\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
283
+ "\u001b[32m2025-05-23 00:27:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
284
+ "\u001b[32m2025-05-23 00:27:24\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
285
+ "\u001b[32m2025-05-23 00:27:27\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
286
+ "\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
287
+ " ✅ Generated 10 initial query-answer pairs\n",
288
+ "\n",
289
+ "🔍 Step 4: Verifying data quality...\n",
290
+ "\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: f036c07c-1cd2-4690-be92-bac359e45544\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
291
+ "\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/10\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
292
+ "\u001b[32m2025-05-23 00:27:31\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/10\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
293
+ "\u001b[32m2025-05-23 00:27:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 9/10\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 9\u001b[0m (\u001b[32mCompleted: 9\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
294
+ "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 10/10\u001b[0m | \u001b[33mAttempted: 10\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
295
+ "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mCannot serialize function for resume due to unsupported type: cannot pickle '_hashlib.HMAC' object\u001b[0m\n",
296
+ " ✅ Quality check complete: 9 passed, 1 failed\n",
297
+ "\n",
298
+ "🔄 Step 5: Regenerating failed cases...\n",
299
+ "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 3d6183a2-e465-4807-9e18-cbb84dc0d28f\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
300
+ "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
301
+ "\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
302
+ "\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 8754bec6-25e3-40bd-9743-f2763fc1091f\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
303
+ "\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
304
+ "\u001b[32m2025-05-23 00:27:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
305
+ "\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
306
+ "\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mCannot serialize function for resume due to unsupported type: cannot pickle '_hashlib.HMAC' object\u001b[0m\n",
307
+ " ✅ Regenerated 1 pairs, 1 still failing\n",
308
+ "\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mSome data still failing after regeneration - prompts may need improvement\u001b[0m\n",
309
+ "🎯 Perfect! Generated exactly 10 records as requested\n",
310
+ "\n",
311
+ "🎉 Generation Complete!\n",
312
+ "============================================================\n",
313
+ "📈 Final Results:\n",
314
+ " • Records generated: 10\n",
315
+ " • Success rate: 10/10 (100.0%)\n",
316
+ " • Distribution used: 1T × 1S × 10R\n",
317
+ "\n",
318
+ "⭐ If you found this helpful, please consider starring our repo!\n",
319
+ " Your support means the world to us! 🌟\n",
320
+ "============================================================\n"
321
+ ]
322
+ }
323
+ ],
324
+ "source": [
325
+ "input_data = {\n",
326
+ " \"user_instruction\": \"Generate Q&A pairs about machine learning concepts\",\n",
327
+ " \"num_records\": 100,\n",
328
+ " \"records_per_topic\": 5,\n",
329
+ " \"topics\": [\n",
330
+ " \"supervised learning\",\n",
331
+ " \"unsupervised learning\",\n",
332
+ " {\"reinforcement learning\": 3}, # This means generate 3 records for this topic\n",
333
+ " \"neural networks\",\n",
334
+ " ],\n",
335
+ " \"topic_model_name\": \"openai/gpt-4\",\n",
336
+ " \"topic_model_kwargs\": {\"temperature\": 0.7},\n",
337
+ " \"generation_model_name\": \"openai/gpt-4\",\n",
338
+ " \"generation_model_kwargs\": {\"temperature\": 0.8, \"max_tokens\": 200},\n",
339
+ " \"output_schema\": [\n",
340
+ " {\"name\": \"question\", \"type\": \"str\"},\n",
341
+ " {\"name\": \"answer\", \"type\": \"str\"},\n",
342
+ " {\"name\": \"difficulty\", \"type\": \"str\"}, # Added an additional field\n",
343
+ " ],\n",
344
+ " \"data_factory_config\": {\"max_concurrency\": 4, \"task_runner_timeout\": 60 * 2},\n",
345
+ " }\n",
346
+ "data = await loaded.run(input_data=input_data)"
347
+ ]
348
+ },
349
+ {
350
+ "cell_type": "code",
351
+ "execution_count": 6,
352
+ "metadata": {},
353
+ "outputs": [
354
+ {
355
+ "data": {
356
+ "text/plain": [
357
+ "[{'query': 'Can you check the current weather in Toronto and Rome? Use Fahrenheit for both locations.',\n",
358
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
359
+ " 'arguments': {'location': 'Toronto', 'units': 'Fahrenheit'}},\n",
360
+ " {'name': 'weather_api.get_current_weather',\n",
361
+ " 'arguments': {'location': 'Rome', 'units': 'Fahrenheit'}}]},\n",
362
+ " {'query': 'Get me the current weather in Mumbai and also in Johannesburg, please use Fahrenheit for both.',\n",
363
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
364
+ " 'arguments': {'location': 'Mumbai', 'units': 'Fahrenheit'}},\n",
365
+ " {'name': 'weather_api.get_current_weather',\n",
366
+ " 'arguments': {'location': 'Johannesburg', 'units': 'Fahrenheit'}}]},\n",
367
+ " {'query': 'I need the current weather for Sydney and London. What are the temperatures in Celsius?',\n",
368
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
369
+ " 'arguments': {'location': 'Sydney', 'units': 'Celsius'}},\n",
370
+ " {'name': 'weather_api.get_current_weather',\n",
371
+ " 'arguments': {'location': 'London', 'units': 'Celsius'}}]},\n",
372
+ " {'query': 'Please find the current weather in Buenos Aires and Cape Town, using Celsius for Buenos Aires.',\n",
373
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
374
+ " 'arguments': {'location': 'Buenos Aires', 'units': 'Celsius'}},\n",
375
+ " {'name': 'weather_api.get_current_weather',\n",
376
+ " 'arguments': {'location': 'Cape Town'}}]},\n",
377
+ " {'query': 'What’s the weather like in Moscow? Also, can you get the current conditions in Beijing?',\n",
378
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
379
+ " 'arguments': {'location': 'Moscow'}},\n",
380
+ " {'name': 'weather_api.get_current_weather',\n",
381
+ " 'arguments': {'location': 'Beijing'}}]},\n",
382
+ " {'query': 'Can you tell me the current weather in Tokyo and in Los Angeles? Please provide both in Fahrenheit.',\n",
383
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
384
+ " 'arguments': {'location': 'Tokyo', 'units': 'Fahrenheit'}},\n",
385
+ " {'name': 'weather_api.get_current_weather',\n",
386
+ " 'arguments': {'location': 'Los Angeles', 'units': 'Fahrenheit'}}]},\n",
387
+ " {'query': 'Please provide the current weather for Berlin and Cairo, using Celsius for Berlin and no specific unit for Cairo.',\n",
388
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
389
+ " 'arguments': {'location': 'Berlin', 'units': 'Celsius'}},\n",
390
+ " {'name': 'weather_api.get_current_weather',\n",
391
+ " 'arguments': {'location': 'Cairo'}}]},\n",
392
+ " {'query': 'I need the current weather in Seattle and in Santiago. Use Fahrenheit for Seattle and Celsius for Santiago.',\n",
393
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
394
+ " 'arguments': {'location': 'Seattle', 'units': 'Fahrenheit'}},\n",
395
+ " {'name': 'weather_api.get_current_weather',\n",
396
+ " 'arguments': {'location': 'Santiago', 'units': 'Celsius'}}]},\n",
397
+ " {'query': \"What's the current temperature in San Francisco? Can you also check the weather in Paris?\",\n",
398
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
399
+ " 'arguments': {'location': 'San Francisco'}},\n",
400
+ " {'name': 'weather_api.get_current_weather',\n",
401
+ " 'arguments': {'location': 'Paris'}}]},\n",
402
+ " {'query': 'What is the current weather in New York City? And can you also provide the temperature in Celsius?',\n",
403
+ " 'answer': [{'name': 'weather_api.get_current_weather',\n",
404
+ " 'arguments': {'location': 'New York City', 'units': 'Celsius'}}]}]"
405
+ ]
406
+ },
407
+ "execution_count": 6,
408
+ "metadata": {},
409
+ "output_type": "execute_result"
410
+ }
411
+ ],
412
+ "source": [
413
+ "data"
414
+ ]
415
+ }
416
+ ],
417
+ "metadata": {
418
+ "kernelspec": {
419
+ "display_name": ".venv",
420
+ "language": "python",
421
+ "name": "python3"
422
+ },
423
+ "language_info": {
424
+ "codemirror_mode": {
425
+ "name": "ipython",
426
+ "version": 3
427
+ },
428
+ "file_extension": ".py",
429
+ "mimetype": "text/x-python",
430
+ "name": "python",
431
+ "nbconvert_exporter": "python",
432
+ "pygments_lexer": "ipython3",
433
+ "version": "3.11.4"
434
+ }
435
+ },
436
+ "nbformat": 4,
437
+ "nbformat_minor": 2
438
+ }
pyproject.toml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "starfish-core"
3
+ version = "0.1.3"
4
+ description = ""
5
+ authors = ["Starfish AI Inc."]
6
+ readme = "README.md"
7
+ packages = [
8
+ {include = "starfish", from = "src"}
9
+ ]
10
+
11
+ [tool.poetry.dependencies]
12
+ python = ">=3.10,<4.0"
13
+ litellm = ">=1.65.1,<2.0.0"
14
+ fastapi = ">=0.95.0"
15
+ loguru = ">=0.7.3,<0.8.0"
16
+ cachetools = ">=5.5.2,<6.0.0"
17
+ ollama = ">=0.4.7,<0.5.0"
18
+ python-dotenv = ">=1.1.0,<2.0.0"
19
+ aiosqlite = ">=0.21.0,<0.22.0"
20
+ aiofiles = ">=24.1.0,<25.0.0"
21
+ typing-extensions = ">=4.0.0,<5.0.0"
22
+ posthog = "^3.11.0"
23
+ cloudpickle = "^2.2.0"
24
+ datasets = "3.6.0"
25
+ psutil = ">=7.0.0,<8.0.0"
26
+ nest_asyncio = "^1.6.0"
27
+ docstring_parser = "^0.16.0"
28
+ mcp = "^1.8.1"
29
+ # Force cryptography >=44.0.1 due to transitive security vulnerability
30
+ # See: https://openssl-library.org/news/secadv/20250211.txt
31
+ cryptography = ">=44.0.1"
32
+ # Embedding dependencies
33
+ faiss-cpu = "^1.7.4"
34
+ sentence-transformers = "^4.1.0"
35
+ unstructured = { version = "^0.10.0", extras = ["pdf"], optional = true }
36
+ python-docx = { version = "*", optional = true }
37
+ python-pptx = { version = "*", optional = true }
38
+ openpyxl = { version = "*", optional = true }
39
+ pytube = { version = "^15.0.0", optional = true }
40
+ youtube-transcript-api = { version = "^0.6.1", optional = true }
41
+ pdfminer_six = { version = "^20250506", optional = true }
42
+
43
+ # Add optional dependencies for parsers
44
+ [tool.poetry.extras]
45
+ docx = ["python-docx"]
46
+ ppt = ["python-pptx"]
47
+ excel = ["openpyxl"]
48
+ youtube = ["pytube", "youtube-transcript-api"]
49
+ pdf = ["pdfminer_six"]
50
+ unstructured = ["unstructured"]
51
+ all = [
52
+ "python-docx",
53
+ "python-pptx",
54
+ "openpyxl",
55
+ "pytube",
56
+ "youtube-transcript-api",
57
+ "pdfminer_six",
58
+ "unstructured",
59
+ ]
60
+
61
+ [build-system]
62
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
63
+ build-backend = "poetry.core.masonry.api"
64
+
65
+ [tool.poetry.group.dev.dependencies]
66
+ ipykernel = "^6.29.5"
67
+ twine = "^5.0.0"
68
+ ruff = "^0.8.6"
69
+ vcrpy = "^7.0.0"
70
+ isort = "^5.13.2"
71
+ pre-commit = "^4.0.1"
72
+ pytest = "^8.3.3"
73
+ pytest-asyncio = "^0.24.0"
74
+ pytest-dependency = "^0.6.0"
75
+ pytest-timeout = "^2.3.1"
76
+ pytest-cov = "^6.0.0"
77
+ nbval = "^0.11.0"
78
+
79
+
80
+ [tool.poetry.scripts]
81
+ starfish = "starfish.api.cli:main"
82
+ data-template = "src.starfish.data_gen_template.cli:main"
83
+
84
+
85
+ [tool.ruff]
86
+ line-length = 160
87
+
88
+ # Auto-fix settings
89
+ fix = true
90
+ unsafe-fixes = true
91
+
92
+ [tool.ruff.lint]
93
+ select = [
94
+ "E", # pycodestyle errors
95
+ "W", # pycodestyle warnings
96
+ "F", # pyflakes
97
+ "F401", # Unused imports
98
+ "I", # isort
99
+ "B", # flake8-bugbear
100
+ "C4", # flake8-comprehensions
101
+ "N", # PEP8 naming convetions
102
+ "D" # pydocstyle
103
+ ]
104
+ ignore = [
105
+ "D100", # Remove this eventually
106
+ "C901", # too complex
107
+ "W191", # indentation contains tabs
108
+ "D401", # imperative mood
109
+ "N806", # uppercase variable names, for example, "API_KEY"
110
+ ]
111
+ exclude = [
112
+ ".git",
113
+ "__pycache__",
114
+ "venv",
115
+ "build",
116
+ "dist",
117
+ ]
118
+
119
+ [tool.ruff.lint.per-file-ignores]
120
+ "tests/**/*" = ["D"] # ignore tests for now
121
+
122
+ [tool.ruff.lint.pydocstyle]
123
+ convention = "google"
124
+ [tool.isort]
125
+ profile = "black"
126
+ line_length = 88
127
+
128
+ [tool.pytest.ini_options]
129
+ asyncio_mode = "strict"
130
+ asyncio_default_fixture_loop_scope = "function"
131
+
132
+
pytest.ini ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [pytest]
2
+ asyncio_mode = auto
3
+ timeout = 300
4
+ timeout_method = thread
5
+ norecursedirs = .ipynb_checkpoints
6
+ python_files = test_*.py
7
+ addopts = --ignore=tests/data_factory/factory/data_factory.ipynb
readme-web.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #### Step 2: Start the Backend
3
+
4
+ ```bash
5
+ # Install Python dependencies
6
+ pip install -r api/requirements.txt
7
+
8
+ # Start the API server
9
+ python -m web.api.main
10
+ ```
11
+
12
+ #### Step 3: Start the Frontend
13
+
14
+ ```bash
15
+ # Start the dev server
16
+ npm run dev
17
+ ```
18
+
19
+ #### Step 4: Debug the Frontend
20
+
21
+ ```bash
22
+ NODE_OPTIONS='--inspect' npm run dev
23
+ ```
scripts/hug_push.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ git push hug vam:main
scripts/rag.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # warning
2
+ import warnings
3
+
4
+ warnings.filterwarnings("ignore")
5
+
6
+ import os
7
+ from together import Together
8
+ import faiss
9
+
10
+ from sentence_transformers import SentenceTransformer
11
+
12
+ """
13
+ Do these steps:
14
+ 1) Set up a Together API key from https://together.ai/
15
+ """
16
+ together_api_key = os.environ.get("TOGETHER_API_KEY")
17
+
18
+
19
+ def run_rag(data_dict: dict, prompt: str):
20
+ """
21
+ Run RAG system: process documents, create embeddings, search, and generate answer.
22
+
23
+ """
24
+
25
+ # Stage 0: Initialize Together AI client for LLM completions
26
+ client = Together(api_key=together_api_key)
27
+
28
+ # Stage 1: Load sentence transformer model for creating embeddings
29
+ # ------------------------------------------------------------
30
+ embedding_model = SentenceTransformer(
31
+ "sentence-transformers/all-MiniLM-L6-v2",
32
+ use_auth_token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
33
+ )
34
+
35
+ # Stage 2: Process documents into Vector Database
36
+ # ------------------------------------------------------------
37
+ documents = []
38
+ filenames = []
39
+
40
+ print(f"Processing {len(data_dict)} documents...")
41
+ for key, content in data_dict.items():
42
+ content = content.strip()
43
+ if content: # Only add non-empty documents
44
+ documents.append(content)
45
+ filenames.append(key)
46
+ print(f"✅ Loaded: {key}")
47
+
48
+ if not documents:
49
+ return "No valid documents found in data dictionary!"
50
+
51
+ # Create embeddings for all documents
52
+ print("Creating embeddings...")
53
+ embeddings = embedding_model.encode(documents)
54
+
55
+ # Set up FAISS index for similarity search
56
+ dimension = embeddings.shape[1]
57
+ index = faiss.IndexFlatIP(dimension)
58
+
59
+ # Normalize embeddings for cosine similarity
60
+ faiss.normalize_L2(embeddings)
61
+ index.add(embeddings)
62
+
63
+ print(f"✅ RAG system ready with {len(documents)} documents!")
64
+
65
+ # Stage 3: Retrieve relevant documents
66
+ # ------------------------------------------------------------
67
+ query_embedding = embedding_model.encode([prompt])
68
+ faiss.normalize_L2(query_embedding)
69
+
70
+ # Get top similar documents
71
+ scores, indices = index.search(query_embedding, min(3, len(documents)))
72
+
73
+ # Stage 4: Build context from retrieved documents
74
+ # ------------------------------------------------------------
75
+ relevant_docs = []
76
+ context_parts = []
77
+
78
+ for score, idx in zip(scores[0], indices[0]):
79
+ if idx < len(documents):
80
+ doc_info = {
81
+ "content": documents[idx],
82
+ "filename": filenames[idx],
83
+ "score": float(score),
84
+ }
85
+ relevant_docs.append(doc_info)
86
+ context_parts.append(f"[{doc_info['filename']}]\n{doc_info['content']}")
87
+
88
+ if not relevant_docs:
89
+ return "No relevant documents found for the query."
90
+
91
+ # Combine context
92
+ context = "\n\n".join(context_parts)
93
+
94
+ # Stage 5: Augment by running the LLM to generate an answer
95
+ # ------------------------------------------------------------
96
+ llm_prompt = f"""Answer the question based on the provided context documents.
97
+
98
+ Context:
99
+ {context}
100
+
101
+ Question: {prompt}
102
+
103
+ Instructions:
104
+ - Answer based only on the information in the context
105
+ - Answer should be at least 10 words and at most 20 words
106
+ - If the context doesn't contain enough information, say so
107
+ - Mention which document(s) you're referencing
108
+ - Start with According to [document name]
109
+ - Add brackets to the document name
110
+
111
+
112
+ Answer:"""
113
+
114
+ try:
115
+ # Generate answer using Together AI
116
+ response = client.chat.completions.create(
117
+ model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
118
+ messages=[{"role": "user", "content": llm_prompt}],
119
+ max_tokens=500,
120
+ temperature=0.7,
121
+ )
122
+ answer = response.choices[0].message.content
123
+
124
+ # Display source information
125
+ print(f"\n📚 Most relevant source:")
126
+ for doc in relevant_docs:
127
+ print(f" • {doc['filename']} (similarity: {doc['score']:.3f})")
128
+
129
+ # Add source information to the answer
130
+ sources_list = [doc["filename"] for doc in relevant_docs]
131
+ sources_text = sources_list[0]
132
+ full_answer = f"{answer}\n\n📄 Source Used: {sources_text}"
133
+
134
+ return full_answer
135
+
136
+ except Exception as e:
137
+ return f"Error generating answer: {str(e)}"
138
+
139
+
140
+ if __name__ == "__main__":
141
+ # Load dataset
142
+ data_dict = {
143
+ "octopus_facts": "Octopuses have three hearts and blue blood. Two hearts pump blood to the gills, while the third pumps blood to the rest of the body. Their blood is blue because it contains copper-based hemocyanin instead of iron-based hemoglobin.",
144
+ "honey_facts": "Honey never spoils. Archaeologists have found pots of honey in ancient Egyptian tombs that are over 3,000 years old and still perfectly edible. This is because honey has natural antibacterial properties and very low water content.",
145
+ "space_facts": "A day on Venus is longer than its year. Venus takes 243 Earth days to rotate once on its axis, but only 225 Earth days to orbit the Sun. This means a Venusian day is longer than a Venusian year.",
146
+ "banana_facts": "Bananas are berries, but strawberries aren't. Botanically speaking, berries must have seeds inside their flesh. Bananas qualify, but strawberries have seeds on the outside, making them aggregate fruits.",
147
+ "shark_facts": "Sharks have been around longer than trees. Sharks first appeared around 400 million years ago, while the earliest trees appeared around 350 million years ago. This means sharks pre-date trees by about 50 million years.",
148
+ "penguin_facts": "Emperor penguins can hold their breath for over 20 minutes and dive to depths of over 500 meters while hunting for fish. They have special adaptations including collapsible lungs and the ability to slow their heart rate.",
149
+ "human_brain": "Your brain uses about 20% of your body's total energy despite being only 2% of your body weight. It consumes roughly 320 calories per day, which is equivalent to eating about 320 M&Ms.",
150
+ }
151
+
152
+ question = "What is interesting about a banana?"
153
+ answer = run_rag(data_dict, question)
154
+ print(f"\n🤖 Answer: {answer}\n")
155
+ print("-" * 50)
src/starfish/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Starfish Core - A framework for structured data processing and LLM integration.
2
+
3
+ Provides core components for:
4
+ - StructuredLLM: Interface for working with large language models
5
+ - data_factory: Factory pattern for creating and managing data pipelines
6
+ """
7
+
8
+ # Expose core components directly for easy access
9
+ from .data_factory.factory import data_factory
10
+ from .llm.structured_llm import StructuredLLM
11
+ from .data_gen_template.core import data_gen_template
12
+
13
+ # Define what 'from starfish import *' imports (good practice)
14
+ __all__ = [
15
+ "StructuredLLM",
16
+ "data_factory",
17
+ "data_gen_template",
18
+ ]
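For orientation, here is a minimal sketch of how the `StructuredLLM` export above is used later in this commit (see `prepare_topic.py`); the prompt, output schema, and model name are illustrative assumptions:

```python
import asyncio

from starfish import StructuredLLM


async def main():
    # Illustrative prompt and schema, mirroring the pattern used in prepare_topic.py.
    qa_llm = StructuredLLM(
        model_name="openai/gpt-4o-mini",
        prompt="Write one question and answer about {{topic}}",
        output_schema=[{"name": "question", "type": "str"}, {"name": "answer", "type": "str"}],
        model_kwargs={"temperature": 1},
    )
    response = await qa_llm.run(topic="supervised learning")
    print(response.data)  # list of dicts matching the output schema


asyncio.run(main())
```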
src/starfish/common/env_loader.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Environment variable loader utility.
2
+
3
+ This module provides functionality to load environment variables from a .env file
4
+ in non-production environments. In production, environment variables should be
5
+ set through the system/platform instead of using .env files for security reasons.
6
+
7
+ Uses python-dotenv for loading environment variables from .env files.
8
+ """
9
+
10
+ import os
11
+ from typing import Optional
12
+
13
+ # Import python-dotenv
14
+ from dotenv import dotenv_values
15
+ from dotenv import find_dotenv as dotenv_find_dotenv
16
+ from dotenv import load_dotenv as dotenv_load_dotenv
17
+
18
+ from starfish.common.logger import get_logger
19
+
20
+ logger = get_logger(__name__)
21
+
22
+
23
+ def load_env_file(env_path: Optional[str] = None, override: bool = False) -> bool:
24
+ """Load environment variables from .env file for non-production environments.
25
+
26
+ Args:
27
+ env_path: Path to the .env file. If None, looks for .env file in the current
28
+ working directory and parent directories.
29
+ override: Whether to override existing environment variables. Default is False.
30
+
31
+ Returns:
32
+ True if environment variables were loaded, False otherwise.
33
+ """
34
+ # Skip loading in production environments
35
+ if os.getenv("ENV") == "PROD":
36
+ logger.info("Production environment detected. Skipping .env file loading.")
37
+
38
+ # Find the .env file if path not provided
39
+ if env_path is None:
40
+ env_path = dotenv_find_dotenv(usecwd=True)
41
+ if not env_path:
42
+ logger.warning("No .env file found in the current or parent directories.")
43
+
44
+ # Load environment variables
45
+ loaded = dotenv_load_dotenv(dotenv_path=env_path, override=override)
46
+
47
+ if loaded:
48
+ # Get the loaded variables to count and log them
49
+ loaded_vars = dotenv_values(env_path)
50
+ logger.debug(f"Loaded {len(loaded_vars)} environment variables from {env_path}")
51
+ else:
52
+ logger.warning(f"Failed to load environment variables from {env_path}")
src/starfish/common/exceptions.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import os
3
+ import traceback
4
+ import uuid
5
+ from typing import Any, Dict, Optional, Tuple
6
+
7
+ from pydantic import BaseModel, Field, ValidationError
8
+
9
+ from starfish.common.logger import get_logger
10
+
11
+ logger = get_logger(__name__)
12
+
13
+ # Simple configuration flag (can be set from app config)
14
+ # Default to False for production safety
15
+ INCLUDE_TRACEBACK_IN_RESPONSE = os.environ.get("INCLUDE_TRACEBACK_IN_RESPONSE", "false").lower() in ("true", "1", "yes")
16
+
17
+ #############################################
18
+ # HTTP Status Codes
19
+ #############################################
20
+
21
+
22
+ class HTTPStatus:
23
+ """Standard HTTP status codes."""
24
+
25
+ OK = 200
26
+ BAD_REQUEST = 400
27
+ UNAUTHORIZED = 401
28
+ FORBIDDEN = 403
29
+ NOT_FOUND = 404
30
+ UNPROCESSABLE_ENTITY = 422
31
+ INTERNAL_SERVER_ERROR = 500
32
+
33
+
34
+ #############################################
35
+ # Error Response Model
36
+ #############################################
37
+
38
+
39
+ class ErrorResponse(BaseModel):
40
+ """Standardized error response format for API errors."""
41
+
42
+ status: str = "error"
43
+ error_id: str = Field(..., description="Unique identifier for this error occurrence")
44
+ message: str
45
+ error_type: str
46
+ details: Optional[Dict[str, Any]] = None
47
+
48
+
49
+ #############################################
50
+ # Exception Classes
51
+ #############################################
52
+
53
+
54
+ class StarfishException(Exception):
55
+ """Base exception for all Starfish exceptions."""
56
+
57
+ status_code: int = HTTPStatus.INTERNAL_SERVER_ERROR
58
+ default_message: str = "An unexpected error occurred"
59
+
60
+ def __init__(self, message: Optional[str] = None, details: Optional[Dict[str, Any]] = None):
61
+ self.message = message or self.default_message
62
+ self.details = details
63
+ self.error_id = str(uuid.uuid4())
64
+ super().__init__(self.message)
65
+
66
+ def __str__(self):
67
+ if self.details:
68
+ return f"{self.message} - Details: {self.details}"
69
+ return self.message
70
+
71
+
72
+ class ValidationError(StarfishException):
73
+ """Exception raised for validation errors."""
74
+
75
+ status_code = HTTPStatus.UNPROCESSABLE_ENTITY
76
+ default_message = "Validation error"
77
+
78
+
79
+ class PydanticValidationError(ValidationError):
80
+ """Exception raised for Pydantic validation errors.
81
+
82
+ This class formats Pydantic validation errors into user-friendly messages
83
+ and preserves the detailed error information for debugging.
84
+ """
85
+
86
+ default_message = "Data validation error"
87
+
88
+ @staticmethod
89
+ def format_validation_error(error: ValidationError) -> Tuple[str, Dict[str, Any]]:
90
+ """Format a Pydantic ValidationError into a user-friendly message and details.
91
+
92
+ Args:
93
+ error: The Pydantic ValidationError to format
94
+
95
+ Returns:
96
+ Tuple of (message, details)
97
+ """
98
+ if not hasattr(error, "errors") or not callable(getattr(error, "errors", None)):
99
+ return str(error), {}
100
+
101
+ error_details = error.errors()
102
+ if not error_details:
103
+ return "Validation error", {}
104
+
105
+ # Format fields with errors
106
+ field_errors = []
107
+ for err in error_details:
108
+ # Get error type and location
109
+ err_type = err.get("type", "unknown")
110
+ loc = err.get("loc", [])
111
+
112
+ # Special handling for discriminated unions
113
+ # If first element is a string and subsequent elements exist, might be a discriminated union
114
+ if len(loc) >= 2 and isinstance(loc[0], str) and isinstance(loc[1], str):
115
+ # This might be a discriminated union error like ['vanilla', 'user_input']
116
+ type_name = loc[0]
117
+ field_name = loc[1]
118
+
119
+ # Handle errors differently based on type
120
+ if err_type == "missing":
121
+ field_errors.append(f"Field '{field_name}' is required for '{type_name}' type")
122
+ continue
123
+
124
+ # Standard handling for other errors
125
+ loc_str = ".".join(str(item) for item in loc) if loc else "unknown"
126
+ msg = err.get("msg", "")
127
+
128
+ # Create a user-friendly error message based on error type
129
+ if err_type == "missing":
130
+ field_errors.append(f"'{loc_str}' is required")
131
+ elif err_type == "type_error":
132
+ field_errors.append(f"'{loc_str}' has an invalid type")
133
+ elif err_type == "value_error":
134
+ field_errors.append(f"'{loc_str}' has an invalid value")
135
+ elif err_type.startswith("value_error"):
136
+ field_errors.append(f"'{loc_str}' {msg}")
137
+ elif err_type.startswith("type_error"):
138
+ field_errors.append(f"'{loc_str}' {msg}")
139
+ elif err_type == "extra_forbidden":
140
+ field_errors.append(f"'{loc_str}' is not allowed")
141
+ else:
142
+ field_errors.append(f"'{loc_str}': {msg}")
143
+
144
+ # Create a combined message
145
+ if len(field_errors) == 1:
146
+ message = f"Validation error: {field_errors[0]}"
147
+ else:
148
+ message = f"Validation errors: {', '.join(field_errors)}"
149
+
150
+ return message, {"validation_errors": error_details}
151
+
152
+ def __init__(self, validation_error: ValidationError, message: Optional[str] = None, details: Optional[Dict[str, Any]] = None):
153
+ # Format the validation error if no message is provided
154
+ if message is None:
155
+ message, error_details = self.format_validation_error(validation_error)
156
+
157
+ # Merge error details with provided details
158
+ if details is None:
159
+ details = error_details
160
+ else:
161
+ details = {**details, **error_details}
162
+
163
+ super().__init__(message=message, details=details)
164
+
165
+
166
+ class ParserError(StarfishException):
167
+ """Base exception for all parser-related errors."""
168
+
169
+ status_code = HTTPStatus.UNPROCESSABLE_ENTITY
170
+ default_message = "Parser error"
171
+
172
+
173
+ class JsonParserError(ParserError):
174
+ """Exception raised when JSON parsing fails."""
175
+
176
+ default_message = "JSON parsing error"
177
+
178
+
179
+ class SchemaValidationError(ParserError):
180
+ """Exception raised when data doesn't conform to schema."""
181
+
182
+ default_message = "Schema validation error"
183
+
184
+ def __str__(self):
185
+ if self.details and "errors" in self.details:
186
+ errors_text = "\n".join([f"- {err}" for err in self.details["errors"]])
187
+ return f"{self.message}:\n{errors_text}"
188
+ return super().__str__()
189
+
190
+
191
+ class PydanticParserError(ParserError):
192
+ """Exception raised when Pydantic parsing or validation fails."""
193
+
194
+ default_message = "Pydantic parsing error"
195
+
196
+
197
+ #############################################
198
+ # Error Handling Functions
199
+ #############################################
200
+
201
+
202
+ def format_error(exc: Exception, include_traceback: bool = INCLUDE_TRACEBACK_IN_RESPONSE) -> Tuple[ErrorResponse, int]:
203
+ """Format an exception into a standardized error response.
204
+
205
+ Args:
206
+ exc: The exception to format
207
+ include_traceback: Whether to include traceback in the response details
208
+
209
+ Returns:
210
+ Tuple of (error_response, status_code)
211
+ """
212
+ # Get traceback for logging (always) - may optionally include in response
213
+ tb_str = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
214
+
215
+ # Check for exception chaining
216
+ cause = getattr(exc, "__cause__", None)
217
+ cause_tb = None
218
+ if cause:
219
+ cause_tb = "".join(traceback.format_exception(type(cause), cause, cause.__traceback__))
220
+ logger.error(f"Original exception: {type(cause).__name__}: {str(cause)}")
221
+ logger.error(f"Original traceback: {cause_tb}")
222
+
223
+ # Log the current exception
224
+ logger.error(f"Exception: {type(exc).__name__}: {str(exc)}")
225
+ logger.error(f"Traceback: {tb_str}")
226
+
227
+ # Handle Starfish exceptions
228
+ if isinstance(exc, StarfishException):
229
+ error_id = getattr(exc, "error_id", str(uuid.uuid4()))
230
+ status_code = exc.status_code
231
+ details = exc.details or {}
232
+
233
+ # Only add traceback to details if requested
234
+ if include_traceback:
235
+ details["traceback"] = tb_str
236
+ if cause_tb:
237
+ details["original_traceback"] = cause_tb
238
+
239
+ return ErrorResponse(error_id=error_id, message=exc.message, error_type=type(exc).__name__, details=details if details else None), status_code
240
+
241
+ # Handle Pydantic validation errors
242
+ elif isinstance(exc, ValidationError):
243
+ error_id = str(uuid.uuid4())
244
+ status_code = HTTPStatus.UNPROCESSABLE_ENTITY
245
+ details = {"validation_errors": exc.errors()}
246
+
247
+ if include_traceback:
248
+ details["traceback"] = tb_str
249
+ if cause_tb:
250
+ details["original_traceback"] = cause_tb
251
+
252
+ return ErrorResponse(error_id=error_id, message="Validation error", error_type="ValidationError", details=details), status_code
253
+
254
+ # Handle all other exceptions
255
+ else:
256
+ error_id = str(uuid.uuid4())
257
+ status_code = HTTPStatus.INTERNAL_SERVER_ERROR
258
+ details = {}
259
+
260
+ if include_traceback:
261
+ details["traceback"] = tb_str
262
+ if cause_tb:
263
+ details["original_traceback"] = cause_tb
264
+
265
+ return ErrorResponse(
266
+ error_id=error_id, message=str(exc) or "An unexpected error occurred", error_type=type(exc).__name__, details=details if details else None
267
+ ), status_code
268
+
269
+
270
+ #############################################
271
+ # Utility Decorators
272
+ #############################################
273
+
274
+
275
+ def handle_exceptions(return_value=None):
276
+ """Decorator to handle exceptions in both async and sync functions.
277
+
278
+ This decorator can be used with any function to catch exceptions,
279
+ log them, and return a default value instead of raising.
280
+
281
+ Args:
282
+ return_value: The value to return if an exception occurs
283
+
284
+ Returns:
285
+ Decorated function with exception handling
286
+ """
287
+
288
+ def decorator(func):
289
+ # Import asyncio here to avoid dependency if not needed
290
+ try:
291
+ import asyncio
292
+
293
+ is_async_available = True
294
+ except ImportError:
295
+ is_async_available = False
296
+
297
+ # Handle async functions
298
+ if is_async_available and asyncio.iscoroutinefunction(func):
299
+
300
+ @functools.wraps(func)
301
+ async def async_wrapper(*args, **kwargs):
302
+ try:
303
+ return await func(*args, **kwargs)
304
+ except Exception as exc:
305
+ # Format and log the error but don't raise
306
+ format_error(exc, include_traceback=True)
307
+ return return_value
308
+
309
+ return async_wrapper
310
+
311
+ # Handle synchronous functions
312
+ else:
313
+
314
+ @functools.wraps(func)
315
+ def sync_wrapper(*args, **kwargs):
316
+ try:
317
+ return func(*args, **kwargs)
318
+ except Exception as exc:
319
+ # Format and log the error but don't raise
320
+ format_error(exc, include_traceback=True)
321
+ return return_value
322
+
323
+ return sync_wrapper
324
+
325
+ return decorator
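A brief sketch of how the pieces above fit together; the failing function and its inputs are illustrative:

```python
from starfish.common.exceptions import ValidationError, format_error, handle_exceptions


@handle_exceptions(return_value=[])
def parse_records(raw):
    # Any exception raised here is logged via format_error and [] is returned instead.
    if not isinstance(raw, list):
        raise ValidationError("Expected a list of records", details={"got": type(raw).__name__})
    return raw


print(parse_records("not-a-list"))  # -> []

# format_error can also be called directly to build an API-style payload.
try:
    raise ValidationError("Bad input")
except Exception as exc:
    response, status_code = format_error(exc)
    print(status_code, response.message, response.error_type)
```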
src/starfish/common/logger.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from enum import IntEnum
4
+
5
+ from loguru import logger
6
+
7
+ simple_log_format_enabled = os.getenv("SIMPLE_LOG_FORMAT", "true").lower() in ("true", "1", "yes")
8
+
9
+ default_log_level = os.getenv("LOG_LEVEL", "INFO")
10
+
11
+
12
+ # Define custom log levels
13
+ class LogLevel(IntEnum):
14
+ """Custom log levels."""
15
+
16
+ VERBOSE = 5
17
+ DEBUG = 10
18
+ INFO = 20
19
+ WARNING = 30
20
+ ERROR = 40
21
+ CRITICAL = 50
22
+
23
+
24
+ # Configuration Constants
25
+ COLORED_FORMAT = (
26
+ "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
27
+ "<level>{level: <8}</level> | "
28
+ "<cyan>{name}</cyan> | "
29
+ "<blue>{file}:{line}</blue> | "
30
+ "<level>{message}</level>"
31
+ )
32
+
33
+ SIMPLE_COLORED_FORMAT = "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | " "<level>{level: <8}</level> | " "<level>{message}</level>"
34
+
35
+
36
+ class LogManager:
37
+ """Manages logger configuration."""
38
+
39
+ _instance = None
40
+
41
+ def __new__(cls):
42
+ """Create a singleton instance."""
43
+ if cls._instance is None:
44
+ cls._instance = super(LogManager, cls).__new__(cls)
45
+ cls._instance.handler_id = None
46
+ cls._instance.current_level = default_log_level
47
+ cls._instance._initialize()
48
+ return cls._instance
49
+
50
+ def _get_format_string(self):
51
+ """Return the appropriate format string based on LOG_FORMAT_MODE."""
52
+ if simple_log_format_enabled:
53
+ if self.current_level == "DEBUG":
54
+ return COLORED_FORMAT
55
+ return SIMPLE_COLORED_FORMAT
56
+ return COLORED_FORMAT
57
+
58
+ def _initialize(self):
59
+ """Initialize logging with console handler."""
60
+ logger.remove() # Remove default handler
61
+ log_format = self._get_format_string()
62
+ self.handler_id = logger.add(sys.stdout, format=log_format, level=self.current_level, colorize=True)
63
+ # Add custom level only if it doesn't exist
64
+ try:
65
+ logger.level("VERBOSE", no=LogLevel.VERBOSE, color="<magenta>")
66
+ except ValueError:
67
+ # Level already exists, ignore the error
68
+ pass
69
+
70
+ def get_current_log_level(self):
71
+ """Get the current log level."""
72
+ return self.current_level
73
+
74
+ def update_log_level(self, level):
75
+ """Update the log level of the console handler.
76
+
77
+ This can be called at any time during runtime to change the log level.
78
+ """
79
+ level = level.upper()
80
+ if level not in ["VERBOSE", "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
81
+ raise ValueError(f"Invalid log level: {level}")
82
+ logger.remove(self.handler_id)
83
+ self.current_level = level
84
+ log_format = self._get_format_string()
85
+ self.handler_id = logger.add(sys.stdout, format=log_format, level=self.current_level, colorize=True)
86
+
87
+
88
+ # Instantiate LogManager to ensure logging is initialized on module import
89
+ log_manager = LogManager()
90
+
91
+
92
+ # Add verbose method to logger
93
+ def verbose(self, message, *args, **kwargs):
94
+ """Log a verbose message."""
95
+ self.log("VERBOSE", message, *args, **kwargs)
96
+
97
+
98
+ logger.__class__.verbose = verbose
99
+
100
+
101
+ # Function to get the logger
102
+ def get_logger(name):
103
+ """Get a logger instance bound with a name."""
104
+ return logger.bind(name=name)
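A minimal sketch of the logging API defined above:

```python
from starfish.common.logger import get_logger, log_manager

logger = get_logger(__name__)
logger.info("Starting up")
logger.verbose("Very detailed message")  # custom level; filtered out at the default INFO level

# Switch to DEBUG at runtime; with SIMPLE_LOG_FORMAT enabled this also
# switches to the more detailed format that includes file and line.
log_manager.update_log_level("DEBUG")
logger.debug("Now visible")
```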
src/starfish/components/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .prepare_topic import prepare_topic
2
+
3
+ __all__ = ["prepare_topic"]
src/starfish/components/prepare_topic.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import math
3
+ from typing import Any, Dict, List, Optional, Union
4
+
5
+ from starfish import StructuredLLM
6
+
7
+
8
+ async def generate_topics(
9
+ user_instruction: str,
10
+ num_topics: int,
11
+ model_name: str = "openai/gpt-4o-mini",
12
+ model_kwargs: Optional[Dict[str, Any]] = None,
13
+ existing_topics: Optional[List[str]] = None,
14
+ ) -> List[str]:
15
+ """Generate unique topics based on user instructions using a StructuredLLM model."""
16
+ if model_kwargs is None:
17
+ model_kwargs = {}
18
+ if "temperature" not in model_kwargs:
19
+ model_kwargs["temperature"] = 1
20
+ existing_topics = existing_topics or []
21
+
22
+ if num_topics <= 0:
23
+ return []
24
+
25
+ # Calculate batches needed (5 topics per batch)
26
+ llm_batch_size = 5
27
+ num_batches = math.ceil(num_topics / llm_batch_size)
28
+ generated_topics = []
29
+
30
+ for _ in range(num_batches):
31
+ topic_generator = StructuredLLM(
32
+ model_name=model_name,
33
+ prompt="""Can you generate a list of topics about {{user_instruction}}
34
+ {% if existing_topics_str %}
35
+ Please do not generate topics that are already in the list: {{existing_topics_str}}
36
+ Make sure the topics are unique and vary from each other
37
+ {% endif %}
38
+ """,
39
+ output_schema=[{"name": "topic", "type": "str"}],
40
+ model_kwargs=model_kwargs,
41
+ )
42
+
43
+ all_existing = existing_topics + generated_topics
44
+ input_params = {"user_instruction": user_instruction, "num_records": min(llm_batch_size, num_topics - len(generated_topics))}
45
+
46
+ if all_existing:
47
+ input_params["existing_topics_str"] = ",".join(all_existing)
48
+
49
+ topic_response = await topic_generator.run(**input_params)
50
+ topic_data = [item.get("topic") for item in topic_response.data]
51
+ generated_topics.extend(topic_data)
52
+
53
+ if len(generated_topics) >= num_topics:
54
+ break
55
+
56
+ return generated_topics
57
+
58
+
59
+ async def prepare_topic(
60
+ topics: Optional[List[Union[str, Dict[str, int]]]] = None,
61
+ num_records: Optional[int] = None,
62
+ records_per_topic: int = 20,
63
+ user_instruction: Optional[str] = None,
64
+ model_name: str = "openai/gpt-4o-mini",
65
+ model_kwargs: Optional[Dict[str, Any]] = None,
66
+ ) -> List[Dict[str, str]]:
67
+ """Split records into topics, generating topics if none are provided or if needed.
68
+
69
+ Supported input formats:
70
+ 1. String list: ['topic1', 'topic2'] - Topics with equal or calculated distribution
71
+ 2. Dict list: [{'topic1': 20}, {'topic2': 30}] - Topics with specific counts
72
+ 3. Mixed: ['topic1', {'topic2': 30}] - Combination of both formats
73
+ 4. None: No topics provided, will generate based on user_instruction
74
+
75
+ Args:
76
+ topics: Optional list of topics, either strings or {topic: count} dicts
77
+ num_records: Total number of records to split (required for dict topics or None topics)
78
+ records_per_topic: Number of records per topic (default: 20)
79
+ user_instruction: Topic generation instructions (required if topics is None)
80
+ model_name: Model name for topic generation
81
+ model_kwargs: Model kwargs for topic generation
82
+
83
+ Returns:
84
+ List of {'topic': topic_name} dictionaries, with one entry per record
85
+ """
86
+ if model_kwargs is None:
87
+ model_kwargs = {}
88
+ if "temperature" not in model_kwargs:
89
+ model_kwargs["temperature"] = 1
90
+ # --- STEP 1: Input validation and normalization ---
91
+ if topics is None:
92
+ # Must have num_records and user_instruction if no topics provided
93
+ if not num_records or num_records <= 0:
94
+ raise ValueError("num_records must be positive when topics are not provided")
95
+ if not user_instruction:
96
+ raise ValueError("user_instruction required when topics are not provided")
97
+ topic_assignments = []
98
+ else:
99
+ # Validate topics is a non-empty list
100
+ if not isinstance(topics, list) or not topics:
101
+ raise ValueError("topics must be a non-empty list")
102
+
103
+ # Convert all topic inputs to a standardized [(topic_name, count)] list
104
+ # For string topics: count will be None (to be calculated later)
105
+ # For dict topics: use the specified count
106
+ topic_assignments = []
107
+ seen_topics = set()
108
+
109
+ for topic in topics:
110
+ if isinstance(topic, str):
111
+ if topic not in seen_topics:
112
+ topic_assignments.append((topic, None))
113
+ seen_topics.add(topic)
114
+ elif isinstance(topic, dict) and len(topic) == 1:
115
+ topic_name = next(iter(topic))
116
+ count = topic[topic_name]
117
+
118
+ if not isinstance(count, int) or count < 0:
119
+ raise ValueError(f"Topic '{topic_name}' has invalid count {count}")
120
+
121
+ if topic_name not in seen_topics:
122
+ topic_assignments.append((topic_name, count))
123
+ seen_topics.add(topic_name)
124
+ else:
125
+ raise ValueError("Topics must be strings or single-key dictionaries")
126
+
127
+ # --- STEP 2: Calculate or validate counts for provided topics ---
128
+ result = []
129
+ assigned_count = 0
130
+ topic_names = [] # Track all assigned topic names
131
+
132
+ if topic_assignments:
133
+ # Handle string topics with no count (None) - assign counts based on input
134
+ string_topics = [(name, count) for name, count in topic_assignments if count is None]
135
+ dict_topics = [(name, count) for name, count in topic_assignments if count is not None]
136
+
137
+ # Case: String topics with no num_records - assign records_per_topic to each
138
+ if string_topics and num_records is None:
139
+ for name, _ in string_topics:
140
+ result.append({name: records_per_topic})
141
+ topic_names.append(name)
142
+ assigned_count += records_per_topic
143
+
144
+ # Case: String topics with num_records - distribute evenly
145
+ elif string_topics and num_records is not None:
146
+ remaining = num_records - sum(count for _, count in dict_topics if count is not None)
147
+ if remaining < 0:
148
+ raise ValueError("Dict topic counts exceed num_records")
149
+
150
+ # Distribute remaining records among string topics
151
+ if string_topics and remaining > 0:
152
+ base = remaining // len(string_topics)
153
+ extra = remaining % len(string_topics)
154
+
155
+ for i, (name, _) in enumerate(string_topics):
156
+ count = base + (1 if i < extra else 0)
157
+ if count > 0:
158
+ result.append({name: count})
159
+ topic_names.append(name)
160
+ assigned_count += count
161
+
162
+ # Add dictionary topics with predefined counts
163
+ for name, count in dict_topics:
164
+ if count > 0:
165
+ result.append({name: count})
166
+ topic_names.append(name)
167
+ assigned_count += count
168
+
169
+ # Validate total count for dictionary topics
170
+ if dict_topics and num_records is None:
171
+ raise ValueError("num_records required when using dictionary topics")
172
+
173
+ if num_records is not None and assigned_count > num_records:
174
+ raise ValueError(f"Total assigned count ({assigned_count}) exceeds num_records ({num_records})")
175
+
176
+ # --- STEP 3: Generate topics for remaining records if needed ---
177
+ remaining_records = 0 if num_records is None else num_records - assigned_count
178
+
179
+ if remaining_records > 0:
180
+ if records_per_topic <= 0:
181
+ raise ValueError("records_per_topic must be positive when generating topics")
182
+
183
+ # Generate topics with LLM if instructions provided
184
+ if user_instruction:
185
+ topics_needed = math.ceil(remaining_records / records_per_topic)
186
+
187
+ generated = await generate_topics(
188
+ user_instruction=user_instruction, num_topics=topics_needed, model_name=model_name, model_kwargs=model_kwargs, existing_topics=topic_names
189
+ )
190
+
191
+ # Assign counts to generated topics
192
+ for topic in generated:
193
+ if topic in topic_names: # Skip if duplicate (shouldn't happen with proper LLM)
194
+ print(f"Skipping duplicate generated topic: {topic}")
195
+ continue
196
+
197
+ count = min(records_per_topic, remaining_records)
198
+ if count <= 0:
199
+ break
200
+
201
+ result.append({topic: count})
202
+ topic_names.append(topic)
203
+ remaining_records -= count
204
+ assigned_count += count
205
+
206
+ # Generate auto-topics for any still-remaining records
207
+ auto_index = 1
208
+ while remaining_records > 0:
209
+ # Find next available auto_topic name
210
+ auto_name = f"auto_topic{auto_index}"
211
+ while auto_name in topic_names:
212
+ auto_index += 1
213
+ auto_name = f"auto_topic{auto_index}"
214
+
215
+ count = min(records_per_topic, remaining_records)
216
+ result.append({auto_name: count})
217
+ topic_names.append(auto_name)
218
+ remaining_records -= count
219
+ assigned_count += count
220
+ auto_index += 1
221
+
222
+ # Final validation
223
+ if num_records is not None and assigned_count != num_records:
224
+ print(f"Warning: Assigned {assigned_count} records, expected {num_records}")
225
+
226
+ flatten_topic_list = []
227
+ for item in result:
228
+ for key, count in item.items():
229
+ flatten_topic_list.extend([{"topic": key}] * count)
230
+
231
+ return flatten_topic_list
232
+
233
+
234
+ if __name__ == "__main__":
235
+ print("--- Running Examples ---")
236
+
237
+ # Example 1: Dictionary topics with additional generation
238
+ print("\nExample 1: Dictionary topics + generation")
239
+ topics1 = [{"topic1": 20}, {"topic2": 30}]
240
+ result1 = asyncio.run(prepare_topic(topics=topics1, num_records=100, records_per_topic=25, user_instruction="some context"))
241
+ print(f"Result: {result1}")
242
+ print(f"Total: {len(result1)}")
243
+
244
+ # Example 2: String topics with even distribution
245
+ print("\nExample 2: String topics with distribution")
246
+ topics2 = ["topicA", "topicB", "topicC"]
247
+ result2 = asyncio.run(prepare_topic(topics=topics2, num_records=10))
248
+ print(f"Result: {result2}")
249
+ print(f"Total: {len(result2)}")
250
+
251
+ # Example 3: Mixed string and dict topics
252
+ print("\nExample 3: Mixed string/dict topics")
253
+ topics3 = ["topicX", {"topicY": 10}]
254
+ result3 = asyncio.run(prepare_topic(topics=topics3, num_records=30, user_instruction="mixed topics"))
255
+ print(f"Result: {result3}")
256
+ print(f"Total: {len(result3)}")
257
+
258
+ # Example 4: String topics with fixed count
259
+ print("\nExample 4: String topics with fixed count")
260
+ topics4 = ["apple", "banana", "cherry"]
261
+ result4 = asyncio.run(prepare_topic(topics=topics4, records_per_topic=15))
262
+ print(f"Result: {result4}")
263
+ print(f"Total: {len(result4)}")
264
+
265
+ # Example 5: No topics, generate all
266
+ print("\nExample 5: No topics, generate all")
267
+
268
+ async def run_example5():
269
+ result = await prepare_topic(topics=None, num_records=10, records_per_topic=5, user_instruction="cloud computing")
270
+ print(f"Result: {result}")
271
+ print(f"Total: {len(result)}")
272
+
273
+ asyncio.run(run_example5())
274
+
275
+ print("\n--- Examples Finished ---")
src/starfish/data_factory/config.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ PROGRESS_LOG_INTERVAL = 3
2
+ TASK_RUNNER_TIMEOUT = 60
3
+
4
+ MAX_CONCURRENT_TASKS = 10
5
+
6
+ NOT_COMPLETED_THRESHOLD = 3
src/starfish/data_factory/constants.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import os
+ import sys
+ from pathlib import Path
+
+ RECORD_STATUS = "status"
+
+ STATUS_TOTAL = "total"
+ STATUS_COMPLETED = "completed"
+ STATUS_DUPLICATE = "duplicate"
+ STATUS_FILTERED = "filtered"
+ STATUS_FAILED = "failed"
+
+ STATUS_MOJO_MAP = {
+     STATUS_COMPLETED: "✅",
+     STATUS_DUPLICATE: "🔁",
+     STATUS_FILTERED: "🚫",
+     STATUS_FAILED: "❌",
+     STATUS_TOTAL: "📊",
+ }
+ RUN_MODE = "run_mode"
+ RUN_MODE_NORMAL = "normal"
+ RUN_MODE_RE_RUN = "resume_from_checkpoint"
+ RUN_MODE_DRY_RUN = "dry_run"
+
+ STORAGE_TYPE_LOCAL = "local"
+ STORAGE_TYPE_IN_MEMORY = "in_memory"
+
+ IDX = "idx_index"
+
+
+ # Define the function directly in constants to avoid circular imports
+ def get_app_data_dir():
+     r"""Returns a platform-specific directory for application data storage.
+
+     Following platform conventions:
+     - Linux: ~/.local/share/starfish
+     - macOS: ~/Library/Application Support/starfish
+     - Windows: %LOCALAPPDATA%\starfish
+
+     Environment variable STARFISH_LOCAL_STORAGE_DIR can override this location.
+     """
+     # Allow override through environment variable
+     env_dir = os.environ.get("STARFISH_LOCAL_STORAGE_DIR")
+     if env_dir:
+         return env_dir
+
+     app_name = "starfish"
+
+     # Get user's home directory
+     home = Path.home()
+
+     # Platform-specific paths
+     if sys.platform == "win32":
+         # Windows: use %LOCALAPPDATA% if available, otherwise construct it from home
+         app_data = os.environ.get("LOCALAPPDATA")
+         if not app_data:
+             app_data = os.path.join(home, "AppData", "Local")
+         base_dir = os.path.join(app_data, app_name)
+     elif sys.platform == "darwin":
+         # macOS
+         base_dir = os.path.join(home, "Library", "Application Support", app_name)
+     else:
+         # Linux/Unix: follow the XDG Base Directory Specification
+         xdg_data_home = os.environ.get("XDG_DATA_HOME")
+         if not xdg_data_home:
+             xdg_data_home = os.path.join(home, ".local", "share")
+         base_dir = os.path.join(xdg_data_home, app_name)
+
+     return base_dir
+
+
+ # Get application database directory
+ APP_DATA_DIR = get_app_data_dir()
+ LOCAL_STORAGE_PATH = os.path.join(APP_DATA_DIR, "db")
+ LOCAL_STORAGE_URI = f"file://{LOCAL_STORAGE_PATH}"
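A minimal sketch of how the override described in the docstring behaves (not part of the commit; the /tmp/starfish-data path is made up for illustration):

# Assumes the starfish package is importable; only get_app_data_dir and the
# STARFISH_LOCAL_STORAGE_DIR variable come from the module above.
import os

from starfish.data_factory.constants import get_app_data_dir

print(get_app_data_dir())  # platform default, e.g. ~/.local/share/starfish on Linux

os.environ["STARFISH_LOCAL_STORAGE_DIR"] = "/tmp/starfish-data"  # hypothetical override
print(get_app_data_dir())  # -> /tmp/starfish-data

Note that APP_DATA_DIR, LOCAL_STORAGE_PATH, and LOCAL_STORAGE_URI are computed once at import time, so the environment variable has to be set before the module is first imported for those module-level values to reflect the override.
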
src/starfish/data_factory/event_loop.py ADDED
@@ -0,0 +1,35 @@
+ import asyncio
+
+ import nest_asyncio
+
+ from starfish.common.logger import get_logger
+
+ logger = get_logger(__name__)
+
+
+ def run_in_event_loop(coroutine):
+     """Run a coroutine in the event loop, handling both nested and new loop cases.
+
+     Args:
+         coroutine: The coroutine to be executed
+
+     Returns:
+         The result of the coroutine execution
+
+     Note:
+         If an event loop is already running, nest_asyncio will be used to allow
+         nested execution. If no loop is running, a new event loop will be created.
+     """
+     try:
+         # This call raises a RuntimeError if there is no event loop running.
+         asyncio.get_running_loop()
+
+         # An event loop is already running (the call above did not raise), so use nest_asyncio to patch it and allow nested execution.
+         nest_asyncio.apply()
+         logger.debug(f"Running nested coroutine: {coroutine.__name__}")
+     except RuntimeError as e:
+         # No event loop is running, so asyncio.get_running_loop() raised.
+         # Fall through so that asyncio.run below creates a new event loop.
+         logger.debug(str(e))
+     logger.debug(f"Running coroutine: {coroutine.__name__}")
+     return asyncio.run(coroutine)
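A minimal usage sketch for the helper above (not part of the commit); the sample coroutine is invented for illustration:

# From plain synchronous code there is no running loop, so the RuntimeError branch
# is taken and asyncio.run creates a fresh event loop. Inside a notebook, where a
# loop is already running, the nest_asyncio branch is taken instead.
import asyncio

from starfish.data_factory.event_loop import run_in_event_loop


async def sample():  # hypothetical coroutine for illustration only
    await asyncio.sleep(0.1)
    return 42


print(run_in_event_loop(sample()))  # -> 42
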
src/starfish/data_factory/factory.py ADDED
@@ -0,0 +1,112 @@
+ from typing import Any, Callable, Dict, List, Optional, cast
+ from starfish.common.logger import get_logger
+ from starfish.data_factory.config import NOT_COMPLETED_THRESHOLD, TASK_RUNNER_TIMEOUT
+ from starfish.data_factory.constants import STORAGE_TYPE_LOCAL
+ from starfish.data_factory.factory_ import Factory
+ from starfish.data_factory.factory_wrapper import FactoryWrapper, DataFactoryProtocol, P, T
+ from starfish.data_factory.factory_executor_manager import FactoryExecutorManager
+ from starfish.data_factory.utils.data_class import FactoryMasterConfig
+ from starfish.data_factory.utils.state import MutableSharedState
+
+ logger = get_logger(__name__)
+
+
+ def data_factory(
+     storage: str = STORAGE_TYPE_LOCAL,
+     batch_size: int = 1,
+     target_count: int = 0,
+     dead_queue_threshold: int = 3,
+     max_concurrency: int = 10,
+     initial_state_values: Optional[Dict[str, Any]] = None,
+     on_record_complete: Optional[List[Callable]] = None,
+     on_record_error: Optional[List[Callable]] = None,
+     show_progress: bool = True,
+     task_runner_timeout: int = TASK_RUNNER_TIMEOUT,
+     job_run_stop_threshold: int = NOT_COMPLETED_THRESHOLD,
+ ) -> Callable[[Callable[P, T]], DataFactoryProtocol[P, T]]:
+     """Decorator for creating data processing pipelines.
+
+     Args:
+         storage: Storage backend to use ('local' or 'in_memory')
+         batch_size: Number of records to process in each batch
+         target_count: Target number of records to generate (0 means process all input)
+         max_concurrency: Maximum number of concurrent tasks
+         initial_state_values: Initial values for shared state
+         on_record_complete: Callbacks to execute after successful record processing
+         on_record_error: Callbacks to execute after failed record processing
+         show_progress: Whether to display a progress bar
+         task_runner_timeout: Timeout in seconds for task execution
+         job_run_stop_threshold: Threshold for stopping the job if too many records fail
+
+     Returns:
+         Decorated function with additional execution methods
+     """
+     # Initialize default values
+     on_record_error = on_record_error or []
+     on_record_complete = on_record_complete or []
+     initial_state_values = initial_state_values or {}
+
+     # Create configuration
+     config = FactoryMasterConfig(
+         storage=storage,
+         batch_size=batch_size,
+         target_count=target_count,
+         dead_queue_threshold=dead_queue_threshold,
+         max_concurrency=max_concurrency,
+         show_progress=show_progress,
+         task_runner_timeout=task_runner_timeout,
+         on_record_complete=on_record_complete,
+         on_record_error=on_record_error,
+         job_run_stop_threshold=job_run_stop_threshold,
+     )
+
+     # Initialize factory instance
+     _factory = None
+
+     def decorator(func: Callable[P, T]) -> DataFactoryProtocol[P, T]:
+         """Actual decorator that wraps the function."""
+         nonlocal _factory
+         _factory = _initialize_or_update_factory(_factory, config, func, initial_state_values)
+         wrapper = FactoryWrapper(_factory, func)
+         return cast(DataFactoryProtocol[P, T], wrapper)
+
+     # Add resume capability as a static method
+     data_factory.resume_from_checkpoint = resume_from_checkpoint
+
+     return decorator
+
+
+ def _initialize_or_update_factory(
+     factory: Optional[Factory], config: FactoryMasterConfig, func: Callable[P, T], initial_state_values: Dict[str, Any]
+ ) -> Factory:
+     """Initialize or update a Factory instance."""
+     if factory is None:
+         factory = Factory(config, func)
+         factory.state = MutableSharedState(initial_data=initial_state_values)
+     else:
+         factory.config = config
+         factory.func = func
+         factory.state = MutableSharedState(initial_data=initial_state_values)
+     return factory
+
+
+ def resume_from_checkpoint(*args, **kwargs) -> List[Dict[str, Any]]:
+     """Resume a data factory job from its last checkpoint.
+
+     Args:
+         master_job_id: ID of the master job to resume
+         storage: Storage backend to use ('local' or 'in_memory')
+         batch_size: Number of records to process in each batch
+         target_count: Target number of records to generate (0 means process all input)
+         max_concurrency: Maximum number of concurrent tasks
+         initial_state_values: Initial values for shared state
+         on_record_complete: Callbacks to execute after successful record processing
+         on_record_error: Callbacks to execute after failed record processing
+         show_progress: Whether to display a progress bar
+         task_runner_timeout: Timeout in seconds for task execution
+         job_run_stop_threshold: Threshold for stopping the job if too many records fail
+
+     Returns:
+         List[Dict[str, Any]]: the records produced by the resumed job
+     """
+     return FactoryExecutorManager.resume(*args, **kwargs)
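To close out the file, a hedged sketch of how the decorator is meant to be applied; the worker function and the .run(...) call are assumptions made for illustration, since the actual execution methods live on FactoryWrapper, which is defined in a separate module of this commit:

# Illustrative only (not part of the commit). Assumes data_factory is re-exported
# from the top-level package and that the wrapper returned by the decorator
# exposes a run(...) entry point.
from starfish import data_factory


@data_factory(storage="local", max_concurrency=5, task_runner_timeout=30)
async def generate_record(topic: str):
    # User-defined async worker; returns a list of generated records.
    return [{"topic": topic, "text": f"a few sentences about {topic}"}]


results = generate_record.run(topic=["python", "rust"])  # hypothetical call pattern
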