Spaces: Runtime error
Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .env.example +28 -0
- .gitattributes +7 -0
- .github/workflows/cd.yaml +43 -0
- .github/workflows/ci.yaml +69 -0
- .gitignore +177 -0
- .gradio/certificate.pem +31 -0
- .pre-commit-config.yaml +10 -0
- .python-version +1 -0
- .vscode/settings.json +15 -0
- Chat with your Video Library – Engineering AI Agents.pdf +3 -0
- Dockerfile +47 -0
- LICENSE +21 -0
- README.md +661 -7
- clips/clip_eFgkZKhNUdM_1270_0.847.mp4 +3 -0
- clips/clip_eFgkZKhNUdM_642_0.847.mp4 +3 -0
- clips/clip_eFgkZKhNUdM_874_0.838.mp4 +3 -0
- code_snippets/03_custom_odm_example.py +10 -0
- code_snippets/03_orm.py +37 -0
- code_snippets/08_instructor_embeddings.py +18 -0
- code_snippets/08_text_embeddings.py +28 -0
- code_snippets/08_text_image_embeddings.py +37 -0
- configs/digital_data_etl_maxime_labonne.yaml +38 -0
- configs/digital_data_etl_paul_iusztin.yaml +62 -0
- configs/end_to_end_data.yaml +87 -0
- configs/evaluating.yaml +9 -0
- configs/export_artifact_to_json.yaml +13 -0
- configs/feature_engineering.yaml +11 -0
- configs/generate_instruct_datasets.yaml +13 -0
- configs/generate_preference_datasets.yaml +13 -0
- configs/training.yaml +14 -0
- data/artifacts/cleaned_documents.json +0 -0
- data/artifacts/instruct_datasets.json +0 -0
- data/artifacts/preference_datasets.json +0 -0
- data/artifacts/raw_documents.json +0 -0
- data/data_warehouse_raw_data/ArticleDocument.json +0 -0
- data/data_warehouse_raw_data/PostDocument.json +1 -0
- data/data_warehouse_raw_data/RepositoryDocument.json +1 -0
- data/data_warehouse_raw_data/UserDocument.json +1 -0
- demonstration.ipynb +1027 -0
- docker-compose.yml +40 -0
- en_core_web_sm-3.7.0-py3-none-any.whl +3 -0
- images/cover_plus.png +3 -0
- images/crazy_cat.jpg +3 -0
- llm_engineering/__init__.py +4 -0
- llm_engineering/application/__init__.py +3 -0
- llm_engineering/application/crawlers/__init__.py +6 -0
- llm_engineering/application/crawlers/base.py +66 -0
- llm_engineering/application/crawlers/custom_article.py +54 -0
- llm_engineering/application/crawlers/dispatcher.py +51 -0
- llm_engineering/application/crawlers/github.py +68 -0
.env.example
ADDED
@@ -0,0 +1,28 @@
# --- Required settings even when working locally. ---

# OpenAI API Config
OPENAI_MODEL_ID=gpt-4o-mini
OPENAI_API_KEY=str

# Huggingface API Config
HUGGINGFACE_ACCESS_TOKEN=str

# Comet ML (during training and inference)
COMET_API_KEY=str

# --- Required settings when deploying the code. ---
# --- Otherwise, default values work fine. ---

# MongoDB database
DATABASE_HOST="mongodb://llm_engineering:llm_engineering@127.0.0.1:27017"

# Qdrant vector database
USE_QDRANT_CLOUD=false
QDRANT_CLOUD_URL=str
QDRANT_APIKEY=str

# AWS Authentication
AWS_ARN_ROLE=str
AWS_REGION=eu-central-1
AWS_ACCESS_KEY=str
AWS_SECRET_KEY=str
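A hedged sketch of how these variables can be read from Python once the file is copied to `.env` (it assumes the `python-dotenv` package; the project itself loads them through its `Settings` class in `llm_engineering/settings.py`):

```python
# Minimal sketch: load the .env file and sanity-check a required variable.
# Assumes the `python-dotenv` package; the project itself reads these values
# through the Settings class in llm_engineering/settings.py.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

if os.environ.get("OPENAI_API_KEY") in (None, "", "str"):
    raise SystemExit("OPENAI_API_KEY is not set - fill it in your .env file.")
print("OpenAI model:", os.environ.get("OPENAI_MODEL_ID", "gpt-4o-mini"))
```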
.gitattributes
CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Chat[[:space:]]with[[:space:]]your[[:space:]]Video[[:space:]]Library[[:space:]]–[[:space:]]Engineering[[:space:]]AI[[:space:]]Agents.pdf filter=lfs diff=lfs merge=lfs -text
+clips/clip_eFgkZKhNUdM_1270_0.847.mp4 filter=lfs diff=lfs merge=lfs -text
+clips/clip_eFgkZKhNUdM_642_0.847.mp4 filter=lfs diff=lfs merge=lfs -text
+clips/clip_eFgkZKhNUdM_874_0.838.mp4 filter=lfs diff=lfs merge=lfs -text
+en_core_web_sm-3.7.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
+images/cover_plus.png filter=lfs diff=lfs merge=lfs -text
+images/crazy_cat.jpg filter=lfs diff=lfs merge=lfs -text
.github/workflows/cd.yaml
ADDED
@@ -0,0 +1,43 @@
name: CD

on:
  push:
    branches:
      - main

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build:
    name: Build & Push Docker Image
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1

      - name: Build images & push to ECR
        id: build-image
        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
          tags: |
            ${{ steps.login-ecr.outputs.registry }}/${{ secrets.AWS_ECR_NAME }}:${{ github.sha }}
            ${{ steps.login-ecr.outputs.registry }}/${{ secrets.AWS_ECR_NAME }}:latest
          push: true
.github/workflows/ci.yaml
ADDED
@@ -0,0 +1,69 @@
name: CI

on:
  pull_request:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  qa:
    name: QA
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v3
        with:
          python-version: "3.11"

      - name: Install poetry
        uses: abatilo/actions-poetry@v2
        with:
          poetry-version: 1.8.3

      - name: Install packages
        run: |
          poetry install --only dev
          poetry self add 'poethepoet[poetry_plugin]'

      - name: gitleaks check
        run: poetry poe gitleaks-check

      - name: Lint check [Python]
        run: poetry poe lint-check

      - name: Format check [Python]
        run: poetry poe format-check

  test:
    name: Test
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v3
        with:
          python-version: "3.11"

      - name: Install poetry
        uses: abatilo/actions-poetry@v2
        with:
          poetry-version: 1.8.3

      - name: Install packages
        run: |
          poetry install
          poetry self add 'poethepoet[poetry_plugin]'

      - name: Run tests
        run: |
          echo "Running tests..."
          poetry poe test
.gitignore
ADDED
@@ -0,0 +1,177 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# IDEs
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# MacOs
.DS_Store

# VS Code
.vscode/**/launch.json

# Data
output/
sagemaker_*.json
run_ids.txt

# Virtual environments
*_venv
*_myenv
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,10 @@
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.3.5
    hooks:
      - id: ruff          # Run the linter.
      - id: ruff-format   # Run the formatter.
  - repo: https://github.com/gitleaks/gitleaks
    rev: v8.18.2
    hooks:
      - id: gitleaks
.python-version
ADDED
@@ -0,0 +1 @@
3.11.8
.vscode/settings.json
ADDED
@@ -0,0 +1,15 @@
{
  "[python]": {
    "editor.formatOnSave": true,
    "editor.codeActionsOnSave": {
      "source.fixAll": "explicit",
      "source.organizeImports": "explicit"
    },
    "editor.defaultFormatter": "charliermarsh.ruff"
  },
  "notebook.formatOnSave.enabled": true,
  "notebook.codeActionsOnSave": {
    "notebook.source.fixAll": "explicit",
    "notebook.source.organizeImports": "explicit"
  },
}
Chat with your Video Library – Engineering AI Agents.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c32250914f76da5d6294d4dd86e6540cff5278bee30cbc4e4d1571fe26403c46
size 2216057
Dockerfile
ADDED
@@ -0,0 +1,47 @@
FROM python:3.11-slim-bullseye AS release

ENV WORKSPACE_ROOT=/app/
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV POETRY_VERSION=1.8.3
ENV DEBIAN_FRONTEND=noninteractive
ENV POETRY_NO_INTERACTION=1

# Install Google Chrome.
RUN apt-get update -y && \
    apt-get install -y gnupg wget curl --no-install-recommends && \
    wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-linux-signing-key.gpg && \
    echo "deb [signed-by=/usr/share/keyrings/google-linux-signing-key.gpg] https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
    apt-get update -y && \
    apt-get install -y google-chrome-stable && \
    rm -rf /var/lib/apt/lists/*

# Install other system dependencies.
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends build-essential \
    gcc \
    python3-dev \
    libglib2.0-dev \
    libnss3-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Poetry using pip and clear cache.
RUN pip install --no-cache-dir "poetry==$POETRY_VERSION"
RUN poetry config installer.max-workers 20

WORKDIR $WORKSPACE_ROOT

# Copy the poetry lock file and pyproject.toml file to install dependencies.
COPY pyproject.toml poetry.lock $WORKSPACE_ROOT

# Install the dependencies and clear cache.
RUN poetry config virtualenvs.create false && \
    poetry install --no-root --no-interaction --no-cache --without dev && \
    poetry self add 'poethepoet[poetry_plugin]' && \
    rm -rf ~/.cache/pypoetry/cache/ && \
    rm -rf ~/.cache/pypoetry/artifacts/

# Copy the rest of the code.
COPY . $WORKSPACE_ROOT
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Packt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,12 +1,666 @@
 ---
-title: LLM
-
-colorFrom: indigo
-colorTo: blue
+title: LLM-Engineers-Handbook
+app_file: demonstration.ipynb
 sdk: gradio
 sdk_version: 5.29.0
-app_file: app.py
-pinned: false
 ---
<div align="center">
    <h1>👷 LLM Engineer's Handbook</h1>
    <p class="tagline">Official repository of the <a href="https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/">LLM Engineer's Handbook</a> by <a href="https://github.com/iusztinpaul">Paul Iusztin</a> and <a href="https://github.com/mlabonne">Maxime Labonne</a></p>
</div>
<br/>

<p align="center">
  <a href="https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/">
    <img src="images/cover_plus.png" alt="Book cover">
  </a>
</p>

<p align="center">
  Find the book on <a href="https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/">Amazon</a> or <a href="https://www.packtpub.com/en-us/product/llm-engineers-handbook-9781836200062">Packt</a>
</p>

## 🌟 Features

The goal of this book is to teach you how to build your own end-to-end LLM-based system using best practices:

- 📝 Data collection & generation
- 🔄 LLM training pipeline
- 📊 Simple RAG system
- 🚀 Production-ready AWS deployment
- 🔍 Comprehensive monitoring
- 🧪 Testing and evaluation framework

You can download and use the final trained model on [Hugging Face](https://huggingface.co/mlabonne/TwinLlama-3.1-8B-DPO).
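
As a quick, hedged illustration (not part of the book's own pipeline), the published checkpoint loads like any other Hugging Face model; the hardware assumptions below are ours, so verify them for your setup:

```python
# Minimal sketch: load the published TwinLlama checkpoint with transformers.
# Assumes the `transformers`, `torch`, and `accelerate` packages, and enough
# memory for an 8B-parameter model (a GPU is strongly recommended).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "mlabonne/TwinLlama-3.1-8B-DPO"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

inputs = tokenizer("Write a short post about RAG:", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```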

> [!IMPORTANT]
> The code in this GitHub repository is actively maintained and may contain updates not reflected in the book. **Always refer to this repository for the latest version of the code.**

## 🔗 Dependencies

### Local dependencies

To install and run the project locally, you need the following dependencies.

| Tool | Version | Purpose | Installation Link |
|------|---------|---------|-------------------|
| pyenv | ≥ 2.3.36 | Multiple Python versions (optional) | [Install Guide](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) |
| Python | 3.11 | Runtime environment | [Download](https://www.python.org/downloads/) |
| Poetry | ≥ 1.8.3 and < 2.0 | Package management | [Install Guide](https://python-poetry.org/docs/#installation) |
| Docker | ≥ 27.1.1 | Containerization | [Install Guide](https://docs.docker.com/engine/install/) |
| AWS CLI | ≥ 2.15.42 | Cloud management | [Install Guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) |
| Git | ≥ 2.44.0 | Version control | [Download](https://git-scm.com/downloads) |

### Cloud services

The code also uses and depends on the following cloud services. You don't have to do anything about them for now; the installation and deployment sections will guide you through using them:

| Service | Purpose |
|---------|---------|
| [HuggingFace](https://huggingface.com/) | Model registry |
| [Comet ML](https://www.comet.com/site/products/opik/?utm_source=llm_handbook&utm_medium=github&utm_campaign=opik) | Experiment tracker |
| [Opik](https://www.comet.com/site/products/opik/?utm_source=llm_handbook&utm_medium=github&utm_campaign=opik) | Prompt monitoring |
| [ZenML](https://www.zenml.io/) | Orchestrator and artifacts layer |
| [AWS](https://aws.amazon.com/) | Compute and storage |
| [MongoDB](https://www.mongodb.com/) | NoSQL database |
| [Qdrant](https://qdrant.tech/) | Vector database |
| [GitHub Actions](https://github.com/features/actions) | CI/CD pipeline |

In the [LLM Engineer's Handbook](https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/), Chapter 2 walks you through each tool, while Chapters 10 and 11 provide step-by-step guides on how to set up everything you need.

## 🗂️ Project Structure

Here is the directory overview:

```bash
.
├── code_snippets/       # Standalone example code
├── configs/             # Pipeline configuration files
├── llm_engineering/     # Core project package
│   ├── application/
│   ├── domain/
│   ├── infrastructure/
│   ├── model/
├── pipelines/           # ML pipeline definitions
├── steps/               # Pipeline components
├── tests/               # Test examples
├── tools/               # Utility scripts
│   ├── run.py
│   ├── ml_service.py
│   ├── rag.py
│   ├── data_warehouse.py
```

`llm_engineering/` is the main Python package implementing LLM and RAG functionality. It follows Domain-Driven Design (DDD) principles:

- `domain/`: Core business entities and structures
- `application/`: Business logic, crawlers, and RAG implementation
- `model/`: LLM training and inference
- `infrastructure/`: External service integrations (AWS, Qdrant, MongoDB, FastAPI)

The code logic and imports flow as follows: `infrastructure` → `model` → `application` → `domain`
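
To make the dependency rule concrete, here is a hedged sketch of the layering; the module and class names are hypothetical, not taken from the package:

```python
# Hypothetical sketch of the DDD layering: `domain` defines plain entities with
# no external dependencies, and an outer layer such as `application` imports
# from it. See llm_engineering/ for the real modules.
from dataclasses import dataclass


@dataclass
class Article:  # would live in llm_engineering/domain/
    author_id: str
    link: str
    content: str


def clean_article(article: Article) -> Article:  # would live in llm_engineering/application/
    """Business logic operates on domain entities, never the other way around."""
    return Article(
        author_id=article.author_id,
        link=article.link,
        content=" ".join(article.content.split()),
    )
```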

`pipelines/`: Contains the ZenML ML pipelines, which serve as the entry point for all the ML pipelines. They coordinate the data processing and model training stages of the ML lifecycle.

`steps/`: Contains individual ZenML steps, which are reusable components for building and customizing ZenML pipelines. Steps perform specific tasks (e.g., data loading, preprocessing) and can be combined within the ML pipelines.

`tests/`: Covers a few sample tests used as examples within the CI pipeline.

`tools/`: Utility scripts used to call the ZenML pipelines and inference code:
- `run.py`: Entry point script to run ZenML pipelines.
- `ml_service.py`: Starts the REST API inference server.
- `rag.py`: Demonstrates usage of the RAG retrieval module.
- `data_warehouse.py`: Used to export or import data from the MongoDB data warehouse through JSON files.

`configs/`: ZenML YAML configuration files to control the execution of pipelines and steps.

`code_snippets/`: Standalone code examples that can be executed independently of the main project.

## 💻 Installation

> [!NOTE]
> If you experience issues while installing and running the repository, check the [Issues](https://github.com/PacktPublishing/LLM-Engineers-Handbook/issues) GitHub section for people who solved similar problems, or ask us for help directly.

### 1. Clone the Repository

Start by cloning the repository and navigating to the project directory:

```bash
git clone https://github.com/PacktPublishing/LLM-Engineers-Handbook.git
cd LLM-Engineers-Handbook
```

Next, prepare your Python environment and its dependencies.

### 2. Set Up Python Environment

The project requires Python 3.11. You can either use your global Python installation or set up a project-specific version using pyenv.

#### Option A: Using Global Python (if version 3.11 is installed)

Verify your Python version:

```bash
python --version  # Should show Python 3.11.x
```

#### Option B: Using pyenv (recommended)

1. Verify pyenv installation:

```bash
pyenv --version   # Should show pyenv 2.3.36 or later
```

2. Install Python 3.11.8:

```bash
pyenv install 3.11.8
```

3. Verify the installation:

```bash
python --version  # Should show Python 3.11.8
```

4. Confirm the Python version in the project directory:

```bash
python --version
# Output: Python 3.11.8
```

> [!NOTE]
> The project includes a `.python-version` file that automatically sets the correct Python version when you're in the project directory.

### 3. Install Dependencies

The project uses Poetry for dependency management.

1. Verify Poetry installation:

```bash
poetry --version  # Should show Poetry version 1.8.3 or later
```

2. Set up the project environment and install dependencies:

```bash
poetry env use 3.11
poetry install --without aws
poetry run pre-commit install
```

This will:

- Configure Poetry to use Python 3.11
- Install project dependencies (excluding AWS-specific packages)
- Set up pre-commit hooks for code verification

### 4. Activate the Environment

We run all the scripts through [Poe the Poet](https://poethepoet.natn.io/index.html), our task runner.

1. Start a Poetry shell:

```bash
poetry shell
```

2. Run project commands using Poe the Poet:

```bash
poetry poe ...
```

<details>
<summary>🔧 Troubleshooting Poe the Poet Installation</summary>

### Alternative Command Execution

If you're experiencing issues with `poethepoet`, you can still run the project commands directly through Poetry. Here's how:

1. Look up the command definition in `pyproject.toml`
2. Use `poetry run` with the underlying command

#### Example:

Instead of:
```bash
poetry poe local-infrastructure-up
```
Use the direct command from pyproject.toml:
```bash
poetry run <actual-command-from-pyproject-toml>
```
Note: All project commands are defined in the [tool.poe.tasks] section of pyproject.toml.
</details>

Now, let's configure our local project with all the necessary credentials and tokens to run the code locally.

### 5. Local Development Setup

After you have installed all the dependencies, you must create and fill a `.env` file with your credentials to interact with other services and run the project. Setting your sensitive credentials in a `.env` file is a good security practice, as this file won't be committed to GitHub or shared with anyone else.

1. First, copy our example by running the following:

```bash
cp .env.example .env # The file must be at your repository's root!
```

2. Now, let's understand how to fill in all the essential variables within the `.env` file to get you started. The following are the mandatory settings we must complete when working locally:

#### OpenAI

To authenticate to OpenAI's API, you must fill out the `OPENAI_API_KEY` env var with an authentication token.

```env
OPENAI_API_KEY=your_api_key_here
```

→ Check out this [tutorial](https://platform.openai.com/docs/quickstart) to learn how to provide one from OpenAI.

#### Hugging Face

To authenticate to Hugging Face, you must fill out the `HUGGINGFACE_ACCESS_TOKEN` env var with an authentication token.

```env
HUGGINGFACE_ACCESS_TOKEN=your_token_here
```

→ Check out this [tutorial](https://huggingface.co/docs/hub/en/security-tokens) to learn how to provide one from Hugging Face.

#### Comet ML & Opik

To authenticate to Comet ML (required only during training) and Opik, you must fill out the `COMET_API_KEY` env var with your authentication token.

```env
COMET_API_KEY=your_api_key_here
```

→ Check out this [tutorial](https://www.comet.com/docs/opik/?utm_source=llm_handbook&utm_medium=github&utm_campaign=opik) to learn how to get started with Opik. You can also access Opik's dashboard using 🔗[this link](https://www.comet.com/opik?utm_source=llm_handbook&utm_medium=github&utm_content=opik).

### 6. Deployment Setup

When deploying the project to the cloud, we must set additional settings for Mongo, Qdrant, and AWS. If you are just working locally, the default values of these env vars will work out of the box. Detailed deployment instructions are available in Chapter 11 of the [LLM Engineer's Handbook](https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/).

#### MongoDB

We must change the `DATABASE_HOST` env var with the URL pointing to your cloud MongoDB cluster.

```env
DATABASE_HOST=your_mongodb_url
```

→ Check out this [tutorial](https://www.mongodb.com/resources/products/fundamentals/mongodb-cluster-setup) to learn how to create and host a MongoDB cluster for free.

#### Qdrant

Change `USE_QDRANT_CLOUD` to `true`, set `QDRANT_CLOUD_URL` to the URL pointing to your cloud Qdrant cluster, and set `QDRANT_APIKEY` to its API key.

```env
USE_QDRANT_CLOUD=true
QDRANT_CLOUD_URL=your_qdrant_cloud_url
QDRANT_APIKEY=your_qdrant_api_key
```

→ Check out this [tutorial](https://qdrant.tech/documentation/cloud/create-cluster/) to learn how to create a Qdrant cluster for free.

#### AWS

For your AWS setup to work correctly, you need the AWS CLI installed on your local machine and properly configured with an admin user (or a user with enough permissions to create new SageMaker, ECR, and S3 resources; using an admin user will make everything more straightforward).

Chapter 2 provides step-by-step instructions on how to install the AWS CLI, create an admin user on AWS, and get an access key to set up the `AWS_ACCESS_KEY` and `AWS_SECRET_KEY` environment variables. If you already have an AWS admin user in place, you have to configure the following env vars in your `.env` file:

```bash
AWS_REGION=eu-central-1 # Change it with your AWS region.
AWS_ACCESS_KEY=your_aws_access_key
AWS_SECRET_KEY=your_aws_secret_key
```

AWS credentials are typically stored in `~/.aws/credentials`. You can view this file directly using `cat` or similar commands:

```bash
cat ~/.aws/credentials
```

> [!IMPORTANT]
> Additional configuration options are available in [settings.py](https://github.com/PacktPublishing/LLM-Engineers-Handbook/blob/main/llm_engineering/settings.py). Any variable in the `Settings` class can be configured through the `.env` file.
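
As a hedged sketch of what such a settings layer can look like (field names below mirror the `.env` variables, but the actual `Settings` class in `settings.py` is the source of truth):

```python
# Minimal sketch of a pydantic-settings based configuration class, assuming the
# `pydantic-settings` package. Field names mirror the .env variables above; the
# real Settings class in llm_engineering/settings.py is the source of truth.
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")

    OPENAI_MODEL_ID: str = "gpt-4o-mini"
    OPENAI_API_KEY: str | None = None
    HUGGINGFACE_ACCESS_TOKEN: str | None = None
    COMET_API_KEY: str | None = None
    DATABASE_HOST: str = "mongodb://llm_engineering:llm_engineering@127.0.0.1:27017"
    USE_QDRANT_CLOUD: bool = False


settings = Settings()  # values from .env override the defaults above
```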

## 🏗️ Infrastructure

### Local infrastructure (for testing and development)

When running the project locally, we host a MongoDB and a Qdrant database using Docker. Additionally, a testing ZenML server is made available through their Python package.

> [!WARNING]
> You need Docker installed (>= v27.1.1).

For ease of use, you can start the whole local development infrastructure with the following command:
```bash
poetry poe local-infrastructure-up
```

Also, you can stop the ZenML server and all the Docker containers using the following command:
```bash
poetry poe local-infrastructure-down
```

> [!WARNING]
> When running on macOS, before starting the server, export the following environment variable:
> `export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`
> Otherwise, the connection between the local server and pipeline will break. 🔗 More details in [this issue](https://github.com/zenml-io/zenml/issues/2369).
> This is done by default when using Poe the Poet.

Start the real-time inference RESTful API:
```bash
poetry poe run-inference-ml-service
```

> [!IMPORTANT]
> The LLM microservice, called by the RESTful API, will work only after deploying the LLM to AWS SageMaker.

#### ZenML

Dashboard URL: `localhost:8237`

Default credentials:
- `username`: default
- `password`: (leave empty)

→ Find out more about using and setting up [ZenML](https://docs.zenml.io/).

#### Qdrant

REST API URL: `localhost:6333`

Dashboard URL: `localhost:6333/dashboard`

→ Find out more about using and setting up [Qdrant with Docker](https://qdrant.tech/documentation/quick-start/).
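
A hedged sketch for poking at the local vector store from Python (it assumes the `qdrant-client` package; the collections themselves are created by the pipelines):

```python
# Minimal sketch: connect to the local Qdrant container and list collections.
# Assumes the `qdrant-client` package; collections are created by the
# feature-engineering pipeline, so run it first for this to show anything.
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")
print(client.get_collections())
```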

#### MongoDB

Database URI: `mongodb://llm_engineering:llm_engineering@127.0.0.1:27017`

Database name: `twin`

Default credentials:
- `username`: llm_engineering
- `password`: llm_engineering

→ Find out more about using and setting up [MongoDB with Docker](https://www.mongodb.com/docs/manual/tutorial/install-mongodb-community-with-docker).

You can search your MongoDB collections using your IDE's MongoDB plugin (which you have to install separately), using the database URI above to connect to the MongoDB database hosted within the Docker container.
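
For a quick programmatic check from Python instead of an IDE plugin, a hedged sketch (it assumes the `pymongo` package):

```python
# Minimal sketch: connect to the local Dockerized MongoDB and list the
# collections in the `twin` database. Assumes the `pymongo` package.
from pymongo import MongoClient

client = MongoClient("mongodb://llm_engineering:llm_engineering@127.0.0.1:27017")
db = client["twin"]
print(db.list_collection_names())
```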

> [!IMPORTANT]
> Everything related to training or running the LLMs (e.g., training, evaluation, inference) can only be run if you set up AWS SageMaker, as explained in the next section on cloud infrastructure.

### Cloud infrastructure (for production)

Here we quickly present how to deploy the project to AWS and other serverless services. We won't go into the details (as everything is presented in the book) but only point out the main steps you have to go through.

First, reinstall your Python dependencies with the AWS group:
```bash
poetry install --with aws
```

#### AWS SageMaker

> [!NOTE]
> Chapter 10 provides step-by-step instructions in the section "Implementing the LLM microservice using AWS SageMaker".

By this point, we expect you to have the AWS CLI installed, and both the AWS CLI and your project's env vars (within the `.env` file) properly configured with an AWS admin user.

To ensure best practices, we must create a new AWS user restricted to creating and deleting only resources related to AWS SageMaker. Create it by running:
```bash
poetry poe create-sagemaker-role
```
It will create a `sagemaker_user_credentials.json` file at the root of your repository with your new `AWS_ACCESS_KEY` and `AWS_SECRET_KEY` values. **But before replacing your AWS credentials with the new ones, also run the following command to create the execution role (so it is created using your admin credentials).**

To create the IAM execution role used by AWS SageMaker to access other AWS resources on our behalf, run the following:
```bash
poetry poe create-sagemaker-execution-role
```
It will create a `sagemaker_execution_role.json` file at the root of your repository with your new `AWS_ARN_ROLE` value. Add it to your `.env` file.

Once you've updated the `AWS_ACCESS_KEY`, `AWS_SECRET_KEY`, and `AWS_ARN_ROLE` values in your `.env` file, you can use AWS SageMaker. **Note that this step is crucial to complete the AWS setup.**

#### Training

We start the training pipeline through ZenML by running the following:
```bash
poetry poe run-training-pipeline
```
This will start the training code using the configs from `configs/training.yaml` directly in SageMaker. You can visualize the results in Comet ML's dashboard.

We start the evaluation pipeline through ZenML by running the following:
```bash
poetry poe run-evaluation-pipeline
```
This will start the evaluation code using the configs from `configs/evaluating.yaml` directly in SageMaker. You can visualize the results in the `*-results` datasets saved to your Hugging Face profile.

#### Inference

To create an AWS SageMaker Inference Endpoint, run:
```bash
poetry poe deploy-inference-endpoint
```
To test it out, run:
```bash
poetry poe test-sagemaker-endpoint
```
To delete it, run:
```bash
poetry poe delete-inference-endpoint
```
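
If you want to call the deployed endpoint directly instead of going through the Poe task, here is a hedged sketch with boto3; the endpoint name and payload schema are assumptions, not the project's documented values:

```python
# Hypothetical sketch: invoke a deployed SageMaker endpoint with boto3.
# The endpoint name and payload schema below are assumptions; check the
# project's SageMaker-related settings for the real values.
import json

import boto3

runtime = boto3.client("sagemaker-runtime", region_name="eu-central-1")
response = runtime.invoke_endpoint(
    EndpointName="twin-llama-endpoint",  # hypothetical name
    ContentType="application/json",
    Body=json.dumps({"inputs": "Write a short post about RAG."}),
)
print(response["Body"].read().decode("utf-8"))
```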

#### AWS: ML pipelines, artifacts, and containers

The ML pipelines, artifacts, and containers are deployed to AWS by leveraging ZenML's deployment features. Thus, you must create an account with ZenML Cloud and follow their guide on deploying a ZenML stack to AWS. Otherwise, we provide step-by-step instructions in **Chapter 11**, section **Deploying the LLM Twin's pipelines to the cloud**, on what you must do.

#### Qdrant & MongoDB

We leverage Qdrant's and MongoDB's serverless options when deploying the project. Thus, you can either follow [Qdrant's](https://qdrant.tech/documentation/cloud/create-cluster/) and [MongoDB's](https://www.mongodb.com/resources/products/fundamentals/mongodb-cluster-setup) tutorials on how to create a freemium cluster for each, or go through **Chapter 11**, section **Deploying the LLM Twin's pipelines to the cloud**, and follow our step-by-step instructions.

#### GitHub Actions

We use GitHub Actions to implement our CI/CD pipelines. To implement your own, you have to fork our repository and set the following env vars as Actions secrets in your forked repository:
- `AWS_ACCESS_KEY_ID`
- `AWS_SECRET_ACCESS_KEY`
- `AWS_ECR_NAME`
- `AWS_REGION`

Also, we provide instructions on how to set everything up in **Chapter 11**, section **Adding LLMOps to the LLM Twin**.

#### Comet ML & Opik

You can visualize the results on their self-hosted dashboards if you create a Comet account and correctly set the `COMET_API_KEY` env var. As Opik is powered by Comet, you don't have to set up anything else besides Comet:
- [Comet ML (for experiment tracking)](https://www.comet.com/?utm_source=llm_handbook&utm_medium=github&utm_campaign=opik)
- [Opik (for prompt monitoring)](https://www.comet.com/opik?utm_source=llm_handbook&utm_medium=github&utm_campaign=opik)

### 💰 Running the Project Costs

We mostly stick to free tiers for all the services except AWS and OpenAI's API, which are both pay-as-you-go. The cost of running the project once, with our default values, is roughly ~$25 (most of it comes from using AWS SageMaker for training and inference).

## ⚡ Pipelines

All the ML pipelines are orchestrated behind the scenes by [ZenML](https://www.zenml.io/). A few exceptions exist when running utility scripts, such as exporting or importing from the data warehouse.

The ZenML pipelines are the entry point for most processes throughout this project. They are under the `pipelines/` folder. Thus, when you want to understand or debug a workflow, starting with the ZenML pipeline is the best approach.
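
For orientation, here is a hedged sketch of ZenML's decorator-based pipeline/step model; the step and pipeline names are illustrative, not the project's actual definitions:

```python
# Minimal sketch of ZenML's pipeline/step model, assuming the `zenml` package.
# Names are illustrative; see pipelines/ and steps/ for the real definitions.
from zenml import pipeline, step


@step
def load_documents() -> list[str]:
    # A real step would query the MongoDB data warehouse here.
    return ["raw document 1", "raw document 2"]


@step
def clean_documents(documents: list[str]) -> list[str]:
    return [doc.strip().lower() for doc in documents]


@pipeline
def toy_feature_engineering():
    documents = load_documents()
    clean_documents(documents)


if __name__ == "__main__":
    toy_feature_engineering()  # appears in the ZenML dashboard as a run
```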

To see the pipelines running and their results:
- go to your ZenML dashboard
- go to the `Pipelines` section
- click on a specific pipeline (e.g., `feature_engineering`)
- click on a specific run (e.g., `feature_engineering_run_2024_06_20_18_40_24`)
- click on a specific step or artifact of the DAG to find more details about it

Now, let's explore all the pipelines you can run. From data collection to training, we will present them in their natural order, going through the LLM project end-to-end.

### Data pipelines

Run the data collection ETL:
```bash
poetry poe run-digital-data-etl
```

> [!WARNING]
> You must have Chrome (or another Chromium-based browser) installed on your system for the LinkedIn and Medium crawlers to work (they use Selenium under the hood). Based on your Chrome version, the Chromedriver will be installed automatically to enable Selenium support. Another option is to run everything through our Docker image if you don't want to install Chrome. For example, to run all the pipelines combined, you can run `poetry poe run-docker-end-to-end-data-pipeline`. The command can be tweaked to support any other pipeline.
>
> If, for any other reason, you don't have a Chromium-based browser installed and don't want to use Docker, you have two other options to bypass this Selenium issue:
> - Comment out all the code related to Selenium, Chrome, and the links that are crawled with Selenium (e.g., Medium), such as the `chromedriver_autoinstaller.install()` command from [application.crawlers.base](https://github.com/PacktPublishing/LLM-Engineers-Handbook/blob/main/llm_engineering/application/crawlers/base.py) and other static calls that check for Chrome drivers and Selenium.
> - Install Google Chrome using your CLI in environments such as GitHub Codespaces or other cloud VMs, using the same command as in our [Docker file](https://github.com/PacktPublishing/LLM-Engineers-Handbook/blob/main/Dockerfile#L10).

To add additional links to collect from, go to `configs/digital_data_etl_[author_name].yaml` and add them to the `links` field. You can also create a completely new file and specify it at run time, like this: `python -m llm_engineering.interfaces.orchestrator.run --run-etl --etl-config-filename configs/digital_data_etl_[your_name].yaml`

Run the feature engineering pipeline:
```bash
poetry poe run-feature-engineering-pipeline
```

Generate the instruct dataset:
```bash
poetry poe run-generate-instruct-datasets-pipeline
```

Generate the preference dataset:
```bash
poetry poe run-generate-preference-datasets-pipeline
```

Run all of the above compressed into a single pipeline:
```bash
poetry poe run-end-to-end-data-pipeline
```

### Utility pipelines

Export the data from the data warehouse to JSON files:
```bash
poetry poe run-export-data-warehouse-to-json
```

Import data to the data warehouse from JSON files (by default, it imports the data from the `data/data_warehouse_raw_data` directory):
```bash
poetry poe run-import-data-warehouse-from-json
```

Export ZenML artifacts to JSON:
```bash
poetry poe run-export-artifact-to-json-pipeline
```

This will export the latest version of the following ZenML artifacts to the `output` folder as JSON files:
- cleaned_documents.json
- instruct_datasets.json
- preference_datasets.json
- raw_documents.json

You can configure which artifacts to export by tweaking the `configs/export_artifact_to_json.yaml` configuration file.
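
A hedged sketch for inspecting one of the exported artifacts (it assumes the export above has already run; the internal JSON structure is pipeline-defined, so treat the access pattern as illustrative):

```python
# Minimal sketch: load an exported ZenML artifact and report its size.
# Assumes `poetry poe run-export-artifact-to-json-pipeline` has already run;
# the internal JSON structure is pipeline-defined, so inspect before indexing.
import json
from pathlib import Path

artifact_path = Path("output") / "instruct_datasets.json"
with artifact_path.open() as f:
    data = json.load(f)

print(f"Loaded {artifact_path} ({artifact_path.stat().st_size} bytes)")
print(type(data))  # inspect the top-level structure before digging deeper
```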

### Training pipelines

Run the training pipeline:
```bash
poetry poe run-training-pipeline
```

Run the evaluation pipeline:
```bash
poetry poe run-evaluation-pipeline
```

> [!WARNING]
> For this to work, make sure you properly configured AWS SageMaker as described in [Set up cloud infrastructure (for production)](#set-up-cloud-infrastructure-for-production).

### Inference pipelines

Call the RAG retrieval module with a test query:
```bash
poetry poe call-rag-retrieval-module
```

Start the real-time inference RESTful API:
```bash
poetry poe run-inference-ml-service
```

Call the real-time inference RESTful API with a test query:
```bash
poetry poe call-inference-ml-service
```
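
If you prefer to hit the REST server directly rather than through the Poe task, here is a hedged sketch with `requests`; the port, route, and payload shape are assumptions, so check `tools/ml_service.py` for the real interface:

```python
# Hypothetical sketch: query the local inference REST API with `requests`.
# The port, route, and payload schema are assumptions, not the project's
# documented interface; see tools/ml_service.py for the actual definitions.
import requests

response = requests.post(
    "http://localhost:8000/rag",  # hypothetical route
    json={"query": "What is RAG?"},
    timeout=60,
)
response.raise_for_status()
print(response.json())
```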

Remember that you can monitor the prompt traces on [Opik](https://www.comet.com/opik).

> [!WARNING]
> For the inference service to work, you must have the LLM microservice deployed to AWS SageMaker, as explained in the setup cloud infrastructure section.

### Linting & formatting (QA)

Check or fix your linting issues:
```bash
poetry poe lint-check
poetry poe lint-fix
```

Check or fix your formatting issues:
```bash
poetry poe format-check
poetry poe format-fix
```

Check the code for leaked credentials:
```bash
poetry poe gitleaks-check
```

### Tests

Run all the tests using the following command:
```bash
poetry poe test
```

## 🏃 Run project

Based on the setup and usage steps described above, and assuming the local and cloud infrastructure works and the `.env` is filled as expected, follow the next steps to run the LLM system end-to-end:

### Data

1. Collect data: `poetry poe run-digital-data-etl`

2. Compute features: `poetry poe run-feature-engineering-pipeline`

3. Compute instruct dataset: `poetry poe run-generate-instruct-datasets-pipeline`

4. Compute preference alignment dataset: `poetry poe run-generate-preference-datasets-pipeline`

### Training

> [!IMPORTANT]
> From now on, for these steps to work, you need to properly set up AWS SageMaker, such as running `poetry install --with aws` and filling in the AWS-related environment variables and configs.

5. SFT fine-tune Llama 3.1: `poetry poe run-training-pipeline`

6. For DPO, go to `configs/training.yaml`, change `finetuning_type` to `dpo`, and run `poetry poe run-training-pipeline` again

7. Evaluate the fine-tuned models: `poetry poe run-evaluation-pipeline`

### Inference

> [!IMPORTANT]
> From now on, for these steps to work, you need to properly set up AWS SageMaker, such as running `poetry install --with aws` and filling in the AWS-related environment variables and configs.

8. Call only the RAG retrieval module: `poetry poe call-rag-retrieval-module`

9. Deploy the LLM Twin microservice to SageMaker: `poetry poe deploy-inference-endpoint`

10. Test the LLM Twin microservice: `poetry poe test-sagemaker-endpoint`

11. Start the end-to-end RAG server: `poetry poe run-inference-ml-service`

12. Test the RAG server: `poetry poe call-inference-ml-service`

## 📄 License

This course is an open-source project released under the MIT license. Thus, as long as you distribute our LICENSE and acknowledge our work, you can safely clone or fork this project and use it as a source of inspiration for whatever you want (e.g., university projects, college degree projects, personal projects, etc.).
clips/clip_eFgkZKhNUdM_1270_0.847.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e29cb777c6b6984f5599a455d760923147593388b1a9076e6f9afe0bbf679d3c
size 7315834
clips/clip_eFgkZKhNUdM_642_0.847.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a5653d4b661233328997880cd40df4aba4e74d3aca5198685f3b4aaaac0e03a3
|
3 |
+
size 1628484
|
clips/clip_eFgkZKhNUdM_874_0.838.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:be8086177badfba4783ccca4040c990ea0a16ebc8d33185dce7a354e43720317
|
3 |
+
size 3585253
|
code_snippets/03_custom_odm_example.py
ADDED
@@ -0,0 +1,10 @@
from llm_engineering.domain.documents import ArticleDocument, UserDocument

if __name__ == "__main__":
    user = UserDocument.get_or_create(first_name="Paul", last_name="Iusztin")
    articles = ArticleDocument.bulk_find(author_id=str(user.id))

    print(f"User ID: {user.id}")  # noqa
    print(f"User name: {user.first_name} {user.last_name}")  # noqa
    print(f"Number of articles: {len(articles)}")  # noqa
    print("First article link:", articles[0].link)  # noqa
code_snippets/03_orm.py
ADDED
@@ -0,0 +1,37 @@
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

# Create virtual environment, install dependencies and run the code:
# 1. Create: python3 -m venv orm_venv
# 2. Activate: source orm_venv/bin/activate
# 3. Install: pip install sqlalchemy==2.0.35
# 4. Run the code: python code_snippets/03_orm.py

if __name__ == "__main__":
    Base = declarative_base()

    # Define a class that maps to the users table.
    class User(Base):
        __tablename__ = "users"

        id = Column(Integer, primary_key=True)
        name = Column(String)

    # Create an SQLite database in memory.
    engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(engine)

    # Create a session used to interact with the database.
    Session = sessionmaker(bind=engine)
    session = Session()

    # Add a new user.
    new_user = User(name="Alice")
    session.add(new_user)
    session.commit()

    # Query the database.
    user = session.query(User).first()
    if user:
        print(f"User ID: {user.id}")  # noqa
        print(f"User name: {user.name}")  # noqa
code_snippets/08_instructor_embeddings.py
ADDED
@@ -0,0 +1,18 @@
from sentence_transformers import SentenceTransformer

# Create virtual environment, install dependencies and run the code:
# 1. Create: python3 -m venv instructor_venv
# 2. Activate: source instructor_venv/bin/activate
# 3. Install: pip install sentence-transformers==3.3.0
# 4. Run the code: python code_snippets/08_instructor_embeddings.py

if __name__ == "__main__":
    model = SentenceTransformer("hkunlp/instructor-base")

    sentence = "RAG Fundamentals First"

    instruction = "Represent the title of an article about AI:"

    embeddings = model.encode([[instruction, sentence]])
    print(embeddings.shape)  # noqa
    # Output: (1, 768)
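A hedged follow-up to the snippet above (not part of the uploaded file): with the same `hkunlp/instructor-base` model and `sentence-transformers==3.3.0`, the instructed embeddings can be compared directly, for example to check that a related article title scores higher than an unrelated one. The two extra titles are made up for illustration:

```python
from sentence_transformers import SentenceTransformer

if __name__ == "__main__":
    model = SentenceTransformer("hkunlp/instructor-base")

    instruction = "Represent the title of an article about AI:"
    titles = [
        "RAG Fundamentals First",
        "Advanced RAG Retrieval Techniques",  # hypothetical related title
        "A Field Guide to Sourdough Baking",  # hypothetical unrelated title
    ]

    # Encode each title together with the shared instruction.
    embeddings = model.encode([[instruction, title] for title in titles])

    # Cosine similarity of the first title against the other two; the
    # related title should score noticeably higher than the unrelated one.
    similarities = model.similarity(embeddings[0:1], embeddings[1:])
    print(similarities)  # noqa
```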
code_snippets/08_text_embeddings.py
ADDED
@@ -0,0 +1,28 @@
from sentence_transformers import SentenceTransformer

# Leverage the Poetry virtual environment to run the code:
# poetry run python code_snippets/08_text_embeddings.py

if __name__ == "__main__":
    # 1. Load a pretrained Sentence Transformer model.
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # The sentences to encode.
    sentences = ["The dog sits outside waiting for a treat.", "I am going swimming.", "The dog is swimming."]

    # 2. Calculate embeddings.
    embeddings = model.encode(sentences)
    print(embeddings.shape)  # noqa
    # Output: [3, 384]

    # 3. Calculate the embedding similarities using cosine similarity.
    similarities = model.similarity(embeddings, embeddings)
    print(similarities)  # noqa
    # Output:
    # tensor([[ 1.0000, -0.0389,  0.2692],
    #         [-0.0389,  1.0000,  0.3837],
    #         [ 0.2692,  0.3837,  1.0000]])
    #
    # similarities[0, 0] = The similarity between the first sentence and itself.
    # similarities[0, 1] = The similarity between the first and second sentence.
    # similarities[2, 1] = The similarity between the third and second sentence.
code_snippets/08_text_image_embeddings.py
ADDED
@@ -0,0 +1,37 @@
from io import BytesIO

import requests
from PIL import Image
from sentence_transformers import SentenceTransformer

# Leverage the Poetry virtual environment to run the code:
# poetry run python code_snippets/08_text_image_embeddings.py

if __name__ == "__main__":
    # Load an image with a crazy cat.
    response = requests.get(
        "https://github.com/PacktPublishing/LLM-Engineering/blob/main/images/crazy_cat.jpg?raw=true"
    )
    image = Image.open(BytesIO(response.content))

    # Load CLIP model.
    model = SentenceTransformer("clip-ViT-B-32")

    # Encode the loaded image.
    img_emb = model.encode(image)

    # Encode text descriptions.
    text_emb = model.encode(
        [
            "A crazy cat smiling.",
            "A white and brown cat with a yellow bandana.",
            "A man eating in the garden.",
        ]
    )
    print(text_emb.shape)  # noqa
    # Output: (3, 512)

    # Compute similarities.
    similarity_scores = model.similarity(img_emb, text_emb)
    print(similarity_scores)  # noqa
    # Output: tensor([[0.3068, 0.3300, 0.1719]])
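A small, hedged extension of the snippet above (not part of the uploaded file): given the similarity scores it prints, selecting the best-matching caption for the image is a single `argmax` over the score tensor. The scores below are copied from the snippet's own printed output:

```python
import torch

# Scores printed by code_snippets/08_text_image_embeddings.py.
similarity_scores = torch.tensor([[0.3068, 0.3300, 0.1719]])

captions = [
    "A crazy cat smiling.",
    "A white and brown cat with a yellow bandana.",
    "A man eating in the garden.",
]

# Index of the caption whose embedding is closest to the image embedding.
best = int(torch.argmax(similarity_scores, dim=1).item())
print(captions[best])  # noqa -> "A white and brown cat with a yellow bandana."
```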
configs/digital_data_etl_maxime_labonne.yaml
ADDED
@@ -0,0 +1,38 @@
settings:
  docker:
    parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
    skip_build: True
  orchestrator.sagemaker:
    synchronous: false

parameters:
  user_full_name: Maxime Labonne # [First Name(s)] [Last Name]
  links:
    # Personal Blog
    - https://mlabonne.github.io/blog/posts/2024-07-29_Finetune_Llama31.html
    - https://mlabonne.github.io/blog/posts/2024-07-15_The_Rise_of_Agentic_Data_Generation.html
    # Substack
    - https://maximelabonne.substack.com/p/uncensor-any-llm-with-abliteration-d30148b7d43e
    - https://maximelabonne.substack.com/p/create-mixtures-of-experts-with-mergekit-11b318c99562
    - https://maximelabonne.substack.com/p/merge-large-language-models-with-mergekit-2118fb392b54
    - https://maximelabonne.substack.com/p/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac
    - https://maximelabonne.substack.com/p/exllamav2-the-fastest-library-to-run-llms-32aeda294d26
    - https://maximelabonne.substack.com/p/quantize-llama-models-with-ggml-and-llama-cpp-3612dfbcc172
    - https://maximelabonne.substack.com/p/a-beginners-guide-to-llm-fine-tuning-4bae7d4da672
    - https://maximelabonne.substack.com/p/graph-convolutional-networks-introduction-to-gnns-24b3f60d6c95
    - https://maximelabonne.substack.com/p/4-bit-quantization-with-gptq-36b0f4f02c34
    - https://maximelabonne.substack.com/p/fine-tune-your-own-llama-2-model-in-a-colab-notebook-df9823a04a32
    - https://maximelabonne.substack.com/p/introduction-to-weight-quantization-2494701b9c0c
    - https://maximelabonne.substack.com/p/decoding-strategies-in-large-language-models-9733a8f70539
    - https://maximelabonne.substack.com/p/the-art-of-spending-optimizing-your-marketing-budget-with-nonlinear-optimization-6c8a39afb3c2
    - https://maximelabonne.substack.com/p/create-a-bot-to-find-diamonds-in-minecraft-d836606a993a
    - https://maximelabonne.substack.com/p/constraint-programming-67ac16fa0c81
    - https://maximelabonne.substack.com/p/how-to-design-the-most-powerful-graph-neural-network-3d18b07a6e66
    - https://maximelabonne.substack.com/p/introduction-to-graphsage-in-python-a9e7f9ecf9d7
    - https://maximelabonne.substack.com/p/graph-attention-networks-in-python-975736ac5c0c
    - https://maximelabonne.substack.com/p/integer-programming-vs-linear-programming-in-python-f1be5bb4e60e
    - https://maximelabonne.substack.com/p/introduction-to-linear-programming-in-python-9261e7eb44b
    - https://maximelabonne.substack.com/p/what-is-a-tensor-in-deep-learning-6dedd95d6507
    - https://maximelabonne.substack.com/p/efficiently-iterating-over-rows-in-a-pandas-dataframe-7dd5f9992c01
    - https://maximelabonne.substack.com/p/q-learning-for-beginners-2837b777741
    - https://maximelabonne.substack.com/p/how-to-start-machine-learning-for-developers-in-2022-390af12b193f
configs/digital_data_etl_paul_iusztin.yaml
ADDED
@@ -0,0 +1,62 @@
settings:
  docker:
    parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
    skip_build: True
  orchestrator.sagemaker:
    synchronous: false

parameters:
  user_full_name: Paul Iusztin # [First Name(s)] [Last Name]
  links:
    # Medium (only articles that are not under the paid wall work)
    - https://medium.com/decodingml/an-end-to-end-framework-for-production-ready-llm-systems-by-building-your-llm-twin-2cc6bb01141f
    - https://medium.com/decodingml/a-real-time-retrieval-system-for-rag-on-social-media-data-9cc01d50a2a0
    - https://medium.com/decodingml/sota-python-streaming-pipelines-for-fine-tuning-llms-and-rag-in-real-time-82eb07795b87
    - https://medium.com/decodingml/the-4-advanced-rag-algorithms-you-must-know-to-implement-5d0c7f1199d2
    - https://medium.com/decodingml/architect-scalable-and-cost-effective-llm-rag-inference-pipelines-73b94ef82a99
    # Substack
    - https://decodingml.substack.com/p/real-time-feature-pipelines-with?r=1ttoeh
    - https://decodingml.substack.com/p/building-ml-systems-the-right-way?r=1ttoeh
    - https://decodingml.substack.com/p/reduce-your-pytorchs-code-latency?r=1ttoeh
    - https://decodingml.substack.com/p/llm-agents-demystified?r=1ttoeh
    - https://decodingml.substack.com/p/scalable-rag-ingestion-pipeline-using?r=1ttoeh
    - https://decodingml.substack.com/p/the-ultimate-mlops-tool?r=1ttoeh
    - https://decodingml.substack.com/p/the-new-king-of-infrastructure-as?r=1ttoeh
    - https://decodingml.substack.com/p/highly-scalable-data-ingestion-architecture?r=1ttoeh
    - https://decodingml.substack.com/p/2-key-llmops-concepts?r=1ttoeh
    - https://decodingml.substack.com/p/the-llm-twin-free-course-on-production?r=1ttoeh
    - https://decodingml.substack.com/p/a-blueprint-for-designing-production?r=1ttoeh
    - https://decodingml.substack.com/p/the-difference-between-development?r=1ttoeh
    - https://decodingml.substack.com/p/architect-scalable-and-cost-effective?r=1ttoeh
    - https://decodingml.substack.com/p/7-tips-to-reduce-your-vram-when-training?r=1ttoeh
    - https://decodingml.substack.com/p/using-this-python-package-you-can?r=1ttoeh
    - https://decodingml.substack.com/p/the-4-advanced-rag-algorithms-you?r=1ttoeh
    - https://decodingml.substack.com/p/problems-deploying-your-ml-models?r=1ttoeh
    - https://decodingml.substack.com/p/sota-python-streaming-pipelines-for?r=1ttoeh
    - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh
    - https://decodingml.substack.com/p/my-ml-monthly-learning-resource-recommendations?r=1ttoeh
    - https://decodingml.substack.com/p/an-end-to-end-framework-for-production?r=1ttoeh
    - https://decodingml.substack.com/p/upskill-your-llm-knowledge-base-with?r=1ttoeh
    - https://decodingml.substack.com/p/want-to-learn-an-end-to-end-framework?r=1ttoeh
    - https://decodingml.substack.com/p/my-favorite-way-to-implement-a-configuration?r=1ttoeh
    - https://decodingml.substack.com/p/a-real-time-retrieval-system-for?r=1ttoeh
    - https://decodingml.substack.com/p/4-key-decoding-strategies-for-llms?r=1ttoeh
    - https://decodingml.substack.com/p/dml-new-year-the-new-and-improved?r=1ttoeh
    - https://decodingml.substack.com/p/dml-8-types-of-mlops-tools-that-must?r=1ttoeh
    - https://decodingml.substack.com/p/dml-this-is-what-you-need-to-build?r=1ttoeh
    - https://decodingml.substack.com/p/dml-7-steps-on-how-to-fine-tune-an?r=1ttoeh
    - https://decodingml.substack.com/p/dml-how-do-you-generate-a-q-and-a?r=1ttoeh
    - https://decodingml.substack.com/p/dml-what-do-you-need-to-fine-tune?r=1ttoeh
    - https://decodingml.substack.com/p/dml-why-and-when-do-you-need-to-fine?r=1ttoeh
    - https://decodingml.substack.com/p/dml-how-to-implement-a-streaming?r=1ttoeh
    - https://decodingml.substack.com/p/dml-why-and-what-do-you-need-a-streaming?r=1ttoeh
    - https://decodingml.substack.com/p/dml-unwrapping-the-3-pipeline-design?r=1ttoeh
    - https://decodingml.substack.com/p/dml-how-to-design-an-llm-system-for?r=1ttoeh
    - https://decodingml.substack.com/p/dml-synced-vector-dbs-a-guide-to?r=1ttoeh
    - https://decodingml.substack.com/p/dml-what-is-the-difference-between?r=1ttoeh
    - https://decodingml.substack.com/p/dml-7-steps-to-build-a-production?r=1ttoeh
    - https://decodingml.substack.com/p/dml-chain-of-thought-reasoning-write?r=1ttoeh
    - https://decodingml.substack.com/p/dml-build-and-serve-a-production?r=1ttoeh
    - https://decodingml.substack.com/p/dml-4-key-ideas-you-must-know-to?r=1ttoeh
    - https://decodingml.substack.com/p/dml-how-to-add-real-time-monitoring?r=1ttoeh
    - https://decodingml.substack.com/p/dml-top-6-ml-platform-features-you?r=1ttoeh
configs/end_to_end_data.yaml
ADDED
@@ -0,0 +1,87 @@
settings:
  docker:
    parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
    skip_build: True
  orchestrator.sagemaker:
    synchronous: false

parameters:
  # Data ETL & Feature engineering pipelines parameters
  author_links:
    - user_full_name: Paul Iusztin # [First Name(s)] [Last Name]
      links:
        # Medium (only articles that are not under the paid wall work)
        - https://medium.com/decodingml/an-end-to-end-framework-for-production-ready-llm-systems-by-building-your-llm-twin-2cc6bb01141f
        - https://medium.com/decodingml/a-real-time-retrieval-system-for-rag-on-social-media-data-9cc01d50a2a0
        - https://medium.com/decodingml/sota-python-streaming-pipelines-for-fine-tuning-llms-and-rag-in-real-time-82eb07795b87
        - https://medium.com/decodingml/the-4-advanced-rag-algorithms-you-must-know-to-implement-5d0c7f1199d2
        - https://medium.com/decodingml/architect-scalable-and-cost-effective-llm-rag-inference-pipelines-73b94ef82a99
        # Substack
        - https://decodingml.substack.com/p/a-blueprint-for-designing-production?r=1ttoeh
        - https://decodingml.substack.com/p/the-difference-between-development?r=1ttoeh
        - https://decodingml.substack.com/p/architect-scalable-and-cost-effective?r=1ttoeh
        - https://decodingml.substack.com/p/7-tips-to-reduce-your-vram-when-training?r=1ttoeh
        - https://decodingml.substack.com/p/using-this-python-package-you-can?r=1ttoeh
        - https://decodingml.substack.com/p/the-4-advanced-rag-algorithms-you?r=1ttoeh
        - https://decodingml.substack.com/p/problems-deploying-your-ml-models?r=1ttoeh
        - https://decodingml.substack.com/p/sota-python-streaming-pipelines-for?r=1ttoeh
        - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh
        - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh
        - https://decodingml.substack.com/p/my-ml-monthly-learning-resource-recommendations?r=1ttoeh
        - https://decodingml.substack.com/p/an-end-to-end-framework-for-production?r=1ttoeh
        - https://decodingml.substack.com/p/upskill-your-llm-knowledge-base-with?r=1ttoeh
        - https://decodingml.substack.com/p/want-to-learn-an-end-to-end-framework?r=1ttoeh
        - https://decodingml.substack.com/p/my-favorite-way-to-implement-a-configuration?r=1ttoeh
        - https://decodingml.substack.com/p/a-real-time-retrieval-system-for?r=1ttoeh
        - https://decodingml.substack.com/p/4-key-decoding-strategies-for-llms?r=1ttoeh
        - https://decodingml.substack.com/p/dml-new-year-the-new-and-improved?r=1ttoeh
        - https://decodingml.substack.com/p/dml-8-types-of-mlops-tools-that-must?r=1ttoeh
        - https://decodingml.substack.com/p/dml-this-is-what-you-need-to-build?r=1ttoeh
        - https://decodingml.substack.com/p/dml-7-steps-on-how-to-fine-tune-an?r=1ttoeh
        - https://decodingml.substack.com/p/dml-how-do-you-generate-a-q-and-a?r=1ttoeh
        - https://decodingml.substack.com/p/dml-what-do-you-need-to-fine-tune?r=1ttoeh
        - https://decodingml.substack.com/p/dml-why-and-when-do-you-need-to-fine?r=1ttoeh
        - https://decodingml.substack.com/p/dml-how-to-implement-a-streaming?r=1ttoeh
        - https://decodingml.substack.com/p/dml-why-and-what-do-you-need-a-streaming?r=1ttoeh
        - https://decodingml.substack.com/p/dml-unwrapping-the-3-pipeline-design?r=1ttoeh
        - https://decodingml.substack.com/p/dml-how-to-design-an-llm-system-for?r=1ttoeh
        - https://decodingml.substack.com/p/dml-synced-vector-dbs-a-guide-to?r=1ttoeh
        - https://decodingml.substack.com/p/dml-what-is-the-difference-between?r=1ttoeh
        - https://decodingml.substack.com/p/dml-7-steps-to-build-a-production?r=1ttoeh
        - https://decodingml.substack.com/p/dml-chain-of-thought-reasoning-write?r=1ttoeh
        - https://decodingml.substack.com/p/dml-build-and-serve-a-production?r=1ttoeh
        - https://decodingml.substack.com/p/dml-4-key-ideas-you-must-know-to?r=1ttoeh
        - https://decodingml.substack.com/p/dml-how-to-add-real-time-monitoring?r=1ttoeh
        - https://decodingml.substack.com/p/dml-top-6-ml-platform-features-you?r=1ttoeh
    - user_full_name: Maxime Labonne # [First Name(s)] [Last Name]
      links:
        # Substack
        - https://maximelabonne.substack.com/p/uncensor-any-llm-with-abliteration-d30148b7d43e
        - https://maximelabonne.substack.com/p/create-mixtures-of-experts-with-mergekit-11b318c99562
        - https://maximelabonne.substack.com/p/merge-large-language-models-with-mergekit-2118fb392b54
        - https://maximelabonne.substack.com/p/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac
        - https://maximelabonne.substack.com/p/exllamav2-the-fastest-library-to-run-llms-32aeda294d26
        - https://maximelabonne.substack.com/p/quantize-llama-models-with-ggml-and-llama-cpp-3612dfbcc172
        - https://maximelabonne.substack.com/p/a-beginners-guide-to-llm-fine-tuning-4bae7d4da672
        - https://maximelabonne.substack.com/p/graph-convolutional-networks-introduction-to-gnns-24b3f60d6c95
        - https://maximelabonne.substack.com/p/4-bit-quantization-with-gptq-36b0f4f02c34
        - https://maximelabonne.substack.com/p/fine-tune-your-own-llama-2-model-in-a-colab-notebook-df9823a04a32
        - https://maximelabonne.substack.com/p/introduction-to-weight-quantization-2494701b9c0c
        - https://maximelabonne.substack.com/p/decoding-strategies-in-large-language-models-9733a8f70539
        - https://maximelabonne.substack.com/p/the-art-of-spending-optimizing-your-marketing-budget-with-nonlinear-optimization-6c8a39afb3c2
        - https://maximelabonne.substack.com/p/create-a-bot-to-find-diamonds-in-minecraft-d836606a993a
        - https://maximelabonne.substack.com/p/constraint-programming-67ac16fa0c81
        - https://maximelabonne.substack.com/p/how-to-design-the-most-powerful-graph-neural-network-3d18b07a6e66
        - https://maximelabonne.substack.com/p/introduction-to-graphsage-in-python-a9e7f9ecf9d7
        - https://maximelabonne.substack.com/p/graph-attention-networks-in-python-975736ac5c0c
        - https://maximelabonne.substack.com/p/integer-programming-vs-linear-programming-in-python-f1be5bb4e60e
        - https://maximelabonne.substack.com/p/introduction-to-linear-programming-in-python-9261e7eb44b
        - https://maximelabonne.substack.com/p/what-is-a-tensor-in-deep-learning-6dedd95d6507
        - https://maximelabonne.substack.com/p/efficiently-iterating-over-rows-in-a-pandas-dataframe-7dd5f9992c01
        - https://maximelabonne.substack.com/p/q-learning-for-beginners-2837b777741
        - https://maximelabonne.substack.com/p/how-to-start-machine-learning-for-developers-in-2022-390af12b193f
  # Generate instruct dataset pipeline parameters
  test_split_size: 0.1
  push_to_huggingface: false
  dataset_id: pauliusztin/llmtwin
  mock: false
configs/evaluating.yaml
ADDED
@@ -0,0 +1,9 @@
settings:
  docker:
    parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
    skip_build: True
  orchestrator.sagemaker:
    synchronous: false

parameters:
  is_dummy: true # Change this to 'false' to run the evaluation on the full dataset.
configs/export_artifact_to_json.yaml
ADDED
@@ -0,0 +1,13 @@
settings:
  docker:
    parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
    skip_build: True
  orchestrator.sagemaker:
    synchronous: false

parameters:
  artifact_names:
    - raw_documents
    - cleaned_documents
    - instruct_datasets
    - preference_datasets
configs/feature_engineering.yaml
ADDED
@@ -0,0 +1,11 @@
settings:
  docker:
    parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
    skip_build: True
  orchestrator.sagemaker:
    synchronous: false

parameters:
  author_full_names:
    - Maxime Labonne
    - Paul Iusztin
configs/generate_instruct_datasets.yaml
ADDED
@@ -0,0 +1,13 @@
settings:
  docker:
    parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
    skip_build: True
  orchestrator.sagemaker:
    synchronous: false

parameters:
  test_split_size: 0.1
  dataset_type: "instruction"
  push_to_huggingface: true
  dataset_id: pauliusztin/llmtwin
  mock: false
configs/generate_preference_datasets.yaml
ADDED
@@ -0,0 +1,13 @@
settings:
  docker:
    parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
    skip_build: True
  orchestrator.sagemaker:
    synchronous: false

parameters:
  test_split_size: 0.05
  dataset_type: "preference"
  push_to_huggingface: true
  dataset_id: pauliusztin/llmtwin-dpo
  mock: false
configs/training.yaml
ADDED
@@ -0,0 +1,14 @@
settings:
  docker:
    parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
    skip_build: True
  orchestrator.sagemaker:
    synchronous: false

parameters:
  finetuning_type: sft
  num_train_epochs: 3
  per_device_train_batch_size: 2
  learning_rate: 3e-4
  dataset_huggingface_workspace: mlabonne
  is_dummy: true # Change this to 'false' to run the training with the full dataset and epochs.
data/artifacts/cleaned_documents.json
ADDED
The diff for this file is too large to render. See raw diff
data/artifacts/instruct_datasets.json
ADDED
The diff for this file is too large to render. See raw diff
data/artifacts/preference_datasets.json
ADDED
The diff for this file is too large to render. See raw diff
data/artifacts/raw_documents.json
ADDED
The diff for this file is too large to render. See raw diff
data/data_warehouse_raw_data/ArticleDocument.json
ADDED
The diff for this file is too large to render. See raw diff
data/data_warehouse_raw_data/PostDocument.json
ADDED
@@ -0,0 +1 @@
[]
data/data_warehouse_raw_data/RepositoryDocument.json
ADDED
@@ -0,0 +1 @@
[]
data/data_warehouse_raw_data/UserDocument.json
ADDED
@@ -0,0 +1 @@
[{"first_name": "Maxime", "last_name": "Labonne", "_id": "eff74089-0271-4319-8543-745c087f4f61"}, {"first_name": "Paul", "last_name": "Iusztin", "_id": "b5fa1f08-75f0-402d-8e88-d1357e346d9e"}]
demonstration.ipynb
ADDED
@@ -0,0 +1,1027 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "OBCrx5fSG4Qm"
   },
   "source": [
    "# CS-UY 4613: Project\n",
    "\n",
    "Yufei Zhen\n",
    "\n",
    "macOS: Ventura 13.3.1 (a), GPU: Apple M2 Max"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IptBGhoVG790"
   },
   "source": [
    "## Setup\n",
    "\n",
    "* video source: [https://www.youtube.com/@pantelism](https://www.youtube.com/@pantelism)\n",
    "\n",
    "* **option 1** (repository source: [https://github.com/PacktPublishing/LLM-Engineers-Handbook](https://github.com/PacktPublishing/LLM-Engineers-Handbook))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "8i3CcnpG_VPn",
    "outputId": "597a492a-6305-43a6-e94e-b74fa8a12d7b"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cloning into 'LLM-Engineers-Handbook'...\n",
      "remote: Enumerating objects: 1970, done.\u001b[K\n",
      "remote: Counting objects: 100% (515/515), done.\u001b[K\n",
      "remote: Compressing objects: 100% (138/138), done.\u001b[K\n",
      "remote: Total 1970 (delta 414), reused 377 (delta 377), pack-reused 1455 (from 2)\u001b[K\n",
      "Receiving objects: 100% (1970/1970), 4.77 MiB | 21.22 MiB/s, done.\n",
      "Resolving deltas: 100% (1263/1263), done.\n"
     ]
    }
   ],
   "source": [
    "# !git clone https://github.com/PacktPublishing/LLM-Engineers-Handbook.git"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !poetry env use 3.11\n",
    "# !poetry install --without aws\n",
    "# !poetry run pre-commit install"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MPS available: True\n",
      "CUDA available: False\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "print(f\"MPS available: {torch.backends.mps.is_available()}\")\n",
    "print(f\"CUDA available: {torch.cuda.is_available()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ufyNDhgOYiUh"
   },
   "source": [
    "## RAG Architecture\n",
    "\n",
    "- Integrating into [https://github.com/PacktPublishing/LLM-Engineers-Handbook/tree/main/llm_engineering/application/rag](https://github.com/PacktPublishing/LLM-Engineers-Handbook/tree/main/llm_engineering/application/rag):\n",
    "\n",
    "- Directory overview: \n",
    "\n",
    "```\n",
    ".\n",
    "├── ... \n",
    "├── clips/ # Generated video clip responses\n",
    "├── llm_engineering/ # Core project package\n",
    "│   ├── application/\n",
    "│   │   ├── ...\n",
    "│   │   ├── rag # Main RAG architecture\n",
    "│   │   │   ├── __init__.py\n",
    "│   │   │   ├── base.py\n",
    "│   │   │   ├── multimodel_dispatcher.py (new)\n",
    "│   │   │   ├── pipeline.py (new)\n",
    "│   │   │   ├── prompt_templates.py\n",
    "│   │   │   ├── query_expansion.py\n",
    "│   │   │   ├── reranking.py\n",
    "│   │   │   ├── retriever.py (modified)\n",
    "│   │   │   ├── self_query.py\n",
    "│   │   │   ├── topic_retriever.py (new)\n",
    "│   │   │   ├── video_ingester.py (new)\n",
    "│   │   │   ├── video_processor.py (new)\n",
    "│   ├── domain/\n",
    "│   │   ├── ...\n",
    "│   │   ├── queries.py (modified)\n",
    "│   │   ├── video_chunks.py (new)\n",
    "├── demonstration.ipynb (YOU'RE HERE)\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Video Ingestion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "video_db = \"/Users/yufeizhen/Desktop/project/videos\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-05-04 03:25:21.777\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mllm_engineering.settings\u001b[0m:\u001b[36mload_settings\u001b[0m:\u001b[36m94\u001b[0m - \u001b[1mLoading settings from the ZenML secret store.\u001b[0m\n",
      "\u001b[32m2025-05-04 03:25:21.929\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mllm_engineering.settings\u001b[0m:\u001b[36mload_settings\u001b[0m:\u001b[36m99\u001b[0m - \u001b[33m\u001b[1mFailed to load settings from the ZenML secret store. Defaulting to loading the settings from the '.env' file.\u001b[0m\n",
      "\u001b[32m2025-05-04 03:25:22.015\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mllm_engineering.infrastructure.db.mongo\u001b[0m:\u001b[36m__new__\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mConnection to MongoDB with URI successful: mongodb://llm_engineering:[email protected]:27017\u001b[0m\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1;35mPyTorch version 2.2.2 available.\u001b[0m\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-05-04 03:25:23.410\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mllm_engineering.infrastructure.db.qdrant\u001b[0m:\u001b[36m__new__\u001b[0m:\u001b[36m29\u001b[0m - \u001b[1mConnection to Qdrant DB with URI successful: str\u001b[0m\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1;35mLoad pretrained SentenceTransformer: all-MiniLM-L6-v2\u001b[0m\n",
      "Initializing fallback TextEmbedder\n",
      "\u001b[1;35mLoad pretrained SentenceTransformer: all-MiniLM-L6-v2\u001b[0m\n",
      "Loading CLIP model: openai/clip-vit-base-patch32\n",
      "CLIP model loaded successfully\n",
      "Initialized embedders\n",
      "Loaded NLP model\n",
      "Loaded BERTopic\n",
      "Processing videos from: /Users/yufeizhen/Desktop/project/videos\n",
      "Already processed 8 videos\n",
      "Previously processed videos:\n",
      " - 9CGGh6ivg68\n",
      " - FCQ-rih6cHY\n",
      " - TV-DjM8242s\n",
      " - WXoOohWU28Y\n",
      " - eFgkZKhNUdM\n",
      " - eQ6UE968Xe4\n",
      " - lb_5AdUpfuA\n",
      " - rCVlIVKqqGE\n",
      "Found 8 video folders\n",
      "Will process 0 videos (8 skipped)\n",
      "Skipping TV-DjM8242s (already processed)\n",
      "Skipping eFgkZKhNUdM (already processed)\n",
      "Skipping eQ6UE968Xe4 (already processed)\n",
      "Skipping rCVlIVKqqGE (already processed)\n",
      "Skipping lb_5AdUpfuA (already processed)\n",
      "Skipping FCQ-rih6cHY (already processed)\n",
      "Skipping 9CGGh6ivg68 (already processed)\n",
      "Skipping WXoOohWU28Y (already processed)\n",
      "\n",
      "All videos processed!\n",
      "Total processed videos: 8\n"
     ]
    }
   ],
   "source": [
    "from llm_engineering.application.rag.video_ingester import VideoIngester\n",
    "\n",
    "ingester = VideoIngester(video_root=video_db)\n",
    "# ingester.process_video_library(force_reprocess=True)\n",
    "ingester.process_video_library()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total stored vectors: 403\n"
     ]
    }
   ],
   "source": [
    "from qdrant_client import QdrantClient\n",
    "\n",
    "client = QdrantClient(path=\"/Users/yufeizhen/Desktop/project/qdrant_storage\")\n",
    "print(\"Total stored vectors:\", client.count(\"video_chunks\").count)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Video Q&A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initializing VideoQAEngine\n",
      "Video root: /Users/yufeizhen/Desktop/project/videos\n",
      "Qdrant storage path: /Users/yufeizhen/Desktop/project/qdrant_storage\n",
      "Connected to Qdrant storage at: /Users/yufeizhen/Desktop/project/qdrant_storage\n",
      "Available collections: collections=[CollectionDescription(name='video_chunks')]\n",
      "Found video_chunks collection with 403 points\n",
      "Initializing fallback TextEmbedder\n",
      "\u001b[1;35mLoad pretrained SentenceTransformer: all-MiniLM-L6-v2\u001b[0m\n",
      "Loading CLIP model: openai/clip-vit-base-patch32\n",
      "CLIP model loaded successfully\n",
      "VideoQAEngine initialized successfully\n"
     ]
    }
   ],
   "source": [
    "from llm_engineering.application.rag.pipeline import VideoQAEngine\n",
    "\n",
    "engine = VideoQAEngine(video_root=video_db)\n",
    "\n",
    "def respond(question):\n",
    "    clips = engine.ask(question)\n",
    "    return [(str(clip[\"path\"]), f\"Relevance: {clip['score']:.2f}\") for clip in clips]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"Using only the videos, explain the the binary cross entropy loss function.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "--- Processing query: 'Using only the videos, explain the the binary cross entropy loss function.' ---\n",
      "Retrieving relevant video segments...\n",
      "Encoding query with CLIP: 'Using only the videos, explain the the binary cros...'\n",
      "Cleaned text for CLIP: Using only the videos, explain the the binary cros...\n",
      "Query embedded successfully\n",
      "Sending search request to Qdrant (attempt 1/5)\n",
      "Creating fresh connection to Qdrant...\n",
      "Search successful, found 3 results\n",
      "Retrieval completed in 0.07 seconds\n",
      "Found 3 relevant video segments\n",
      "\n",
      "Processing result 1/3:\n",
      " Video ID: eFgkZKhNUdM\n",
      " Timestamps: 1270.0s - 1302.0s\n",
      " Score: 0.8472\n",
      " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
      " Creating clip to: clips/clip_eFgkZKhNUdM_1270_0.847.mp4\n",
      " Clip created successfully\n",
      "\n",
      "Processing result 2/3:\n",
      " Video ID: eFgkZKhNUdM\n",
      " Timestamps: 642.0s - 647.0s\n",
      " Score: 0.8467\n",
      " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
      " Creating clip to: clips/clip_eFgkZKhNUdM_642_0.847.mp4\n",
      " Clip created successfully\n",
      "\n",
      "Processing result 3/3:\n",
      " Video ID: eFgkZKhNUdM\n",
      " Timestamps: 874.0s - 882.0s\n",
      " Score: 0.8379\n",
      " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
      " Creating clip to: clips/clip_eFgkZKhNUdM_874_0.838.mp4\n",
      " Clip created successfully\n",
      "\n",
      "Processed 3 clips successfully\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('clips/clip_eFgkZKhNUdM_1270_0.847.mp4', 'Relevance: 0.85'),\n",
       " ('clips/clip_eFgkZKhNUdM_642_0.847.mp4', 'Relevance: 0.85'),\n",
       " ('clips/clip_eFgkZKhNUdM_874_0.838.mp4', 'Relevance: 0.84')]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "respond(question)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Gradio App"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1;35mHTTP Request: GET \u001b[0m\u001b[34mhttps://api.gradio.app/pkg-version\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "import gradio as gr\n",
    "\n",
    "interface = gr.Interface(\n",
    "    fn=respond,\n",
    "    inputs=gr.Textbox(label=\"Ask about the video content\"),\n",
    "    outputs=gr.Gallery(label=\"Relevant Video Clips\"),\n",
    "    examples=[\n",
    "        [\"Using only the videos, explain how ResNets work.\"],\n",
    "        [\"Using only the videos, explain the advantages of CNNs over fully connected networks.\"],\n",
    "        [\"Using only the videos, explain the the binary cross entropy loss function.\"]\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "* Running on local URL: http://127.0.0.1:7860\n",
      "\u001b[1;35mHTTP Request: GET \u001b[0m\u001b[34mhttp://127.0.0.1:7860/gradio_api/startup-events\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n",
      "\u001b[1;35mHTTP Request: HEAD \u001b[0m\u001b[34mhttp://127.0.0.1:7860/\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n",
      "\u001b[1;35mHTTP Request: GET \u001b[0m\u001b[34mhttps://api.gradio.app/v3/tunnel-request\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n",
      "* Running on public URL: https://382d4d0bacff86ee02.gradio.live\n",
      "\n",
      "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n",
      "\u001b[1;35mHTTP Request: HEAD \u001b[0m\u001b[34mhttps://382d4d0bacff86ee02.gradio.live\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"https://382d4d0bacff86ee02.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "--- Processing query: 'Using only the videos, explain the the binary cross entropy loss function.' ---\n",
      "Retrieving relevant video segments...\n",
      "Encoding query with CLIP: 'Using only the videos, explain the the binary cros...'\n",
      "Cleaned text for CLIP: Using only the videos, explain the the binary cross entropy loss function....\n",
      "Cleaned text for CLIP: Using only the videos, explain the the binary cros...\n",
      "Query embedded successfully\n",
      "Sending search request to Qdrant (attempt 1/5)\n",
      "Search successful, found 3 results\n",
      "Retrieval completed in 0.34 seconds\n",
      "Found 3 relevant video segments\n",
      "\n",
      "Processing result 1/3:\n",
      " Video ID: eFgkZKhNUdM\n",
      " Timestamps: 1270.0s - 1302.0s\n",
      " Score: 0.8472\n",
      " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
      " Creating clip to: clips/clip_eFgkZKhNUdM_1270_0.847.mp4\n",
      " Clip created successfully\n",
      "\n",
      "Processing result 2/3:\n",
      " Video ID: eFgkZKhNUdM\n",
      " Timestamps: 642.0s - 647.0s\n",
      " Score: 0.8467\n",
      " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
      " Creating clip to: clips/clip_eFgkZKhNUdM_642_0.847.mp4\n",
      " Clip created successfully\n",
      "\n",
      "Processing result 3/3:\n",
      " Video ID: eFgkZKhNUdM\n",
      " Timestamps: 874.0s - 882.0s\n",
      " Score: 0.8379\n",
      " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
      " Creating clip to: clips/clip_eFgkZKhNUdM_874_0.838.mp4\n",
      " Clip created successfully\n",
      "\n",
      "Processed 3 clips successfully\n"
     ]
    }
   ],
   "source": [
    "interface.launch(share=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initializing VideoQAEngine\n",
      "Video root: /Users/yufeizhen/Desktop/project/videos\n",
      "Qdrant storage path: /Users/yufeizhen/Desktop/project/qdrant_storage\n",
      "Connected to Qdrant storage at: /Users/yufeizhen/Desktop/project/qdrant_storage\n",
      "Available collections: collections=[CollectionDescription(name='video_chunks')]\n",
      "Found video_chunks collection with 403 points\n",
      "Initializing fallback TextEmbedder\n",
      "\u001b[1;35mLoad pretrained SentenceTransformer: all-MiniLM-L6-v2\u001b[0m\n",
      "Loading CLIP model: openai/clip-vit-base-patch32\n",
      "CLIP model loaded successfully\n",
      "VideoQAEngine initialized successfully\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1;35mHTTP Request: GET \u001b[0m\u001b[34mhttps://api.gradio.app/pkg-version\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "import gradio as gr\n",
    "from llm_engineering.application.rag.pipeline import VideoQAEngine\n",
    "\n",
    "# Initialize the VideoQAEngine with the video database\n",
    "video_db = \"/Users/yufeizhen/Desktop/project/videos\"\n",
    "engine = VideoQAEngine(video_root=video_db)\n",
    "\n",
    "# Define the chat function that processes messages and returns relevant video clips\n",
    "def chat(message, history):\n",
    "    # Process message to get relevant clips\n",
    "    clips = engine.ask(message)\n",
    "    \n",
    "    # Format for display\n",
    "    clips_gallery = [(str(clip[\"path\"]), \"Relevance: {:.2f}\".format(clip['score'])) for clip in clips]\n",
    "    \n",
    "    # Return both a text response and the clips\n",
    "    return \"Here are the relevant video clips for: '{}'\".format(message), clips_gallery\n",
    "\n",
    "# Create a more flexible interface using Blocks\n",
    "with gr.Blocks(theme=\"soft\") as demo:\n",
    "    gr.Markdown(\"# Chat with your Video Library\")\n",
    "    gr.Markdown(\"Ask questions about the video content and get relevant clips. You can continue the conversation with follow-up questions.\")\n",
    "    \n",
    "    # Create chatbot for conversation history\n",
    "    chatbot = gr.Chatbot(height=300)\n",
    "    \n",
    "    # Create gallery to display video clips\n",
    "    gallery = gr.Gallery(label=\"Relevant Video Clips\", show_label=True)\n",
    "    \n",
    "    # Create message input\n",
    "    msg = gr.Textbox(\n",
    "        placeholder=\"Ask about the video content...\", \n",
    "        label=\"Your Question\",\n",
    "        show_label=False\n",
    "    )\n",
    "    \n",
    "    # Define clear button\n",
    "    clear = gr.Button(\"Clear\")\n",
    "    \n",
    "    # Example questions\n",
    "    gr.Examples(\n",
    "        examples=[\n",
    "            \"Using only the videos, explain how ResNets work.\",\n",
    "            \"Using only the videos, explain the advantages of CNNs over fully connected networks.\",\n",
    "            \"Using only the videos, explain the the binary cross entropy loss function.\"\n",
    "        ],\n",
    "        inputs=msg\n",
    "    )\n",
    "    \n",
    "    # Define the chat function that updates both chatbot and gallery\n",
    "    def respond(message, chat_history):\n",
    "        # Get text response and clips\n",
    "        response, clips = chat(message, chat_history)\n",
    "        \n",
    "        # Update chat history\n",
    "        chat_history.append((message, response))\n",
    "        \n",
    "        # Return updated chat history and gallery\n",
    "        return \"\", chat_history, clips\n",
    "    \n",
    "    # Set up the event handlers\n",
    "    msg.submit(respond, [msg, chatbot], [msg, chatbot, gallery])\n",
    "    clear.click(lambda: ([], [], None), None, [chatbot, gallery, msg])"
   ]
573 |
+
},
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "* Running on local URL: http://127.0.0.1:7861\n",
      "HTTP Request: GET http://127.0.0.1:7861/gradio_api/startup-events \"HTTP/1.1 200 OK\"\n",
      "HTTP Request: HEAD http://127.0.0.1:7861/ \"HTTP/1.1 200 OK\"\n",
      "HTTP Request: GET https://api.gradio.app/v3/tunnel-request \"HTTP/1.1 200 OK\"\n",
      "* Running on public URL: https://48d861a2319613eb9b.gradio.live\n",
      "\n",
      "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n",
      "HTTP Request: HEAD https://48d861a2319613eb9b.gradio.live \"HTTP/1.1 200 OK\"\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"https://48d861a2319613eb9b.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "--- Processing query: 'Using only the videos, explain the the binary cross entropy loss function.' ---\n",
      "Retrieving relevant video segments...\n",
      "Encoding query with CLIP: 'Using only the videos, explain the the binary cros...'\n",
      "Cleaned text for CLIP: Using only the videos, explain the the binary cros...\n",
      "Query embedded successfully\n",
      "Sending search request to Qdrant (attempt 1/5)\n",
      "Creating fresh connection to Qdrant...\n",
      "Search successful, found 3 results\n",
      "Retrieval completed in 0.07 seconds\n",
      "Found 3 relevant video segments\n",
      "\n",
      "Processing result 1/3:\n",
      "  Video ID: eFgkZKhNUdM\n",
      "  Timestamps: 1270.0s - 1302.0s\n",
      "  Score: 0.8472\n",
      "  Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
      "  Creating clip to: clips/clip_eFgkZKhNUdM_1270_0.847.mp4\n"
     ]
    }
   ],
   "source": [
    "demo.launch(share=True)"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [
    "gFdZON-DKOlx",
    "KVkt7HBUKTig",
    "d9KkwKhjKXgr",
    "6NQUu-rUKfln"
   ],
   "gpuType": "A100",
   "machine_shape": "hm",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "llm-engineering---hH_lZm-py3.11",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  },
  "vscode": {
   "interpreter": {
    "hash": "b62b85bf5dbe6dfb396652aed826e60c089a2288076109f466cd63de941fd51e"
   }
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "...": "auto-generated ipywidgets state omitted: HBox/HTML/FloatProgress models with their Layout/Style entries for a single 'Batches: 100%' progress bar ending at ' 1/1 [00:00<00:00,  3.12it/s]'"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
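For readers skimming the raw .ipynb JSON above, here is a minimal, self-contained sketch of the Gradio wiring that cell builds. It assumes a `chat(message, history)` function returning a text answer plus a list of clip paths; the notebook's actual `chat` does CLIP + Qdrant retrieval, which the stand-in below does not reproduce.

# Minimal sketch of the notebook's Gradio UI (not the full RAG pipeline).
import gradio as gr

def chat(message, chat_history):
    # Stand-in for the notebook's CLIP + Qdrant retrieval function.
    return f"(answer to: {message})", []

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    gallery = gr.Gallery(label="Retrieved clips")
    msg = gr.Textbox(placeholder="Ask about your video library")
    clear = gr.Button("Clear")

    def respond(message, chat_history):
        response, clips = chat(message, chat_history)
        chat_history.append((message, response))
        return "", chat_history, clips

    msg.submit(respond, [msg, chatbot], [msg, chatbot, gallery])
    clear.click(lambda: ([], [], None), None, [chatbot, gallery, msg])

demo.launch(share=True)  # prints local and public gradio.live URLs, as in the cell output above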
docker-compose.yml
ADDED
@@ -0,0 +1,40 @@
services:
  mongo:
    image: mongo:latest
    container_name: "llm_engineering_mongo"
    logging:
      options:
        max-size: 1g
    environment:
      MONGO_INITDB_ROOT_USERNAME: "llm_engineering"
      MONGO_INITDB_ROOT_PASSWORD: "llm_engineering"
    ports:
      - 27017:27017
    volumes:
      - mongo_data:/data/db
    networks:
      - local
    restart: always

  qdrant:
    image: qdrant/qdrant:latest
    container_name: "llm_engineering_qdrant"
    ports:
      - 6333:6333
      - 6334:6334
    expose:
      - 6333
      - 6334
    volumes:
      - qdrant_data:/qdrant/storage
    networks:
      - local
    restart: always

volumes:
  mongo_data:
  qdrant_data:

networks:
  local:
    driver: bridge
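A quick sanity check for the two services after `docker compose up -d` — a sketch assuming `pymongo` and `qdrant-client` are installed; credentials and ports are taken verbatim from the compose file above.

# Ping the services defined in docker-compose.yml.
from pymongo import MongoClient
from qdrant_client import QdrantClient

mongo = MongoClient("mongodb://llm_engineering:llm_engineering@localhost:27017")
print(mongo.admin.command("ping"))   # {'ok': 1.0} once Mongo is healthy

qdrant = QdrantClient(url="http://localhost:6333")
print(qdrant.get_collections())      # empty collection list on a fresh qdrant_data volume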
en_core_web_sm-3.7.0-py3-none-any.whl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6215d71a3212690e9aec49408a27e3fe6ad7cd6c715476e93d70dc784041e93e
size 12803377
images/cover_plus.png
ADDED
Git LFS Details
images/crazy_cat.jpg
ADDED
Git LFS Details
llm_engineering/__init__.py
ADDED
@@ -0,0 +1,4 @@
from llm_engineering import application, domain, infrastructure
from llm_engineering.settings import settings

__all__ = ["settings", "application", "domain", "infrastructure"]
llm_engineering/application/__init__.py
ADDED
@@ -0,0 +1,3 @@
from . import utils

__all__ = ["utils"]
llm_engineering/application/crawlers/__init__.py
ADDED
@@ -0,0 +1,6 @@
from .dispatcher import CrawlerDispatcher
from .github import GithubCrawler
from .linkedin import LinkedInCrawler
from .medium import MediumCrawler

__all__ = ["CrawlerDispatcher", "GithubCrawler", "LinkedInCrawler", "MediumCrawler"]
llm_engineering/application/crawlers/base.py
ADDED
@@ -0,0 +1,66 @@
import time
from abc import ABC, abstractmethod
from tempfile import mkdtemp

import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from llm_engineering.domain.documents import NoSQLBaseDocument

# Check if the current version of chromedriver exists
# and if it doesn't exist, download it automatically,
# then add chromedriver to path
chromedriver_autoinstaller.install()


class BaseCrawler(ABC):
    model: type[NoSQLBaseDocument]

    @abstractmethod
    def extract(self, link: str, **kwargs) -> None: ...


class BaseSeleniumCrawler(BaseCrawler, ABC):
    def __init__(self, scroll_limit: int = 5) -> None:
        options = webdriver.ChromeOptions()

        options.add_argument("--no-sandbox")
        options.add_argument("--headless=new")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--log-level=3")
        options.add_argument("--disable-popup-blocking")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-background-networking")
        options.add_argument("--ignore-certificate-errors")
        options.add_argument(f"--user-data-dir={mkdtemp()}")
        options.add_argument(f"--data-path={mkdtemp()}")
        options.add_argument(f"--disk-cache-dir={mkdtemp()}")
        options.add_argument("--remote-debugging-port=9226")

        self.set_extra_driver_options(options)

        self.scroll_limit = scroll_limit
        self.driver = webdriver.Chrome(
            options=options,
        )

    def set_extra_driver_options(self, options: Options) -> None:
        pass

    def login(self) -> None:
        pass

    def scroll_page(self) -> None:
        """Scroll through the page until its height stops growing or the scroll limit is reached."""
        current_scroll = 0
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height or (self.scroll_limit and current_scroll >= self.scroll_limit):
                break
            last_height = new_height
            current_scroll += 1
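To illustrate how the two hooks (`set_extra_driver_options`, `extract`) and the inherited `scroll_page` compose, here is a hypothetical subclass; `ExampleCrawler`, its model, and the target page are placeholders, not part of this repo.

# Hypothetical BaseSeleniumCrawler subclass (illustration only).
from selenium.webdriver.chrome.options import Options

from llm_engineering.application.crawlers.base import BaseSeleniumCrawler


class ExampleCrawler(BaseSeleniumCrawler):
    model = None  # would be a NoSQLBaseDocument subclass, e.g. ArticleDocument

    def set_extra_driver_options(self, options: Options) -> None:
        # Hook invoked by __init__ before the Chrome driver is created.
        options.add_argument("--window-size=1920,1080")

    def extract(self, link: str, **kwargs) -> None:
        self.driver.get(link)
        self.scroll_page()  # inherited: scrolls until the page height stops growing
        html = self.driver.page_source
        self.driver.quit()
        # ...parse `html` and save an instance of self.model here...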
llm_engineering/application/crawlers/custom_article.py
ADDED
@@ -0,0 +1,54 @@
from urllib.parse import urlparse

from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers.html2text import Html2TextTransformer
from loguru import logger

from llm_engineering.domain.documents import ArticleDocument

from .base import BaseCrawler


class CustomArticleCrawler(BaseCrawler):
    model = ArticleDocument

    def __init__(self) -> None:
        super().__init__()

    def extract(self, link: str, **kwargs) -> None:
        old_model = self.model.find(link=link)
        if old_model is not None:
            logger.info(f"Article already exists in the database: {link}")

            return

        logger.info(f"Starting to scrape article: {link}")

        loader = AsyncHtmlLoader([link])
        docs = loader.load()

        html2text = Html2TextTransformer()
        docs_transformed = html2text.transform_documents(docs)
        doc_transformed = docs_transformed[0]

        content = {
            "Title": doc_transformed.metadata.get("title"),
            "Subtitle": doc_transformed.metadata.get("description"),
            "Content": doc_transformed.page_content,
            "language": doc_transformed.metadata.get("language"),
        }

        parsed_url = urlparse(link)
        platform = parsed_url.netloc

        user = kwargs["user"]
        instance = self.model(
            content=content,
            link=link,
            platform=platform,
            author_id=user.id,
            author_full_name=user.full_name,
        )
        instance.save()

        logger.info(f"Finished scraping custom article: {link}")
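A sketch of a single call — the `user` object's shape (`.id`, `.full_name`) is inferred from the attribute access above, and `SimpleNamespace` stands in for the repo's UserDocument; the URL is illustrative.

# Hypothetical invocation of CustomArticleCrawler (illustration only).
from types import SimpleNamespace

from llm_engineering.application.crawlers.custom_article import CustomArticleCrawler

user = SimpleNamespace(id="some-uuid", full_name="Jane Doe")  # stand-in for UserDocument
CustomArticleCrawler().extract("https://example.com/some-post", user=user)
# Loads the page with AsyncHtmlLoader, strips HTML via Html2TextTransformer,
# and saves an ArticleDocument unless one already exists for this link.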
llm_engineering/application/crawlers/dispatcher.py
ADDED
@@ -0,0 +1,51 @@
import re
from urllib.parse import urlparse

from loguru import logger

from .base import BaseCrawler
from .custom_article import CustomArticleCrawler
from .github import GithubCrawler
from .linkedin import LinkedInCrawler
from .medium import MediumCrawler


class CrawlerDispatcher:
    def __init__(self) -> None:
        self._crawlers = {}

    @classmethod
    def build(cls) -> "CrawlerDispatcher":
        dispatcher = cls()

        return dispatcher

    def register_medium(self) -> "CrawlerDispatcher":
        self.register("https://medium.com", MediumCrawler)

        return self

    def register_linkedin(self) -> "CrawlerDispatcher":
        self.register("https://linkedin.com", LinkedInCrawler)

        return self

    def register_github(self) -> "CrawlerDispatcher":
        self.register("https://github.com", GithubCrawler)

        return self

    def register(self, domain: str, crawler: type[BaseCrawler]) -> None:
        parsed_domain = urlparse(domain)
        domain = parsed_domain.netloc

        self._crawlers[r"https://(www\.)?{}/*".format(re.escape(domain))] = crawler

    def get_crawler(self, url: str) -> BaseCrawler:
        for pattern, crawler in self._crawlers.items():
            if re.match(pattern, url):
                return crawler()
        else:
            logger.warning(f"No crawler found for {url}. Defaulting to CustomArticleCrawler.")

            return CustomArticleCrawler()
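The `register_*` methods return `self`, so the dispatcher is meant to be chained; a usage sketch (the URLs are illustrative):

# Builder-style usage implied by the fluent register_* methods.
from llm_engineering.application.crawlers.dispatcher import CrawlerDispatcher

dispatcher = CrawlerDispatcher.build().register_linkedin().register_medium().register_github()

crawler = dispatcher.get_crawler("https://github.com/someuser/somerepo")
print(type(crawler).__name__)  # GithubCrawler
crawler = dispatcher.get_crawler("https://example.com/an-article")
print(type(crawler).__name__)  # CustomArticleCrawler (for/else fallback for unknown domains)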
llm_engineering/application/crawlers/github.py
ADDED
@@ -0,0 +1,68 @@
import os
import shutil
import subprocess
import tempfile

from loguru import logger

from llm_engineering.domain.documents import RepositoryDocument

from .base import BaseCrawler


class GithubCrawler(BaseCrawler):
    model = RepositoryDocument

    def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
        super().__init__()
        self._ignore = ignore

    def extract(self, link: str, **kwargs) -> None:
        old_model = self.model.find(link=link)
        if old_model is not None:
            logger.info(f"Repository already exists in the database: {link}")

            return

        logger.info(f"Starting to scrape GitHub repository: {link}")

        repo_name = link.rstrip("/").split("/")[-1]

        local_temp = tempfile.mkdtemp()

        try:
            os.chdir(local_temp)
            subprocess.run(["git", "clone", link])

            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])  # noqa: PTH118

            tree = {}
            for root, _, files in os.walk(repo_path):
                dir = root.replace(repo_path, "").lstrip("/")
                if dir.startswith(self._ignore):
                    continue

                for file in files:
                    if file.endswith(self._ignore):
                        continue
                    file_path = os.path.join(dir, file)  # noqa: PTH118
                    with open(os.path.join(root, file), "r", errors="ignore") as f:  # noqa: PTH123, PTH118
                        tree[file_path] = f.read().replace(" ", "")

            user = kwargs["user"]
            instance = self.model(
                content=tree,
                name=repo_name,
                link=link,
                platform="github",
                author_id=user.id,
                author_full_name=user.full_name,
            )
            instance.save()

        except Exception:
            raise
        finally:
            shutil.rmtree(local_temp)

        logger.info(f"Finished scraping GitHub repository: {link}")
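Note that `self._ignore` is a tuple, which works because `str.startswith`/`str.endswith` accept tuples of prefixes/suffixes. A sketch of a call (the repo URL and user are illustrative):

# Hypothetical invocation of GithubCrawler (illustration only).
from types import SimpleNamespace

from llm_engineering.application.crawlers.github import GithubCrawler

user = SimpleNamespace(id="some-uuid", full_name="Jane Doe")  # stand-in for UserDocument
GithubCrawler().extract("https://github.com/octocat/Hello-World", user=user)
# Clones the repo into a temp dir, walks it while skipping paths that start or
# end with (".git", ".toml", ".lock", ".png"), saves a RepositoryDocument whose
# content is a {path: text} tree, then removes the temp dir in the finally block.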