purpleriann committed on
Commit a22e84b · verified · 1 Parent(s): 66e87fd

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .env.example +28 -0
  2. .gitattributes +7 -0
  3. .github/workflows/cd.yaml +43 -0
  4. .github/workflows/ci.yaml +69 -0
  5. .gitignore +177 -0
  6. .gradio/certificate.pem +31 -0
  7. .pre-commit-config.yaml +10 -0
  8. .python-version +1 -0
  9. .vscode/settings.json +15 -0
  10. Chat with your Video Library – Engineering AI Agents.pdf +3 -0
  11. Dockerfile +47 -0
  12. LICENSE +21 -0
  13. README.md +661 -7
  14. clips/clip_eFgkZKhNUdM_1270_0.847.mp4 +3 -0
  15. clips/clip_eFgkZKhNUdM_642_0.847.mp4 +3 -0
  16. clips/clip_eFgkZKhNUdM_874_0.838.mp4 +3 -0
  17. code_snippets/03_custom_odm_example.py +10 -0
  18. code_snippets/03_orm.py +37 -0
  19. code_snippets/08_instructor_embeddings.py +18 -0
  20. code_snippets/08_text_embeddings.py +28 -0
  21. code_snippets/08_text_image_embeddings.py +37 -0
  22. configs/digital_data_etl_maxime_labonne.yaml +38 -0
  23. configs/digital_data_etl_paul_iusztin.yaml +62 -0
  24. configs/end_to_end_data.yaml +87 -0
  25. configs/evaluating.yaml +9 -0
  26. configs/export_artifact_to_json.yaml +13 -0
  27. configs/feature_engineering.yaml +11 -0
  28. configs/generate_instruct_datasets.yaml +13 -0
  29. configs/generate_preference_datasets.yaml +13 -0
  30. configs/training.yaml +14 -0
  31. data/artifacts/cleaned_documents.json +0 -0
  32. data/artifacts/instruct_datasets.json +0 -0
  33. data/artifacts/preference_datasets.json +0 -0
  34. data/artifacts/raw_documents.json +0 -0
  35. data/data_warehouse_raw_data/ArticleDocument.json +0 -0
  36. data/data_warehouse_raw_data/PostDocument.json +1 -0
  37. data/data_warehouse_raw_data/RepositoryDocument.json +1 -0
  38. data/data_warehouse_raw_data/UserDocument.json +1 -0
  39. demonstration.ipynb +1027 -0
  40. docker-compose.yml +40 -0
  41. en_core_web_sm-3.7.0-py3-none-any.whl +3 -0
  42. images/cover_plus.png +3 -0
  43. images/crazy_cat.jpg +3 -0
  44. llm_engineering/__init__.py +4 -0
  45. llm_engineering/application/__init__.py +3 -0
  46. llm_engineering/application/crawlers/__init__.py +6 -0
  47. llm_engineering/application/crawlers/base.py +66 -0
  48. llm_engineering/application/crawlers/custom_article.py +54 -0
  49. llm_engineering/application/crawlers/dispatcher.py +51 -0
  50. llm_engineering/application/crawlers/github.py +68 -0
.env.example ADDED
@@ -0,0 +1,28 @@
1
+ # --- Required settings even when working locally. ---
2
+
3
+ # OpenAI API Config
4
+ OPENAI_MODEL_ID=gpt-4o-mini
5
+ OPENAI_API_KEY=str
6
+
7
+ # Huggingface API Config
8
+ HUGGINGFACE_ACCESS_TOKEN=str
9
+
10
+ # Comet ML (during training and inference)
11
+ COMET_API_KEY=str
12
+
13
+ # --- Required settings when deploying the code. ---
14
+ # --- Otherwise, default values work fine. ---
15
+
16
+ # MongoDB database
17
+ DATABASE_HOST="mongodb://llm_engineering:[email protected]:27017"
18
+
19
+ # Qdrant vector database
20
+ USE_QDRANT_CLOUD=false
21
+ QDRANT_CLOUD_URL=str
22
+ QDRANT_APIKEY=str
23
+
24
+ # AWS Authentication
25
+ AWS_ARN_ROLE=str
26
+ AWS_REGION=eu-central-1
27
+ AWS_ACCESS_KEY=str
28
+ AWS_SECRET_KEY=str
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Chat[[:space:]]with[[:space:]]your[[:space:]]Video[[:space:]]Library[[:space:]]–[[:space:]]Engineering[[:space:]]AI[[:space:]]Agents.pdf filter=lfs diff=lfs merge=lfs -text
37
+ clips/clip_eFgkZKhNUdM_1270_0.847.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ clips/clip_eFgkZKhNUdM_642_0.847.mp4 filter=lfs diff=lfs merge=lfs -text
39
+ clips/clip_eFgkZKhNUdM_874_0.838.mp4 filter=lfs diff=lfs merge=lfs -text
40
+ en_core_web_sm-3.7.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
41
+ images/cover_plus.png filter=lfs diff=lfs merge=lfs -text
42
+ images/crazy_cat.jpg filter=lfs diff=lfs merge=lfs -text
.github/workflows/cd.yaml ADDED
@@ -0,0 +1,43 @@
1
+ name: CD
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ concurrency:
9
+ group: ${{ github.workflow }}-${{ github.ref }}
10
+ cancel-in-progress: true
11
+
12
+ jobs:
13
+ build:
14
+ name: Build & Push Docker Image
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - name: Checkout Code
18
+ uses: actions/checkout@v3
19
+
20
+ - name: Set up Docker Buildx
21
+ uses: docker/setup-buildx-action@v3
22
+
23
+ - name: Configure AWS credentials
24
+ uses: aws-actions/configure-aws-credentials@v1
25
+ with:
26
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
27
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
28
+ aws-region: ${{ secrets.AWS_REGION }}
29
+
30
+ - name: Login to Amazon ECR
31
+ id: login-ecr
32
+ uses: aws-actions/amazon-ecr-login@v1
33
+
34
+ - name: Build images & push to ECR
35
+ id: build-image
36
+ uses: docker/build-push-action@v6
37
+ with:
38
+ context: .
39
+ file: ./Dockerfile
40
+ tags: |
41
+ ${{ steps.login-ecr.outputs.registry }}/${{ secrets.AWS_ECR_NAME }}:${{ github.sha }}
42
+ ${{ steps.login-ecr.outputs.registry }}/${{ secrets.AWS_ECR_NAME }}:latest
43
+ push: true
.github/workflows/ci.yaml ADDED
@@ -0,0 +1,69 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+
6
+ concurrency:
7
+ group: ${{ github.workflow }}-${{ github.ref }}
8
+ cancel-in-progress: true
9
+
10
+ jobs:
11
+ qa:
12
+ name: QA
13
+ runs-on: ubuntu-latest
14
+
15
+ steps:
16
+ - name: Checkout
17
+ uses: actions/checkout@v3
18
+
19
+ - name: Setup Python
20
+ uses: actions/setup-python@v3
21
+ with:
22
+ python-version: "3.11"
23
+
24
+ - name: Install poetry
25
+ uses: abatilo/actions-poetry@v2
26
+ with:
27
+ poetry-version: 1.8.3
28
+
29
+ - name: Install packages
30
+ run: |
31
+ poetry install --only dev
32
+ poetry self add 'poethepoet[poetry_plugin]'
33
+
34
+ - name: gitleaks check
35
+ run: poetry poe gitleaks-check
36
+
37
+ - name: Lint check [Python]
38
+ run: poetry poe lint-check
39
+
40
+ - name: Format check [Python]
41
+ run: poetry poe format-check
42
+
43
+ test:
44
+ name: Test
45
+ runs-on: ubuntu-latest
46
+
47
+ steps:
48
+ - name: Checkout
49
+ uses: actions/checkout@v3
50
+
51
+ - name: Setup Python
52
+ uses: actions/setup-python@v3
53
+ with:
54
+ python-version: "3.11"
55
+
56
+ - name: Install poetry
57
+ uses: abatilo/actions-poetry@v2
58
+ with:
59
+ poetry-version: 1.8.3
60
+
61
+ - name: Install packages
62
+ run: |
63
+ poetry install
64
+ poetry self add 'poethepoet[poetry_plugin]'
65
+
66
+ - name: Run tests
67
+ run: |
68
+ echo "Running tests..."
69
+ poetry poe test
.gitignore ADDED
@@ -0,0 +1,177 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # IDEs
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
163
+
164
+ # MacOs
165
+ .DS_Store
166
+
167
+ # VS Code
168
+ .vscode/**/launch.json
169
+
170
+ # Data
171
+ output/
172
+ sagemaker_*.json
173
+ run_ids.txt
174
+
175
+ # Virtual environments
176
+ *_venv
177
+ *_myenv
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
.pre-commit-config.yaml ADDED
@@ -0,0 +1,10 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.3.5
4
+ hooks:
5
+ - id: ruff # Run the linter.
6
+ - id: ruff-format # Run the formatter.
7
+ - repo: https://github.com/gitleaks/gitleaks
8
+ rev: v8.18.2
9
+ hooks:
10
+ - id: gitleaks
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.11.8
.vscode/settings.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "[python]": {
3
+ "editor.formatOnSave": true,
4
+ "editor.codeActionsOnSave": {
5
+ "source.fixAll": "explicit",
6
+ "source.organizeImports": "explicit"
7
+ },
8
+ "editor.defaultFormatter": "charliermarsh.ruff"
9
+ },
10
+ "notebook.formatOnSave.enabled": true,
11
+ "notebook.codeActionsOnSave": {
12
+ "notebook.source.fixAll": "explicit",
13
+ "notebook.source.organizeImports": "explicit"
14
+ },
15
+ }
Chat with your Video Library – Engineering AI Agents.pdf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c32250914f76da5d6294d4dd86e6540cff5278bee30cbc4e4d1571fe26403c46
3
+ size 2216057
Dockerfile ADDED
@@ -0,0 +1,47 @@
1
+ FROM python:3.11-slim-bullseye AS release
2
+
3
+ ENV WORKSPACE_ROOT=/app/
4
+ ENV PYTHONDONTWRITEBYTECODE=1
5
+ ENV PYTHONUNBUFFERED=1
6
+ ENV POETRY_VERSION=1.8.3
7
+ ENV DEBIAN_FRONTEND=noninteractive
8
+ ENV POETRY_NO_INTERACTION=1
9
+
10
+ # Install Google Chrome
11
+ RUN apt-get update -y && \
12
+ apt-get install -y gnupg wget curl --no-install-recommends && \
13
+ wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-linux-signing-key.gpg && \
14
+ echo "deb [signed-by=/usr/share/keyrings/google-linux-signing-key.gpg] https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
15
+ apt-get update -y && \
16
+ apt-get install -y google-chrome-stable && \
17
+ rm -rf /var/lib/apt/lists/*
18
+
19
+ # Install other system dependencies.
20
+ RUN apt-get update -y \
21
+ && apt-get install -y --no-install-recommends build-essential \
22
+ gcc \
23
+ python3-dev \
25
+ libglib2.0-dev \
26
+ libnss3-dev \
27
+ && apt-get clean \
28
+ && rm -rf /var/lib/apt/lists/*
29
+
30
+ # Install Poetry using pip and clear cache
31
+ RUN pip install --no-cache-dir "poetry==$POETRY_VERSION"
32
+ RUN poetry config installer.max-workers 20
33
+
34
+ WORKDIR $WORKSPACE_ROOT
35
+
36
+ # Copy the poetry lock file and pyproject.toml file to install dependencies
37
+ COPY pyproject.toml poetry.lock $WORKSPACE_ROOT
38
+
39
+ # Install the dependencies and clear cache
40
+ RUN poetry config virtualenvs.create false && \
41
+ poetry install --no-root --no-interaction --no-cache --without dev && \
42
+ poetry self add 'poethepoet[poetry_plugin]' && \
43
+ rm -rf ~/.cache/pypoetry/cache/ && \
44
+ rm -rf ~/.cache/pypoetry/artifacts/
45
+
46
+ # Copy the rest of the code.
47
+ COPY . $WORKSPACE_ROOT
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Packt
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,666 @@
1
  ---
2
- title: LLM Engineers Handbook
3
- emoji: 🦀
4
- colorFrom: indigo
5
- colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.29.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: LLM-Engineers-Handbook
3
+ app_file: demonstration.ipynb
4
  sdk: gradio
5
  sdk_version: 5.29.0
6
  ---
7
+ <div align="center">
8
+ <h1>👷 LLM Engineer's Handbook</h1>
9
+ <p class="tagline">Official repository of the <a href="https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/">LLM Engineer's Handbook</a> by <a href="https://github.com/iusztinpaul">Paul Iusztin</a> and <a href="https://github.com/mlabonne">Maxime Labonne</a></p>
10
+ </div>
11
+ </br>
12
 
13
+ <p align="center">
14
+ <a href="https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/">
15
+ <img src="images/cover_plus.png" alt="Book cover">
16
+ </a>
17
+ </p>
18
+
19
+ <p align="center">
20
+ Find the book on <a href="https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/">Amazon</a> or <a href="https://www.packtpub.com/en-us/product/llm-engineers-handbook-9781836200062">Packt</a>
21
+ </p>
22
+
23
+ ## 🌟 Features
24
+
25
+ The goal of this book is to teach you how to build your own end-to-end LLM-based system using best practices:
26
+
27
+ - 📝 Data collection & generation
28
+ - 🔄 LLM training pipeline
29
+ - 📊 Simple RAG system
30
+ - 🚀 Production-ready AWS deployment
31
+ - 🔍 Comprehensive monitoring
32
+ - 🧪 Testing and evaluation framework
33
+
34
+ You can download and use the final trained model on [Hugging Face](https://huggingface.co/mlabonne/TwinLlama-3.1-8B-DPO).
35
+
36
+ > [!IMPORTANT]
37
+ > The code in this GitHub repository is actively maintained and may contain updates not reflected in the book. **Always refer to this repository for the latest version of the code.**
38
+
39
+ ## 🔗 Dependencies
40
+
41
+ ### Local dependencies
42
+
43
+ To install and run the project locally, you need the following dependencies.
44
+
45
+ | Tool | Version | Purpose | Installation Link |
46
+ |------|---------|---------|------------------|
47
+ | pyenv | ≥2.3.36 | Multiple Python versions (optional) | [Install Guide](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) |
48
+ | Python | 3.11 | Runtime environment | [Download](https://www.python.org/downloads/) |
49
+ | Poetry | >= 1.8.3 and < 2.0 | Package management | [Install Guide](https://python-poetry.org/docs/#installation) |
50
+ | Docker | ≥27.1.1 | Containerization | [Install Guide](https://docs.docker.com/engine/install/) |
51
+ | AWS CLI | ≥2.15.42 | Cloud management | [Install Guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) |
52
+ | Git | ≥2.44.0 | Version control | [Download](https://git-scm.com/downloads) |
53
+
54
+ ### Cloud services
55
+
56
+ The code also uses and depends on the following cloud services. You don't have to do anything yet; the installation and deployment sections will guide you through setting them up:
57
+
58
+ | Service | Purpose |
59
+ |---------|---------|
60
+ | [HuggingFace](https://huggingface.com/) | Model registry |
61
+ | [Comet ML](https://www.comet.com/site/products/opik/?utm_source=llm_handbook&utm_medium=github&utm_campaign=opik) | Experiment tracker |
62
+ | [Opik](https://www.comet.com/site/products/opik/?utm_source=llm_handbook&utm_medium=github&utm_campaign=opik) | Prompt monitoring |
63
+ | [ZenML](https://www.zenml.io/) | Orchestrator and artifacts layer |
64
+ | [AWS](https://aws.amazon.com/) | Compute and storage |
65
+ | [MongoDB](https://www.mongodb.com/) | NoSQL database |
66
+ | [Qdrant](https://qdrant.tech/) | Vector database |
67
+ | [GitHub Actions](https://github.com/features/actions) | CI/CD pipeline |
68
+
69
+ In the [LLM Engineer's Handbook](https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/), Chapter 2 will walk you through each tool. Chapters 10 and 11 provide step-by-step guides on how to set up everything you need.
70
+
71
+ ## 🗂️ Project Structure
72
+
73
+ Here is the directory overview:
74
+
75
+ ```bash
76
+ .
77
+ ├── code_snippets/ # Standalone example code
78
+ ├── configs/ # Pipeline configuration files
79
+ ├── llm_engineering/ # Core project package
80
+ │ ├── application/
81
+ │ ├── domain/
82
+ │ ├── infrastructure/
83
+ │ ├── model/
84
+ ├── pipelines/ # ML pipeline definitions
85
+ ├── steps/ # Pipeline components
86
+ ├── tests/ # Test examples
87
+ ├── tools/ # Utility scripts
88
+ │ ├── run.py
89
+ │ ├── ml_service.py
90
+ │ ├── rag.py
91
+ │ ├── data_warehouse.py
92
+ ```
93
+
94
+ `llm_engineering/` is the main Python package implementing LLM and RAG functionality. It follows Domain-Driven Design (DDD) principles:
95
+
96
+ - `domain/`: Core business entities and structures
97
+ - `application/`: Business logic, crawlers, and RAG implementation
98
+ - `model/`: LLM training and inference
99
+ - `infrastructure/`: External service integrations (AWS, Qdrant, MongoDB, FastAPI)
100
+
101
+ The code logic and imports flow as follows: `infrastructure` → `model` → `application` → `domain`
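+
+ To make the layering concrete, here is a purely hypothetical snippet (the names below are illustrative, not taken from the repository) showing the dependency direction: the application layer only knows about domain entities, while infrastructure plugs in concrete services and is never imported by the inner layers.
+
+ ```python
+ # Hypothetical illustration of the dependency direction, not actual project code.
+ from dataclasses import dataclass
+
+
+ # domain/: core business entity, no external dependencies.
+ @dataclass
+ class Article:
+     author_id: str
+     link: str
+
+
+ # application/: business logic that depends only on domain entities.
+ def filter_articles_by_author(articles: list[Article], author_id: str) -> list[Article]:
+     return [article for article in articles if article.author_id == author_id]
+
+
+ # infrastructure/: would wire in MongoDB, Qdrant, AWS, etc. and call into the
+ # application layer, never the other way around.
+ ```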
102
+
103
+ `pipelines/`: Contains the ZenML ML pipelines, which serve as the entry point for all ML workflows. They coordinate the data processing and model training stages of the ML lifecycle.
104
+
105
+ `steps/`: Contains individual ZenML steps, which are reusable components for building and customizing ZenML pipelines. Steps perform specific tasks (e.g., data loading, preprocessing) and can be combined within the ML pipelines.
106
+
107
+ `tests/`: Covers a few sample tests used as examples within the CI pipeline.
108
+
109
+ `tools/`: Utility scripts used to call the ZenML pipelines and inference code:
110
+ - `run.py`: Entry point script to run ZenML pipelines.
111
+ - `ml_service.py`: Starts the REST API inference server.
112
+ - `rag.py`: Demonstrates usage of the RAG retrieval module.
113
+ - `data_warehouse.py`: Used to export or import data from the MongoDB data warehouse through JSON files.
114
+
115
+ `configs/`: ZenML YAML configuration files to control the execution of pipelines and steps.
116
+
117
+ `code_snippets/`: Standalone code examples that can be executed independently of the main project.
118
+
119
+ ## 💻 Installation
120
+
121
+ > [!NOTE]
122
+ > If you are experiencing issues while installing and running the repository, consider checking the [Issues](https://github.com/PacktPublishing/LLM-Engineers-Handbook/issues) GitHub section for other people who solved similar problems or directly asking us for help.
123
+
124
+ ### 1. Clone the Repository
125
+
126
+ Start by cloning the repository and navigating to the project directory:
127
+
128
+ ```bash
129
+ git clone https://github.com/PacktPublishing/LLM-Engineers-Handbook.git
130
+ cd LLM-Engineers-Handbook
131
+ ```
132
+
133
+ Next, prepare your Python environment and its associated dependencies.
134
+
135
+ ### 2. Set Up Python Environment
136
+
137
+ The project requires Python 3.11. You can either use your global Python installation or set up a project-specific version using pyenv.
138
+
139
+ #### Option A: Using Global Python (if version 3.11 is installed)
140
+
141
+ Verify your Python version:
142
+
143
+ ```bash
144
+ python --version # Should show Python 3.11.x
145
+ ```
146
+
147
+ #### Option B: Using pyenv (recommended)
148
+
149
+ 1. Verify pyenv installation:
150
+
151
+ ```bash
152
+ pyenv --version # Should show pyenv 2.3.36 or later
153
+ ```
154
+
155
+ 2. Install Python 3.11.8:
156
+
157
+ ```bash
158
+ pyenv install 3.11.8
159
+ ```
160
+
161
+ 3. Verify the installation:
162
+
163
+ ```bash
164
+ python --version # Should show Python 3.11.8
165
+ ```
166
+
167
+ 4. Confirm Python version in the project directory:
168
+
169
+ ```bash
170
+ python --version
171
+ # Output: Python 3.11.8
172
+ ```
173
+
174
+ > [!NOTE]
175
+ > The project includes a `.python-version` file that automatically sets the correct Python version when you're in the project directory.
176
+
177
+ ### 3. Install Dependencies
178
+
179
+ The project uses Poetry for dependency management.
180
+
181
+ 1. Verify Poetry installation:
182
+
183
+ ```bash
184
+ poetry --version # Should show Poetry version 1.8.3 or later
185
+ ```
186
+
187
+ 2. Set up the project environment and install dependencies:
188
+
189
+ ```bash
190
+ poetry env use 3.11
191
+ poetry install --without aws
192
+ poetry run pre-commit install
193
+ ```
194
+
195
+ This will:
196
+
197
+ - Configure Poetry to use Python 3.11
198
+ - Install project dependencies (excluding AWS-specific packages)
199
+ - Set up pre-commit hooks for code verification
200
+
201
+ ### 4. Activate the Environment
202
+
203
+ As our task manager, we run all the scripts using [Poe the Poet](https://poethepoet.natn.io/index.html).
204
+
205
+ 1. Start a Poetry shell:
206
+
207
+ ```bash
208
+ poetry shell
209
+ ```
210
+
211
+ 2. Run project commands using Poe the Poet:
212
+
213
+ ```bash
214
+ poetry poe ...
215
+ ```
216
+
217
+ <details>
218
+ <summary>🔧 Troubleshooting Poe the Poet Installation</summary>
219
+
220
+ ### Alternative Command Execution
221
+
222
+ If you're experiencing issues with `poethepoet`, you can still run the project commands directly through Poetry. Here's how:
223
+
224
+ 1. Look up the command definition in `pyproject.toml`
225
+ 2. Use `poetry run` with the underlying command
226
+
227
+ #### Example:
228
+ Instead of:
229
+ ```bash
230
+ poetry poe local-infrastructure-up
231
+ ```
232
+ Use the direct command from pyproject.toml:
233
+ ```bash
234
+ poetry run <actual-command-from-pyproject-toml>
235
+ ```
236
+ Note: All project commands are defined in the [tool.poe.tasks] section of pyproject.toml
237
+ </details>
238
+
239
+ Now, let's configure our local project with all the necessary credentials and tokens to run the code locally.
240
+
241
+ ### 5. Local Development Setup
242
+
243
+ After you have installed all the dependencies, you must create and fill a `.env` file with your credentials to appropriately interact with other services and run the project. Setting your sensitive credentials in a `.env` file is a good security practice, as this file won't be committed to GitHub or shared with anyone else.
244
+
245
+ 1. First, copy our example by running the following:
246
+
247
+ ```bash
248
+ cp .env.example .env # The file must be at your repository's root!
249
+ ```
250
+
251
+ 2. Now, let's understand how to fill in all the essential variables within the `.env` file to get you started. The following are the mandatory settings we must complete when working locally:
252
+
253
+ #### OpenAI
254
+
255
+ To authenticate to OpenAI's API, you must fill out the `OPENAI_API_KEY` env var with an authentication token.
256
+
257
+ ```env
258
+ OPENAI_API_KEY=your_api_key_here
259
+ ```
260
+
261
+ → Check out this [tutorial](https://platform.openai.com/docs/quickstart) to learn how to obtain one from OpenAI.
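+
+ If you want to confirm the key works before running any pipeline, a minimal, optional sanity check could look like the sketch below (it assumes the `openai` Python package is available and that `OPENAI_API_KEY` is exported in your shell or loaded from `.env`):
+
+ ```python
+ # Illustrative sanity check; the client reads OPENAI_API_KEY from the environment.
+ from openai import OpenAI
+
+ client = OpenAI()
+ models = client.models.list()
+ print([m.id for m in models.data][:5])  # A valid key returns a list of model IDs.
+ ```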
262
+
263
+ #### Hugging Face
264
+
265
+ To authenticate to Hugging Face, you must fill out the `HUGGINGFACE_ACCESS_TOKEN` env var with an authentication token.
266
+
267
+ ```env
268
+ HUGGINGFACE_ACCESS_TOKEN=your_token_here
269
+ ```
270
+
271
+ → Check out this [tutorial](https://huggingface.co/docs/hub/en/security-tokens) to learn how to obtain one from Hugging Face.
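+
+ As with OpenAI, you can optionally verify the token from Python. A minimal sketch using `huggingface_hub` (assuming the package is available in your environment) would be:
+
+ ```python
+ # Illustrative token check with huggingface_hub.
+ from huggingface_hub import whoami
+
+ info = whoami(token="your_token_here")  # Or log in once with `huggingface-cli login`.
+ print(info["name"])  # Prints the account the token belongs to.
+ ```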
272
+
273
+ #### Comet ML & Opik
274
+
275
+ To authenticate to Comet ML (required only during training) and Opik, you must fill out the `COMET_API_KEY` env var with your authentication token.
276
+
277
+ ```env
278
+ COMET_API_KEY=your_api_key_here
279
+ ```
280
+
281
+ → Check out this [tutorial](https://www.comet.com/docs/opik/?utm_source=llm_handbook&utm_medium=github&utm_campaign=opik) to learn how to get started with Opik. You can also access Opik's dashboard using 🔗[this link](https://www.comet.com/opik?utm_source=llm_handbook&utm_medium=github&utm_content=opik).
282
+
283
+ ### 6. Deployment Setup
284
+
285
+ When deploying the project to the cloud, we must set additional settings for Mongo, Qdrant, and AWS. If you are just working locally, the default values of these env vars will work out of the box. Detailed deployment instructions are available in Chapter 11 of the [LLM Engineer's Handbook](https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/).
286
+
287
+ #### MongoDB
288
+
289
+ We must change the `DATABASE_HOST` env var with the URL pointing to your cloud MongoDB cluster.
290
+
291
+ ```env
292
+ DATABASE_HOST=your_mongodb_url
293
+ ```
294
+
295
+ → Check out this [tutorial](https://www.mongodb.com/resources/products/fundamentals/mongodb-cluster-setup) to learn how to create and host a MongoDB cluster for free.
296
+
297
+ #### Qdrant
298
+
299
+ Set `USE_QDRANT_CLOUD` to `true`, `QDRANT_CLOUD_URL` to the URL pointing to your cloud Qdrant cluster, and `QDRANT_APIKEY` to its API key.
300
+
301
+ ```env
302
+ USE_QDRANT_CLOUD=true
303
+ QDRANT_CLOUD_URL=your_qdrant_cloud_url
304
+ QDRANT_APIKEY=your_qdrant_api_key
305
+ ```
306
+
307
+ → Check out this [tutorial](https://qdrant.tech/documentation/cloud/create-cluster/) to learn how to create a Qdrant cluster for free.
308
+
309
+ #### AWS
310
+
311
+ For your AWS setup to work correctly, you need the AWS CLI installed on your local machine and configured with an admin user (or a user with enough permissions to create new SageMaker, ECR, and S3 resources; using an admin user makes everything more straightforward).
312
+
313
+ Chapter 2 provides step-by-step instructions on how to install the AWS CLI, create an admin user on AWS, and get an access key to set up the `AWS_ACCESS_KEY` and `AWS_SECRET_KEY` environment variables. If you already have an AWS admin user in place, you have to configure the following env vars in your `.env` file:
314
+
315
+ ```bash
316
+ AWS_REGION=eu-central-1 # Change it with your AWS region.
317
+ AWS_ACCESS_KEY=your_aws_access_key
318
+ AWS_SECRET_KEY=your_aws_secret_key
319
+ ```
320
+
321
+ AWS credentials are typically stored in `~/.aws/credentials`. You can view this file directly using `cat` or similar commands:
322
+
323
+ ```bash
324
+ cat ~/.aws/credentials
325
+ ```
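+
+ If you want to double-check that the credentials you configured are actually picked up, a small sketch with `boto3` (assuming it is installed, e.g. via the `aws` dependency group) is one way to do it:
+
+ ```python
+ # Illustrative check of the active AWS credentials via STS.
+ import boto3
+
+ sts = boto3.client("sts", region_name="eu-central-1")  # Use your AWS_REGION here.
+ identity = sts.get_caller_identity()
+ print(identity["Account"], identity["Arn"])  # Raises an error if the credentials are invalid.
+ ```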
326
+
327
+ > [!IMPORTANT]
328
+ > Additional configuration options are available in [settings.py](https://github.com/PacktPublishing/LLM-Engineers-Handbook/blob/main/llm_engineering/settings.py). Any variable in the `Settings` class can be configured through the `.env` file.
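+
+ As a rough sketch of how such a `Settings` class typically picks up values from `.env` (this mirrors the general pydantic-settings pattern; it is not a copy of the project's `settings.py`):
+
+ ```python
+ # Illustrative only; see llm_engineering/settings.py for the real configuration class.
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ class Settings(BaseSettings):
+     model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
+
+     OPENAI_MODEL_ID: str = "gpt-4o-mini"
+     OPENAI_API_KEY: str | None = None
+     AWS_REGION: str = "eu-central-1"
+
+
+ settings = Settings()
+ print(settings.OPENAI_MODEL_ID)  # Values from .env override the defaults above.
+ ```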
329
+
330
+ ## 🏗️ Infrastructure
331
+
332
+ ### Local infrastructure (for testing and development)
333
+
334
+ When running the project locally, we host a MongoDB and Qdrant database using Docker. Also, a testing ZenML server is made available through their Python package.
335
+
336
+ > [!WARNING]
337
+ > You need Docker installed (>= v27.1.1)
338
+
339
+ For ease of use, you can start the whole local development infrastructure with the following command:
340
+ ```bash
341
+ poetry poe local-infrastructure-up
342
+ ```
343
+
344
+ Also, you can stop the ZenML server and all the Docker containers using the following command:
345
+ ```bash
346
+ poetry poe local-infrastructure-down
347
+ ```
348
+
349
+ > [!WARNING]
350
+ > When running on macOS, before starting the server, export the following environment variable:
351
+ > `export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`
352
+ > Otherwise, the connection between the local server and pipeline will break. 🔗 More details in [this issue](https://github.com/zenml-io/zenml/issues/2369).
353
+ > This is done by default when using Poe the Poet.
354
+
355
+ Start the inference real-time RESTful API:
356
+ ```bash
357
+ poetry poe run-inference-ml-service
358
+ ```
359
+
360
+ > [!IMPORTANT]
361
+ > The LLM microservice, called by the RESTful API, will work only after deploying the LLM to AWS SageMaker.
362
+
363
+ #### ZenML
364
+
365
+ Dashboard URL: `localhost:8237`
366
+
367
+ Default credentials:
368
+ - `username`: default
369
+ - `password`:
370
+
371
+ → Find out more about using and setting up [ZenML](https://docs.zenml.io/).
372
+
373
+ #### Qdrant
374
+
375
+ REST API URL: `localhost:6333`
376
+
377
+ Dashboard URL: `localhost:6333/dashboard`
378
+
379
+ → Find out more about using and setting up [Qdrant with Docker](https://qdrant.tech/documentation/quick-start/).
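+
+ To quickly verify that the local Qdrant container is reachable, a minimal sketch using the `qdrant-client` package (assuming it is available from the project's dependencies) might look like this:
+
+ ```python
+ # Minimal connectivity check against the local Qdrant container (illustrative).
+ from qdrant_client import QdrantClient
+
+ client = QdrantClient(url="http://localhost:6333")
+ print(client.get_collections())  # Lists the collections created by the feature engineering pipeline.
+ ```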
380
+
381
+ #### MongoDB
382
+
383
+ Database URI: `mongodb://llm_engineering:llm_engineering@127.0.0.1:27017`
384
+
385
+ Database name: `twin`
386
+
387
+ Default credentials:
388
+ - `username`: llm_engineering
389
+ - `password`: llm_engineering
390
+
391
+ → Find out more about using and setting up [MongoDB with Docker](https://www.mongodb.com/docs/manual/tutorial/install-mongodb-community-with-docker).
392
+
393
+ You can search your MongoDB collections using your **IDE's MongoDB plugin** (which you have to install separately), using the database URI to connect to the MongoDB database hosted within the Docker container: `mongodb://llm_engineering:llm_engineering@127.0.0.1:27017`
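+
+ Alternatively, you can poke at the database from Python. A minimal sketch using `pymongo` (assuming the package is available in your environment) would be:
+
+ ```python
+ # Minimal check of the local MongoDB container (illustrative).
+ from pymongo import MongoClient
+
+ client = MongoClient("mongodb://llm_engineering:llm_engineering@127.0.0.1:27017")
+ db = client["twin"]
+ print(db.list_collection_names())  # Shows the collections written by the data collection ETL.
+ ```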
394
+
395
+ > [!IMPORTANT]
396
+ > Everything related to training or running the LLMs (e.g., training, evaluation, inference) can only be run if you set up AWS SageMaker, as explained in the next section on cloud infrastructure.
397
+
398
+ ### Cloud infrastructure (for production)
399
+
400
+ Here we will quickly present how to deploy the project to AWS and other serverless services. We won't go into the details (as everything is presented in the book) but only point out the main steps you have to go through.
401
+
402
+ First, reinstall your Python dependencies with the AWS group:
403
+ ```bash
404
+ poetry install --with aws
405
+ ```
406
+
407
+ #### AWS SageMaker
408
+
409
+ > [!NOTE]
410
+ > Chapter 10 provides step-by-step instructions in the section "Implementing the LLM microservice using AWS SageMaker".
411
+
412
+ By this point, we expect you to have the AWS CLI installed, and both it and the project's env vars (within the `.env` file) properly configured with an AWS admin user.
413
+
414
+ To ensure best practices, we must create a new AWS user restricted to creating and deleting only resources related to AWS SageMaker. Create it by running:
415
+ ```bash
416
+ poetry poe create-sagemaker-role
417
+ ```
418
+ It will create a `sagemaker_user_credentials.json` file at the root of your repository with your new `AWS_ACCESS_KEY` and `AWS_SECRET_KEY` values. **But before swapping in these new credentials, also run the following command to create the execution role (so that it is created using your admin credentials).**
419
+
420
+ To create the IAM execution role used by AWS SageMaker to access other AWS resources on our behalf, run the following:
421
+ ```bash
422
+ poetry poe create-sagemaker-execution-role
423
+ ```
424
+ It will create a `sagemaker_execution_role.json` file at the root of your repository with your new `AWS_ARN_ROLE` value. Add it to your `.env` file.
425
+
426
+ Once you've updated the `AWS_ACCESS_KEY`, `AWS_SECRET_KEY`, and `AWS_ARN_ROLE` values in your `.env` file, you can use AWS SageMaker. **Note that this step is crucial to complete the AWS setup.**
427
+
428
+ #### Training
429
+
430
+ We start the training pipeline through ZenML by running the following:
431
+ ```bash
432
+ poetry poe run-training-pipeline
433
+ ```
434
+ This will start the training code using the configs from `configs/training.yaml` directly in SageMaker. You can visualize the results in Comet ML's dashboard.
435
+
436
+ We start the evaluation pipeline through ZenML by running the following:
437
+ ```bash
438
+ poetry poe run-evaluation-pipeline
439
+ ```
440
+ This will start the evaluation code using the configs from `configs/evaluating.yaml` directly in SageMaker. You can visualize the results in `*-results` datasets saved to your Hugging Face profile.
441
+
442
+ #### Inference
443
+
444
+ To create an AWS SageMaker Inference Endpoint, run:
445
+ ```bash
446
+ poetry poe deploy-inference-endpoint
447
+ ```
448
+ To test it out, run:
449
+ ```bash
450
+ poetry poe test-sagemaker-endpoint
451
+ ```
452
+ To delete it, run:
453
+ ```bash
454
+ poetry poe delete-inference-endpoint
455
+ ```
456
+
457
+ #### AWS: ML pipelines, artifacts, and containers
458
+
459
+ The ML pipelines, artifacts, and containers are deployed to AWS by leveraging ZenML's deployment features. Thus, you must create an account with ZenML Cloud and follow their guide on deploying a ZenML stack to AWS. Otherwise, we provide step-by-step instructions in **Chapter 11**, section **Deploying the LLM Twin's pipelines to the cloud** on what you must do.
460
+
461
+ #### Qdrant & MongoDB
462
+
463
+ We leverage Qdrant's and MongoDB's serverless options when deploying the project. Thus, you can either follow [Qdrant's](https://qdrant.tech/documentation/cloud/create-cluster/) and [MongoDB's](https://www.mongodb.com/resources/products/fundamentals/mongodb-cluster-setup) tutorials on how to create a freemium cluster for each or go through **Chapter 11**, section **Deploying the LLM Twin's pipelines to the cloud** and follow our step-by-step instructions.
464
+
465
+ #### GitHub Actions
466
+
467
+ We use GitHub Actions to implement our CI/CD pipelines. To implement your own, you have to fork our repository and set the following env vars as Actions secrets in your forked repository:
468
+ - `AWS_ACCESS_KEY_ID`
469
+ - `AWS_SECRET_ACCESS_KEY`
470
+ - `AWS_ECR_NAME`
471
+ - `AWS_REGION`
472
+
473
+ Also, we provide instructions on how to set everything up in **Chapter 11**, section **Adding LLMOps to the LLM Twin**.
474
+
475
+ #### Comet ML & Opik
476
+
477
+ You can visualize the results on their self-hosted dashboards if you create a Comet account and correctly set the `COMET_API_KEY` env var. As Opik is powered by Comet, you don't have to set up anything else besides Comet:
478
+ - [Comet ML (for experiment tracking)](https://www.comet.com/?utm_source=llm_handbook&utm_medium=github&utm_campaign=opik)
479
+ - [Opik (for prompt monitoring)](https://www.comet.com/opik?utm_source=llm_handbook&utm_medium=github&utm_campaign=opik)
480
+
481
+ ### 💰 Running the Project Costs
482
+
483
+ We will mostly stick to free tiers for all the services except for AWS and OpenAI's API, which are both pay-as-you-go services. The cost of running the project once, with our default values, will be roughly ~$25 (most of it comes from using AWS SageMaker for training and inference).
484
+
485
+ ## ⚡ Pipelines
486
+
487
+ All the ML pipelines will be orchestrated behind the scenes by [ZenML](https://www.zenml.io/). A few exceptions exist when running utility scripts, such as exporting or importing from the data warehouse.
488
+
489
+ The ZenML pipelines are the entry point for most processes throughout this project. They are under the `pipelines/` folder. Thus, when you want to understand or debug a workflow, starting with the ZenML pipeline is the best approach.
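+
+ If you have never used ZenML before, the pattern behind every pipeline in `pipelines/` is roughly the following (a generic sketch, not one of the project's actual pipelines):
+
+ ```python
+ # Generic ZenML step/pipeline pattern (illustrative); the real pipelines live in pipelines/.
+ from zenml import pipeline, step
+
+
+ @step
+ def load_documents() -> list[str]:
+     return ["raw document 1", "raw document 2"]
+
+
+ @step
+ def clean_documents(documents: list[str]) -> list[str]:
+     return [doc.strip().lower() for doc in documents]
+
+
+ @pipeline
+ def toy_feature_engineering():
+     documents = load_documents()
+     clean_documents(documents)
+
+
+ if __name__ == "__main__":
+     toy_feature_engineering()  # Each run shows up in the ZenML dashboard as a DAG.
+ ```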
490
+
491
+ To see the pipelines running and their results:
492
+ - go to your ZenML dashboard
493
+ - go to the `Pipelines` section
494
+ - click on a specific pipeline (e.g., `feature_engineering`)
495
+ - click on a specific run (e.g., `feature_engineering_run_2024_06_20_18_40_24`)
496
+ - click on a specific step or artifact of the DAG to find more details about it
497
+
498
+ Now, let's explore all the pipelines you can run. From data collection to training, we will present them in their natural order to go through the LLM project end-to-end.
499
+
500
+ ### Data pipelines
501
+
502
+ Run the data collection ETL:
503
+ ```bash
504
+ poetry poe run-digital-data-etl
505
+ ```
506
+
507
+ > [!WARNING]
508
+ > You must have Chrome (or another Chromium-based browser) installed on your system for LinkedIn and Medium crawlers to work (which use Selenium under the hood). Based on your Chrome version, the Chromedriver will be automatically installed to enable Selenium support. Another option is to run everything using our Docker image if you don't want to install Chrome. For example, to run all the pipelines combined you can run `poetry poe run-docker-end-to-end-data-pipeline`. Note that the command can be tweaked to support any other pipeline.
509
+ >
510
+ > If, for any other reason, you don't have a Chromium-based browser installed and don't want to use Docker, you have two other options to bypass this Selenium issue:
511
+ > - Comment out all the code related to Selenium, Chrome and all the links that use Selenium to crawl them (e.g., Medium), such as the `chromedriver_autoinstaller.install()` command from [application.crawlers.base](https://github.com/PacktPublishing/LLM-Engineers-Handbook/blob/main/llm_engineering/application/crawlers/base.py) and other static calls that check for Chrome drivers and Selenium.
512
+ > - Install Google Chrome using your CLI in environments such as GitHub Codespaces or other cloud VMs using the same command as in our [Docker file](https://github.com/PacktPublishing/LLM-Engineers-Handbook/blob/main/Dockerfile#L10).
513
+
514
+ To add additional links to collect from, go to `configs/digital_data_etl_[author_name].yaml` and add them to the `links` field. Also, you can create a completely new file and specify it at run time, like this: `python -m llm_engineering.interfaces.orchestrator.run --run-etl --etl-config-filename configs/digital_data_etl_[your_name].yaml`
515
+
516
+ Run the feature engineering pipeline:
517
+ ```bash
518
+ poetry poe run-feature-engineering-pipeline
519
+ ```
520
+
521
+ Generate the instruct dataset:
522
+ ```bash
523
+ poetry poe run-generate-instruct-datasets-pipeline
524
+ ```
525
+
526
+ Generate the preference dataset:
527
+ ```bash
528
+ poetry poe run-generate-preference-datasets-pipeline
529
+ ```
530
+
531
+ Run all of the above compressed into a single pipeline:
532
+ ```bash
533
+ poetry poe run-end-to-end-data-pipeline
534
+ ```
535
+
536
+ ### Utility pipelines
537
+
538
+ Export the data from the data warehouse to JSON files:
539
+ ```bash
540
+ poetry poe run-export-data-warehouse-to-json
541
+ ```
542
+
543
+ Import data to the data warehouse from JSON files (by default, it imports the data from the `data/data_warehouse_raw_data` directory):
544
+ ```bash
545
+ poetry poe run-import-data-warehouse-from-json
546
+ ```
547
+
548
+ Export ZenML artifacts to JSON:
549
+ ```bash
550
+ poetry poe run-export-artifact-to-json-pipeline
551
+ ```
552
+
553
+ This will export the following ZenML artifacts to the `output` folder as JSON files (it will take their latest version):
554
+ - cleaned_documents.json
555
+ - instruct_datasets.json
556
+ - preference_datasets.json
557
+ - raw_documents.json
558
+
559
+ You can configure what artifacts to export by tweaking the `configs/export_artifact_to_json.yaml` configuration file.
560
+
561
+ ### Training pipelines
562
+
563
+ Run the training pipeline:
564
+ ```bash
565
+ poetry poe run-training-pipeline
566
+ ```
567
+
568
+ Run the evaluation pipeline:
569
+ ```bash
570
+ poetry poe run-evaluation-pipeline
571
+ ```
572
+
573
+ > [!WARNING]
574
+ > For this to work, make sure you properly configured AWS SageMaker as described in [Set up cloud infrastructure (for production)](#set-up-cloud-infrastructure-for-production).
575
+
576
+ ### Inference pipelines
577
+
578
+ Call the RAG retrieval module with a test query:
579
+ ```bash
580
+ poetry poe call-rag-retrieval-module
581
+ ```
582
+
583
+ Start the inference real-time RESTful API:
584
+ ```bash
585
+ poetry poe run-inference-ml-service
586
+ ```
587
+
588
+ Call the inference real-time RESTful API with a test query:
589
+ ```bash
590
+ poetry poe call-inference-ml-service
591
+ ```
592
+
593
+ Remember that you can monitor the prompt traces on [Opik](https://www.comet.com/opik).
594
+
595
+ > [!WARNING]
596
+ > For the inference service to work, you must have the LLM microservice deployed to AWS SageMaker, as explained in the setup cloud infrastructure section.
597
+
598
+ ### Linting & formatting (QA)
599
+
600
+ Check or fix your linting issues:
601
+ ```bash
602
+ poetry poe lint-check
603
+ poetry poe lint-fix
604
+ ```
605
+
606
+ Check or fix your formatting issues:
607
+ ```bash
608
+ poetry poe format-check
609
+ poetry poe format-fix
610
+ ```
611
+
612
+ Check the code for leaked credentials:
613
+ ```bash
614
+ poetry poe gitleaks-check
615
+ ```
616
+
617
+ ### Tests
618
+
619
+ Run all the tests using the following command:
620
+ ```bash
621
+ poetry poe test
622
+ ```
623
+
624
+ ## 🏃 Run project
625
+
626
+ Based on the setup and usage steps described above, assuming the local and cloud infrastructure works and the `.env` is filled as expected, follow the next steps to run the LLM system end-to-end:
627
+
628
+ ### Data
629
+
630
+ 1. Collect data: `poetry poe run-digital-data-etl`
631
+
632
+ 2. Compute features: `poetry poe run-feature-engineering-pipeline`
633
+
634
+ 3. Compute instruct dataset: `poetry poe run-generate-instruct-datasets-pipeline`
635
+
636
+ 4. Compute preference alignment dataset: `poetry poe run-generate-preference-datasets-pipeline`
637
+
638
+ ### Training
639
+
640
+ > [!IMPORTANT]
641
+ > From now on, for these steps to work, you need to properly set up AWS SageMaker, such as running `poetry install --with aws` and filling in the AWS-related environment variables and configs.
642
+
643
+ 5. SFT fine-tune Llama 3.1: `poetry poe run-training-pipeline`
644
+
645
+ 6. For DPO, go to `configs/training.yaml`, change `finetuning_type` to `dpo`, and run `poetry poe run-training-pipeline` again
646
+
647
+ 7. Evaluate fine-tuned models: `poetry poe run-evaluation-pipeline`
648
+
649
+ ### Inference
650
+
651
+ > [!IMPORTANT]
652
+ > From now on, for these steps to work, you need to properly set up AWS SageMaker, such as running `poetry install --with aws` and filling in the AWS-related environment variables and configs.
653
+
654
+ 8. Call only the RAG retrieval module: `poetry poe call-rag-retrieval-module`
655
+
656
+ 9. Deploy the LLM Twin microservice to SageMaker: `poetry poe deploy-inference-endpoint`
657
+
658
+ 10. Test the LLM Twin microservice: `poetry poe test-sagemaker-endpoint`
659
+
660
+ 11. Start end-to-end RAG server: `poetry poe run-inference-ml-service`
661
+
662
+ 12. Test RAG server: `poetry poe call-inference-ml-service`
663
+
664
+ ## 📄 License
665
+
666
+ This course is an open-source project released under the MIT license. Thus, as long as you distribute our LICENSE and acknowledge our work, you can safely clone or fork this project and use it as a source of inspiration for whatever you want (e.g., university projects, college degree projects, personal projects, etc.).
clips/clip_eFgkZKhNUdM_1270_0.847.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e29cb777c6b6984f5599a455d760923147593388b1a9076e6f9afe0bbf679d3c
3
+ size 7315834
clips/clip_eFgkZKhNUdM_642_0.847.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5653d4b661233328997880cd40df4aba4e74d3aca5198685f3b4aaaac0e03a3
3
+ size 1628484
clips/clip_eFgkZKhNUdM_874_0.838.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be8086177badfba4783ccca4040c990ea0a16ebc8d33185dce7a354e43720317
3
+ size 3585253
code_snippets/03_custom_odm_example.py ADDED
@@ -0,0 +1,10 @@
1
+ from llm_engineering.domain.documents import ArticleDocument, UserDocument
2
+
3
+ if __name__ == "__main__":
4
+ user = UserDocument.get_or_create(first_name="Paul", last_name="Iusztin")
5
+ articles = ArticleDocument.bulk_find(author_id=str(user.id))
6
+
7
+ print(f"User ID: {user.id}") # noqa
8
+ print(f"User name: {user.first_name} {user.last_name}") # noqa
9
+ print(f"Number of articles: {len(articles)}") # noqa
10
+ print("First article link:", articles[0].link) # noqa
code_snippets/03_orm.py ADDED
@@ -0,0 +1,37 @@
1
+ from sqlalchemy import Column, Integer, String, create_engine
2
+ from sqlalchemy.orm import declarative_base, sessionmaker
3
+
4
+ # Create virtual environment, install dependencies and run the code:
5
+ # 1. Create: python3 -m venv orm_venv
6
+ # 2. Activate: source orm_venv/bin/activate
7
+ # 3. Install: pip install sqlalchemy==2.0.35
8
+ # 4. Run the code: python code_snippets/03_orm.py
9
+
10
+ if __name__ == "__main__":
11
+ Base = declarative_base()
12
+
13
+ # Define a class that maps to the users table.
14
+ class User(Base):
15
+ __tablename__ = "users"
16
+
17
+ id = Column(Integer, primary_key=True)
18
+ name = Column(String)
19
+
20
+ # Create an SQLite database in memory.
21
+ engine = create_engine("sqlite:///:memory:")
22
+ Base.metadata.create_all(engine)
23
+
24
+ # Create a session used to interact with the database.
25
+ Session = sessionmaker(bind=engine)
26
+ session = Session()
27
+
28
+ # Add a new user.
29
+ new_user = User(name="Alice")
30
+ session.add(new_user)
31
+ session.commit()
32
+
33
+ # Query the database.
34
+ user = session.query(User).first()
35
+ if user:
36
+ print(f"User ID: {user.id}") # noqa
37
+ print(f"User name: {user.name}") # noqa
code_snippets/08_instructor_embeddings.py ADDED
@@ -0,0 +1,18 @@
1
+ from sentence_transformers import SentenceTransformer
2
+
3
+ # Create virtual environment, install dependencies and run the code:
4
+ # 1. Create: python3 -m venv instructor_venv
5
+ # 2. Activate: source instructor_venv/bin/activate
6
+ # 3. Install: pip install sentence-transformers==3.3.0
7
+ # 4. Run the code: python code_snippets/08_instructor_embeddings.py
8
+
9
+ if __name__ == "__main__":
10
+ model = SentenceTransformer("hkunlp/instructor-base")
11
+
12
+ sentence = "RAG Fundamentals First"
13
+
14
+ instruction = "Represent the title of an article about AI:"
15
+
16
+ embeddings = model.encode([[instruction, sentence]])
17
+ print(embeddings.shape) # noqa
18
+ # Output: (1, 768)
code_snippets/08_text_embeddings.py ADDED
@@ -0,0 +1,28 @@
1
+ from sentence_transformers import SentenceTransformer
2
+
3
+ # Leverage the Poetry virtual environment to run the code:
4
+ # poetry run python code_snippets/08_text_embeddings.py
5
+
6
+ if __name__ == "__main__":
7
+ # 1. Load a pretrained Sentence Transformer model.
8
+ model = SentenceTransformer("all-MiniLM-L6-v2")
9
+
10
+ # The sentences to encode.
11
+ sentences = ["The dog sits outside waiting for a treat.", "I am going swimming.", "The dog is swimming."]
12
+
13
+ # 2. Calculate embeddings.
14
+ embeddings = model.encode(sentences)
15
+ print(embeddings.shape) # noqa
16
+ # Output: [3, 384]
17
+
18
+ # 3. Calculate the embedding similarities using cosine similarity.
19
+ similarities = model.similarity(embeddings, embeddings)
20
+ print(similarities) # noqa
21
+ # Output:
22
+ # tensor([[ 1.0000, -0.0389, 0.2692],
23
+ # [-0.0389, 1.0000, 0.3837],
24
+ # [ 0.2692, 0.3837, 1.0000]])
25
+ #
26
+ # similarities[0, 0] = The similarity between the first sentence and itself.
27
+ # similarities[0, 1] = The similarity between the first and second sentence.
28
+ # similarities[2, 1] = The similarity between the third and second sentence.
code_snippets/08_text_image_embeddings.py ADDED
@@ -0,0 +1,37 @@
1
+ from io import BytesIO
2
+
3
+ import requests
4
+ from PIL import Image
5
+ from sentence_transformers import SentenceTransformer
6
+
7
+ # Leverage the Poetry virtual environment to run the code:
8
+ # poetry run python code_snippets/08_text_image_embeddings.py
9
+
10
+ if __name__ == "__main__":
11
+ # Load an image with a crazy cat.
12
+ response = requests.get(
13
+ "https://github.com/PacktPublishing/LLM-Engineering/blob/main/images/crazy_cat.jpg?raw=true"
14
+ )
15
+ image = Image.open(BytesIO(response.content))
16
+
17
+ # Load CLIP model.
18
+ model = SentenceTransformer("clip-ViT-B-32")
19
+
20
+ # Encode the loaded image.
21
+ img_emb = model.encode(image)
22
+
23
+ # Encode text descriptions.
24
+ text_emb = model.encode(
25
+ [
26
+ "A crazy cat smiling.",
27
+ "A white and brown cat with a yellow bandana.",
28
+ "A man eating in the garden.",
29
+ ]
30
+ )
31
+ print(text_emb.shape) # noqa
32
+ # Output: (3, 512)
33
+
34
+ # Compute similarities.
35
+ similarity_scores = model.similarity(img_emb, text_emb)
36
+ print(similarity_scores) # noqa
37
+ # Output: tensor([[0.3068, 0.3300, 0.1719]])
configs/digital_data_etl_maxime_labonne.yaml ADDED
@@ -0,0 +1,38 @@
1
+ settings:
2
+ docker:
3
+ parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
4
+ skip_build: True
5
+ orchestrator.sagemaker:
6
+ synchronous: false
7
+
8
+ parameters:
9
+ user_full_name: Maxime Labonne # [First Name(s)] [Last Name]
10
+ links:
11
+ # Personal Blog
12
+ - https://mlabonne.github.io/blog/posts/2024-07-29_Finetune_Llama31.html
13
+ - https://mlabonne.github.io/blog/posts/2024-07-15_The_Rise_of_Agentic_Data_Generation.html
14
+ # Substack
15
+ - https://maximelabonne.substack.com/p/uncensor-any-llm-with-abliteration-d30148b7d43e
16
+ - https://maximelabonne.substack.com/p/create-mixtures-of-experts-with-mergekit-11b318c99562
17
+ - https://maximelabonne.substack.com/p/merge-large-language-models-with-mergekit-2118fb392b54
18
+ - https://maximelabonne.substack.com/p/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac
19
+ - https://maximelabonne.substack.com/p/exllamav2-the-fastest-library-to-run-llms-32aeda294d26
20
+ - https://maximelabonne.substack.com/p/quantize-llama-models-with-ggml-and-llama-cpp-3612dfbcc172
21
+ - https://maximelabonne.substack.com/p/a-beginners-guide-to-llm-fine-tuning-4bae7d4da672
22
+ - https://maximelabonne.substack.com/p/graph-convolutional-networks-introduction-to-gnns-24b3f60d6c95
23
+ - https://maximelabonne.substack.com/p/4-bit-quantization-with-gptq-36b0f4f02c34
24
+ - https://maximelabonne.substack.com/p/fine-tune-your-own-llama-2-model-in-a-colab-notebook-df9823a04a32
25
+ - https://maximelabonne.substack.com/p/introduction-to-weight-quantization-2494701b9c0c
26
+ - https://maximelabonne.substack.com/p/decoding-strategies-in-large-language-models-9733a8f70539
27
+ - https://maximelabonne.substack.com/p/the-art-of-spending-optimizing-your-marketing-budget-with-nonlinear-optimization-6c8a39afb3c2
28
+ - https://maximelabonne.substack.com/p/create-a-bot-to-find-diamonds-in-minecraft-d836606a993a
29
+ - https://maximelabonne.substack.com/p/constraint-programming-67ac16fa0c81
30
+ - https://maximelabonne.substack.com/p/how-to-design-the-most-powerful-graph-neural-network-3d18b07a6e66
31
+ - https://maximelabonne.substack.com/p/introduction-to-graphsage-in-python-a9e7f9ecf9d7
32
+ - https://maximelabonne.substack.com/p/graph-attention-networks-in-python-975736ac5c0c
33
+ - https://maximelabonne.substack.com/p/integer-programming-vs-linear-programming-in-python-f1be5bb4e60e
34
+ - https://maximelabonne.substack.com/p/introduction-to-linear-programming-in-python-9261e7eb44b
35
+ - https://maximelabonne.substack.com/p/what-is-a-tensor-in-deep-learning-6dedd95d6507
36
+ - https://maximelabonne.substack.com/p/efficiently-iterating-over-rows-in-a-pandas-dataframe-7dd5f9992c01
37
+ - https://maximelabonne.substack.com/p/q-learning-for-beginners-2837b777741
38
+ - https://maximelabonne.substack.com/p/how-to-start-machine-learning-for-developers-in-2022-390af12b193f
configs/digital_data_etl_paul_iusztin.yaml ADDED
@@ -0,0 +1,62 @@
1
+ settings:
2
+ docker:
3
+ parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
4
+ skip_build: True
5
+ orchestrator.sagemaker:
6
+ synchronous: false
7
+
8
+ parameters:
9
+ user_full_name: Paul Iusztin # [First Name(s)] [Last Name]
10
+ links:
11
+ # Medium (only articles that are not under the paid wall work)
12
+ - https://medium.com/decodingml/an-end-to-end-framework-for-production-ready-llm-systems-by-building-your-llm-twin-2cc6bb01141f
13
+ - https://medium.com/decodingml/a-real-time-retrieval-system-for-rag-on-social-media-data-9cc01d50a2a0
14
+ - https://medium.com/decodingml/sota-python-streaming-pipelines-for-fine-tuning-llms-and-rag-in-real-time-82eb07795b87
15
+ - https://medium.com/decodingml/the-4-advanced-rag-algorithms-you-must-know-to-implement-5d0c7f1199d2
16
+ - https://medium.com/decodingml/architect-scalable-and-cost-effective-llm-rag-inference-pipelines-73b94ef82a99
17
+ # Substack
18
+ - https://decodingml.substack.com/p/real-time-feature-pipelines-with?r=1ttoeh
19
+ - https://decodingml.substack.com/p/building-ml-systems-the-right-way?r=1ttoeh
20
+ - https://decodingml.substack.com/p/reduce-your-pytorchs-code-latency?r=1ttoeh
21
+ - https://decodingml.substack.com/p/llm-agents-demystified?r=1ttoeh
22
+ - https://decodingml.substack.com/p/scalable-rag-ingestion-pipeline-using?r=1ttoeh
23
+ - https://decodingml.substack.com/p/the-ultimate-mlops-tool?r=1ttoeh
24
+ - https://decodingml.substack.com/p/the-new-king-of-infrastructure-as?r=1ttoeh
25
+ - https://decodingml.substack.com/p/highly-scalable-data-ingestion-architecture?r=1ttoeh
26
+ - https://decodingml.substack.com/p/2-key-llmops-concepts?r=1ttoeh
27
+ - https://decodingml.substack.com/p/the-llm-twin-free-course-on-production?r=1ttoeh
28
+ - https://decodingml.substack.com/p/a-blueprint-for-designing-production?r=1ttoeh
29
+ - https://decodingml.substack.com/p/the-difference-between-development?r=1ttoeh
30
+ - https://decodingml.substack.com/p/architect-scalable-and-cost-effective?r=1ttoeh
31
+ - https://decodingml.substack.com/p/7-tips-to-reduce-your-vram-when-training?r=1ttoeh
32
+ - https://decodingml.substack.com/p/using-this-python-package-you-can?r=1ttoeh
33
+ - https://decodingml.substack.com/p/the-4-advanced-rag-algorithms-you?r=1ttoeh
34
+ - https://decodingml.substack.com/p/problems-deploying-your-ml-models?r=1ttoeh
35
+ - https://decodingml.substack.com/p/sota-python-streaming-pipelines-for?r=1ttoeh
36
+ - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh
37
+ - https://decodingml.substack.com/p/my-ml-monthly-learning-resource-recommendations?r=1ttoeh
38
+ - https://decodingml.substack.com/p/an-end-to-end-framework-for-production?r=1ttoeh
39
+ - https://decodingml.substack.com/p/upskill-your-llm-knowledge-base-with?r=1ttoeh
40
+ - https://decodingml.substack.com/p/want-to-learn-an-end-to-end-framework?r=1ttoeh
41
+ - https://decodingml.substack.com/p/my-favorite-way-to-implement-a-configuration?r=1ttoeh
42
+ - https://decodingml.substack.com/p/a-real-time-retrieval-system-for?r=1ttoeh
43
+ - https://decodingml.substack.com/p/4-key-decoding-strategies-for-llms?r=1ttoeh
44
+ - https://decodingml.substack.com/p/dml-new-year-the-new-and-improved?r=1ttoeh
45
+ - https://decodingml.substack.com/p/dml-8-types-of-mlops-tools-that-must?r=1ttoeh
46
+ - https://decodingml.substack.com/p/dml-this-is-what-you-need-to-build?r=1ttoeh
47
+ - https://decodingml.substack.com/p/dml-7-steps-on-how-to-fine-tune-an?r=1ttoeh
48
+ - https://decodingml.substack.com/p/dml-how-do-you-generate-a-q-and-a?r=1ttoeh
49
+ - https://decodingml.substack.com/p/dml-what-do-you-need-to-fine-tune?r=1ttoeh
50
+ - https://decodingml.substack.com/p/dml-why-and-when-do-you-need-to-fine?r=1ttoeh
51
+ - https://decodingml.substack.com/p/dml-how-to-implement-a-streaming?r=1ttoeh
52
+ - https://decodingml.substack.com/p/dml-why-and-what-do-you-need-a-streaming?r=1ttoeh
53
+ - https://decodingml.substack.com/p/dml-unwrapping-the-3-pipeline-design?r=1ttoeh
54
+ - https://decodingml.substack.com/p/dml-how-to-design-an-llm-system-for?r=1ttoeh
55
+ - https://decodingml.substack.com/p/dml-synced-vector-dbs-a-guide-to?r=1ttoeh
56
+ - https://decodingml.substack.com/p/dml-what-is-the-difference-between?r=1ttoeh
57
+ - https://decodingml.substack.com/p/dml-7-steps-to-build-a-production?r=1ttoeh
58
+ - https://decodingml.substack.com/p/dml-chain-of-thought-reasoning-write?r=1ttoeh
59
+ - https://decodingml.substack.com/p/dml-build-and-serve-a-production?r=1ttoeh
60
+ - https://decodingml.substack.com/p/dml-4-key-ideas-you-must-know-to?r=1ttoeh
61
+ - https://decodingml.substack.com/p/dml-how-to-add-real-time-monitoring?r=1ttoeh
62
+ - https://decodingml.substack.com/p/dml-top-6-ml-platform-features-you?r=1ttoeh
configs/end_to_end_data.yaml ADDED
@@ -0,0 +1,87 @@
1
+ settings:
2
+ docker:
3
+ parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
4
+ skip_build: True
5
+ orchestrator.sagemaker:
6
+ synchronous: false
7
+
8
+ parameters:
9
+ # Data ETL & Feature engineering pipelines parameters
10
+ author_links:
11
+ - user_full_name: Paul Iusztin # [First Name(s)] [Last Name]
12
+ links:
13
+ # Medium (only articles that are not under the paid wall work)
14
+ - https://medium.com/decodingml/an-end-to-end-framework-for-production-ready-llm-systems-by-building-your-llm-twin-2cc6bb01141f
15
+ - https://medium.com/decodingml/a-real-time-retrieval-system-for-rag-on-social-media-data-9cc01d50a2a0
16
+ - https://medium.com/decodingml/sota-python-streaming-pipelines-for-fine-tuning-llms-and-rag-in-real-time-82eb07795b87
17
+ - https://medium.com/decodingml/the-4-advanced-rag-algorithms-you-must-know-to-implement-5d0c7f1199d2
18
+ - https://medium.com/decodingml/architect-scalable-and-cost-effective-llm-rag-inference-pipelines-73b94ef82a99
19
+ # Substack
20
+ - https://decodingml.substack.com/p/a-blueprint-for-designing-production?r=1ttoeh
21
+ - https://decodingml.substack.com/p/the-difference-between-development?r=1ttoeh
22
+ - https://decodingml.substack.com/p/architect-scalable-and-cost-effective?r=1ttoeh
23
+ - https://decodingml.substack.com/p/7-tips-to-reduce-your-vram-when-training?r=1ttoeh
24
+ - https://decodingml.substack.com/p/using-this-python-package-you-can?r=1ttoeh
25
+ - https://decodingml.substack.com/p/the-4-advanced-rag-algorithms-you?r=1ttoeh
26
+ - https://decodingml.substack.com/p/problems-deploying-your-ml-models?r=1ttoeh
27
+ - https://decodingml.substack.com/p/sota-python-streaming-pipelines-for?r=1ttoeh
28
+ - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh
29
+ - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh
30
+ - https://decodingml.substack.com/p/my-ml-monthly-learning-resource-recommendations?r=1ttoeh
31
+ - https://decodingml.substack.com/p/an-end-to-end-framework-for-production?r=1ttoeh
32
+ - https://decodingml.substack.com/p/upskill-your-llm-knowledge-base-with?r=1ttoeh
33
+ - https://decodingml.substack.com/p/want-to-learn-an-end-to-end-framework?r=1ttoeh
34
+ - https://decodingml.substack.com/p/my-favorite-way-to-implement-a-configuration?r=1ttoeh
35
+ - https://decodingml.substack.com/p/a-real-time-retrieval-system-for?r=1ttoeh
36
+ - https://decodingml.substack.com/p/4-key-decoding-strategies-for-llms?r=1ttoeh
37
+ - https://decodingml.substack.com/p/dml-new-year-the-new-and-improved?r=1ttoeh
38
+ - https://decodingml.substack.com/p/dml-8-types-of-mlops-tools-that-must?r=1ttoeh
39
+ - https://decodingml.substack.com/p/dml-this-is-what-you-need-to-build?r=1ttoeh
40
+ - https://decodingml.substack.com/p/dml-7-steps-on-how-to-fine-tune-an?r=1ttoeh
41
+ - https://decodingml.substack.com/p/dml-how-do-you-generate-a-q-and-a?r=1ttoeh
42
+ - https://decodingml.substack.com/p/dml-what-do-you-need-to-fine-tune?r=1ttoeh
43
+ - https://decodingml.substack.com/p/dml-why-and-when-do-you-need-to-fine?r=1ttoeh
44
+ - https://decodingml.substack.com/p/dml-how-to-implement-a-streaming?r=1ttoeh
45
+ - https://decodingml.substack.com/p/dml-why-and-what-do-you-need-a-streaming?r=1ttoeh
46
+ - https://decodingml.substack.com/p/dml-unwrapping-the-3-pipeline-design?r=1ttoeh
47
+ - https://decodingml.substack.com/p/dml-how-to-design-an-llm-system-for?r=1ttoeh
48
+ - https://decodingml.substack.com/p/dml-synced-vector-dbs-a-guide-to?r=1ttoeh
49
+ - https://decodingml.substack.com/p/dml-what-is-the-difference-between?r=1ttoeh
50
+ - https://decodingml.substack.com/p/dml-7-steps-to-build-a-production?r=1ttoeh
51
+ - https://decodingml.substack.com/p/dml-chain-of-thought-reasoning-write?r=1ttoeh
52
+ - https://decodingml.substack.com/p/dml-build-and-serve-a-production?r=1ttoeh
53
+ - https://decodingml.substack.com/p/dml-4-key-ideas-you-must-know-to?r=1ttoeh
54
+ - https://decodingml.substack.com/p/dml-how-to-add-real-time-monitoring?r=1ttoeh
55
+ - https://decodingml.substack.com/p/dml-top-6-ml-platform-features-you?r=1ttoeh
56
+ - user_full_name: Maxime Labonne # [First Name(s)] [Last Name]
57
+ links:
58
+ # Substack
59
+ - https://maximelabonne.substack.com/p/uncensor-any-llm-with-abliteration-d30148b7d43e
60
+ - https://maximelabonne.substack.com/p/create-mixtures-of-experts-with-mergekit-11b318c99562
61
+ - https://maximelabonne.substack.com/p/merge-large-language-models-with-mergekit-2118fb392b54
62
+ - https://maximelabonne.substack.com/p/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac
63
+ - https://maximelabonne.substack.com/p/exllamav2-the-fastest-library-to-run-llms-32aeda294d26
64
+ - https://maximelabonne.substack.com/p/quantize-llama-models-with-ggml-and-llama-cpp-3612dfbcc172
65
+ - https://maximelabonne.substack.com/p/a-beginners-guide-to-llm-fine-tuning-4bae7d4da672
66
+ - https://maximelabonne.substack.com/p/graph-convolutional-networks-introduction-to-gnns-24b3f60d6c95
67
+ - https://maximelabonne.substack.com/p/4-bit-quantization-with-gptq-36b0f4f02c34
68
+ - https://maximelabonne.substack.com/p/fine-tune-your-own-llama-2-model-in-a-colab-notebook-df9823a04a32
69
+ - https://maximelabonne.substack.com/p/introduction-to-weight-quantization-2494701b9c0c
70
+ - https://maximelabonne.substack.com/p/decoding-strategies-in-large-language-models-9733a8f70539
71
+ - https://maximelabonne.substack.com/p/the-art-of-spending-optimizing-your-marketing-budget-with-nonlinear-optimization-6c8a39afb3c2
72
+ - https://maximelabonne.substack.com/p/create-a-bot-to-find-diamonds-in-minecraft-d836606a993a
73
+ - https://maximelabonne.substack.com/p/constraint-programming-67ac16fa0c81
74
+ - https://maximelabonne.substack.com/p/how-to-design-the-most-powerful-graph-neural-network-3d18b07a6e66
75
+ - https://maximelabonne.substack.com/p/introduction-to-graphsage-in-python-a9e7f9ecf9d7
76
+ - https://maximelabonne.substack.com/p/graph-attention-networks-in-python-975736ac5c0c
77
+ - https://maximelabonne.substack.com/p/integer-programming-vs-linear-programming-in-python-f1be5bb4e60e
78
+ - https://maximelabonne.substack.com/p/introduction-to-linear-programming-in-python-9261e7eb44b
79
+ - https://maximelabonne.substack.com/p/what-is-a-tensor-in-deep-learning-6dedd95d6507
80
+ - https://maximelabonne.substack.com/p/efficiently-iterating-over-rows-in-a-pandas-dataframe-7dd5f9992c01
81
+ - https://maximelabonne.substack.com/p/q-learning-for-beginners-2837b777741
82
+ - https://maximelabonne.substack.com/p/how-to-start-machine-learning-for-developers-in-2022-390af12b193f
83
+ # Generate instruct dataset pipeline parameters
84
+ test_split_size: 0.1
85
+ push_to_huggingface: false
86
+ dataset_id: pauliusztin/llmtwin
87
+ mock: false
configs/evaluating.yaml ADDED
@@ -0,0 +1,9 @@
1
+ settings:
2
+ docker:
3
+ parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
4
+ skip_build: True
5
+ orchestrator.sagemaker:
6
+ synchronous: false
7
+
8
+ parameters:
9
+ is_dummy: true # Change this to 'false' to run the evaluation on the full dataset.
configs/export_artifact_to_json.yaml ADDED
@@ -0,0 +1,13 @@
1
+ settings:
2
+ docker:
3
+ parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
4
+ skip_build: True
5
+ orchestrator.sagemaker:
6
+ synchronous: false
7
+
8
+ parameters:
9
+ artifact_names:
10
+ - raw_documents
11
+ - cleaned_documents
12
+ - instruct_datasets
13
+ - preference_datasets
configs/feature_engineering.yaml ADDED
@@ -0,0 +1,11 @@
1
+ settings:
2
+ docker:
3
+ parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
4
+ skip_build: True
5
+ orchestrator.sagemaker:
6
+ synchronous: false
7
+
8
+ parameters:
9
+ author_full_names:
10
+ - Maxime Labonne
11
+ - Paul Iusztin
configs/generate_instruct_datasets.yaml ADDED
@@ -0,0 +1,13 @@
1
+ settings:
2
+ docker:
3
+ parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
4
+ skip_build: True
5
+ orchestrator.sagemaker:
6
+ synchronous: false
7
+
8
+ parameters:
9
+ test_split_size: 0.1
10
+ dataset_type: "instruction"
11
+ push_to_huggingface: true
12
+ dataset_id: pauliusztin/llmtwin
13
+ mock: false
configs/generate_preference_datasets.yaml ADDED
@@ -0,0 +1,13 @@
1
+ settings:
2
+ docker:
3
+ parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
4
+ skip_build: True
5
+ orchestrator.sagemaker:
6
+ synchronous: false
7
+
8
+ parameters:
9
+ test_split_size: 0.05
10
+ dataset_type: "preference"
11
+ push_to_huggingface: true
12
+ dataset_id: pauliusztin/llmtwin-dpo
13
+ mock: false
configs/training.yaml ADDED
@@ -0,0 +1,14 @@
1
+ settings:
2
+ docker:
3
+ parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
4
+ skip_build: True
5
+ orchestrator.sagemaker:
6
+ synchronous: false
7
+
8
+ parameters:
9
+ finetuning_type: sft
10
+ num_train_epochs: 3
11
+ per_device_train_batch_size: 2
12
+ learning_rate: 3e-4
13
+ dataset_huggingface_workspace: mlabonne
14
+ is_dummy: true # Change this to 'false' to run the training with the full dataset and epochs.
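All of these pipeline configs follow the same two-block shape: `settings` (Docker image and SageMaker orchestrator options) and `parameters` (the inputs consumed by the pipeline). A hedged sketch for inspecting one locally, assuming PyYAML is available and the script is run from the repository root:

```python
import yaml

# Sketch: load a pipeline config and inspect its two top-level blocks.
with open("configs/training.yaml") as f:
    config = yaml.safe_load(f)

print(config["settings"]["orchestrator.sagemaker"])  # {'synchronous': False}
print(config["parameters"]["finetuning_type"], config["parameters"]["num_train_epochs"])
```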
data/artifacts/cleaned_documents.json ADDED
The diff for this file is too large to render. See raw diff
 
data/artifacts/instruct_datasets.json ADDED
The diff for this file is too large to render. See raw diff
 
data/artifacts/preference_datasets.json ADDED
The diff for this file is too large to render. See raw diff
 
data/artifacts/raw_documents.json ADDED
The diff for this file is too large to render. See raw diff
 
data/data_warehouse_raw_data/ArticleDocument.json ADDED
The diff for this file is too large to render. See raw diff
 
data/data_warehouse_raw_data/PostDocument.json ADDED
@@ -0,0 +1 @@
1
+ []
data/data_warehouse_raw_data/RepositoryDocument.json ADDED
@@ -0,0 +1 @@
1
+ []
data/data_warehouse_raw_data/UserDocument.json ADDED
@@ -0,0 +1 @@
1
+ [{"first_name": "Maxime", "last_name": "Labonne", "_id": "eff74089-0271-4319-8543-745c087f4f61"}, {"first_name": "Paul", "last_name": "Iusztin", "_id": "b5fa1f08-75f0-402d-8e88-d1357e346d9e"}]
demonstration.ipynb ADDED
@@ -0,0 +1,1027 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "OBCrx5fSG4Qm"
7
+ },
8
+ "source": [
9
+ "# CS-UY 4613: Project\n",
10
+ "\n",
11
+ "Yufei Zhen\n",
12
+ "\n",
13
+ "macOS: Ventura 13.3.1 (a), GPU: Apple M2 Max"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "markdown",
18
+ "metadata": {
19
+ "id": "IptBGhoVG790"
20
+ },
21
+ "source": [
22
+ "## Setup\n",
23
+ "\n",
24
+ "* video source: [https://www.youtube.com/@pantelism](https://www.youtube.com/@pantelism)\n",
25
+ "\n",
26
+ "* **option 1** (repository source: [https://github.com/PacktPublishing/LLM-Engineers-Handbook](https://github.com/PacktPublishing/LLM-Engineers-Handbook))\n",
27
+ "\n"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": null,
33
+ "metadata": {
34
+ "colab": {
35
+ "base_uri": "https://localhost:8080/"
36
+ },
37
+ "id": "8i3CcnpG_VPn",
38
+ "outputId": "597a492a-6305-43a6-e94e-b74fa8a12d7b"
39
+ },
40
+ "outputs": [
41
+ {
42
+ "name": "stdout",
43
+ "output_type": "stream",
44
+ "text": [
45
+ "Cloning into 'LLM-Engineers-Handbook'...\n",
46
+ "remote: Enumerating objects: 1970, done.\u001b[K\n",
47
+ "remote: Counting objects: 100% (515/515), done.\u001b[K\n",
48
+ "remote: Compressing objects: 100% (138/138), done.\u001b[K\n",
49
+ "remote: Total 1970 (delta 414), reused 377 (delta 377), pack-reused 1455 (from 2)\u001b[K\n",
50
+ "Receiving objects: 100% (1970/1970), 4.77 MiB | 21.22 MiB/s, done.\n",
51
+ "Resolving deltas: 100% (1263/1263), done.\n"
52
+ ]
53
+ }
54
+ ],
55
+ "source": [
56
+ "# !git clone https://github.com/PacktPublishing/LLM-Engineers-Handbook.git"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "# !poetry env use 3.11\n",
66
+ "# !poetry install --without aws\n",
67
+ "# !poetry run pre-commit install"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 1,
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "name": "stdout",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "MPS available: True\n",
80
+ "CUDA available: False\n"
81
+ ]
82
+ }
83
+ ],
84
+ "source": [
85
+ "import torch\n",
86
+ "print(f\"MPS available: {torch.backends.mps.is_available()}\")\n",
87
+ "print(f\"CUDA available: {torch.cuda.is_available()}\")"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "markdown",
92
+ "metadata": {
93
+ "id": "ufyNDhgOYiUh"
94
+ },
95
+ "source": [
96
+ "## RAG Architecture\n",
97
+ "\n",
98
+ "- Integrating into [https://github.com/PacktPublishing/LLM-Engineers-Handbook/tree/main/llm_engineering/application/rag](https://github.com/PacktPublishing/LLM-Engineers-Handbook/tree/main/llm_engineering/application/rag):\n",
99
+ "\n",
100
+ "- Directory overview: \n",
101
+ "\n",
102
+ "```\n",
103
+ ".\n",
104
+ "├── ... \n",
105
+ "├── clips/ # Generated video clip responses\n",
106
+ "├── llm_engineering/ # Core project package\n",
107
+ "│ ├── application/\n",
108
+ "│ │ ├── ...\n",
109
+ "│ │ ├── rag # Main RAG architecture\n",
110
+ "│ │ │ ├── __init__.py\n",
111
+ "│ │ │ ├── base.py\n",
112
+ "│ │ │ ├── multimodel_dispatcher.py (new)\n",
113
+ "│ │ │ ├── pipeline.py (new)\n",
114
+ "│ │ │ ├── prompt_templates.py\n",
115
+ "│ │ │ ├── query_expansion.py\n",
116
+ "│ │ │ ├── reranking.py\n",
117
+ "│ │ │ ├── retriever.py (modified)\n",
118
+ "│ │ │ ├── self_query.py\n",
119
+ "│ │ │ ├── topic_retriever.py (new)\n",
120
+ "│ │ │ ├── video_ingester.py (new)\n",
121
+ "│ │ │ ├── video_processor.py (new)\n",
122
+ "│ ├── domain/\n",
123
+ "│ │ ├── ...\n",
124
+ "│ │ ├── queries.py (modified)\n",
125
+ "│ │ ├── video_chunks.py (new)\n",
126
+ "├── demonstration.ipynb (YOU'RE HERE)\n",
127
+ "```"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "markdown",
132
+ "metadata": {},
133
+ "source": [
134
+ "## Video Ingestion"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 1,
140
+ "metadata": {},
141
+ "outputs": [],
142
+ "source": [
143
+ "video_db = \"/Users/yufeizhen/Desktop/project/videos\""
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 2,
149
+ "metadata": {},
150
+ "outputs": [
151
+ {
152
+ "name": "stderr",
153
+ "output_type": "stream",
154
+ "text": [
155
+ "\u001b[32m2025-05-04 03:25:21.777\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mllm_engineering.settings\u001b[0m:\u001b[36mload_settings\u001b[0m:\u001b[36m94\u001b[0m - \u001b[1mLoading settings from the ZenML secret store.\u001b[0m\n",
156
+ "\u001b[32m2025-05-04 03:25:21.929\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mllm_engineering.settings\u001b[0m:\u001b[36mload_settings\u001b[0m:\u001b[36m99\u001b[0m - \u001b[33m\u001b[1mFailed to load settings from the ZenML secret store. Defaulting to loading the settings from the '.env' file.\u001b[0m\n",
157
+ "\u001b[32m2025-05-04 03:25:22.015\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mllm_engineering.infrastructure.db.mongo\u001b[0m:\u001b[36m__new__\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mConnection to MongoDB with URI successful: mongodb://llm_engineering:[email protected]:27017\u001b[0m\n"
158
+ ]
159
+ },
160
+ {
161
+ "name": "stdout",
162
+ "output_type": "stream",
163
+ "text": [
164
+ "\u001b[1;35mPyTorch version 2.2.2 available.\u001b[0m\n"
165
+ ]
166
+ },
167
+ {
168
+ "name": "stderr",
169
+ "output_type": "stream",
170
+ "text": [
171
+ "\u001b[32m2025-05-04 03:25:23.410\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mllm_engineering.infrastructure.db.qdrant\u001b[0m:\u001b[36m__new__\u001b[0m:\u001b[36m29\u001b[0m - \u001b[1mConnection to Qdrant DB with URI successful: str\u001b[0m\n"
172
+ ]
173
+ },
174
+ {
175
+ "name": "stdout",
176
+ "output_type": "stream",
177
+ "text": [
178
+ "\u001b[1;35mLoad pretrained SentenceTransformer: all-MiniLM-L6-v2\u001b[0m\n",
179
+ "Initializing fallback TextEmbedder\n",
180
+ "\u001b[1;35mLoad pretrained SentenceTransformer: all-MiniLM-L6-v2\u001b[0m\n",
181
+ "Loading CLIP model: openai/clip-vit-base-patch32\n",
182
+ "CLIP model loaded successfully\n",
183
+ "Initialized embedders\n",
184
+ "Loaded NLP model\n",
185
+ "Loaded BERTopic\n",
186
+ "Processing videos from: /Users/yufeizhen/Desktop/project/videos\n",
187
+ "Already processed 8 videos\n",
188
+ "Previously processed videos:\n",
189
+ " - 9CGGh6ivg68\n",
190
+ " - FCQ-rih6cHY\n",
191
+ " - TV-DjM8242s\n",
192
+ " - WXoOohWU28Y\n",
193
+ " - eFgkZKhNUdM\n",
194
+ " - eQ6UE968Xe4\n",
195
+ " - lb_5AdUpfuA\n",
196
+ " - rCVlIVKqqGE\n",
197
+ "Found 8 video folders\n",
198
+ "Will process 0 videos (8 skipped)\n",
199
+ "Skipping TV-DjM8242s (already processed)\n",
200
+ "Skipping eFgkZKhNUdM (already processed)\n",
201
+ "Skipping eQ6UE968Xe4 (already processed)\n",
202
+ "Skipping rCVlIVKqqGE (already processed)\n",
203
+ "Skipping lb_5AdUpfuA (already processed)\n",
204
+ "Skipping FCQ-rih6cHY (already processed)\n",
205
+ "Skipping 9CGGh6ivg68 (already processed)\n",
206
+ "Skipping WXoOohWU28Y (already processed)\n",
207
+ "\n",
208
+ "All videos processed!\n",
209
+ "Total processed videos: 8\n"
210
+ ]
211
+ }
212
+ ],
213
+ "source": [
214
+ "from llm_engineering.application.rag.video_ingester import VideoIngester\n",
215
+ "\n",
216
+ "ingester = VideoIngester(video_root=video_db)\n",
217
+ "# ingester.process_video_library(force_reprocess=True)\n",
218
+ "ingester.process_video_library()"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": 3,
224
+ "metadata": {},
225
+ "outputs": [
226
+ {
227
+ "name": "stdout",
228
+ "output_type": "stream",
229
+ "text": [
230
+ "Total stored vectors: 403\n"
231
+ ]
232
+ }
233
+ ],
234
+ "source": [
235
+ "from qdrant_client import QdrantClient\n",
236
+ "\n",
237
+ "client = QdrantClient(path=\"/Users/yufeizhen/Desktop/project/qdrant_storage\")\n",
238
+ "print(\"Total stored vectors:\", client.count(\"video_chunks\").count)"
239
+ ]
240
+ },
241
+ {
242
+ "attachments": {},
243
+ "cell_type": "markdown",
244
+ "metadata": {},
245
+ "source": [
246
+ "## Video Q&A"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": 3,
252
+ "metadata": {},
253
+ "outputs": [
254
+ {
255
+ "name": "stdout",
256
+ "output_type": "stream",
257
+ "text": [
258
+ "Initializing VideoQAEngine\n",
259
+ "Video root: /Users/yufeizhen/Desktop/project/videos\n",
260
+ "Qdrant storage path: /Users/yufeizhen/Desktop/project/qdrant_storage\n",
261
+ "Connected to Qdrant storage at: /Users/yufeizhen/Desktop/project/qdrant_storage\n",
262
+ "Available collections: collections=[CollectionDescription(name='video_chunks')]\n",
263
+ "Found video_chunks collection with 403 points\n",
264
+ "Initializing fallback TextEmbedder\n",
265
+ "\u001b[1;35mLoad pretrained SentenceTransformer: all-MiniLM-L6-v2\u001b[0m\n",
266
+ "Loading CLIP model: openai/clip-vit-base-patch32\n",
267
+ "CLIP model loaded successfully\n",
268
+ "VideoQAEngine initialized successfully\n"
269
+ ]
270
+ }
271
+ ],
272
+ "source": [
273
+ "from llm_engineering.application.rag.pipeline import VideoQAEngine\n",
274
+ "\n",
275
+ "engine = VideoQAEngine(video_root=video_db)\n",
276
+ "\n",
277
+ "def respond(question):\n",
278
+ " clips = engine.ask(question)\n",
279
+ " return [(str(clip[\"path\"]), f\"Relevance: {clip['score']:.2f}\") for clip in clips]"
280
+ ]
281
+ },
282
+ {
283
+ "cell_type": "code",
284
+ "execution_count": 4,
285
+ "metadata": {},
286
+ "outputs": [],
287
+ "source": [
288
+ "question = \"Using only the videos, explain the the binary cross entropy loss function.\""
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "execution_count": 5,
294
+ "metadata": {},
295
+ "outputs": [
296
+ {
297
+ "name": "stdout",
298
+ "output_type": "stream",
299
+ "text": [
300
+ "\n",
301
+ "--- Processing query: 'Using only the videos, explain the the binary cross entropy loss function.' ---\n",
302
+ "Retrieving relevant video segments...\n",
303
+ "Encoding query with CLIP: 'Using only the videos, explain the the binary cros...'\n",
304
+ "Cleaned text for CLIP: Using only the videos, explain the the binary cros...\n",
305
+ "Query embedded successfully\n",
306
+ "Sending search request to Qdrant (attempt 1/5)\n",
307
+ "Creating fresh connection to Qdrant...\n",
308
+ "Search successful, found 3 results\n",
309
+ "Retrieval completed in 0.07 seconds\n",
310
+ "Found 3 relevant video segments\n",
311
+ "\n",
312
+ "Processing result 1/3:\n",
313
+ " Video ID: eFgkZKhNUdM\n",
314
+ " Timestamps: 1270.0s - 1302.0s\n",
315
+ " Score: 0.8472\n",
316
+ " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
317
+ " Creating clip to: clips/clip_eFgkZKhNUdM_1270_0.847.mp4\n",
318
+ " Clip created successfully\n",
319
+ "\n",
320
+ "Processing result 2/3:\n",
321
+ " Video ID: eFgkZKhNUdM\n",
322
+ " Timestamps: 642.0s - 647.0s\n",
323
+ " Score: 0.8467\n",
324
+ " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
325
+ " Creating clip to: clips/clip_eFgkZKhNUdM_642_0.847.mp4\n",
326
+ " Clip created successfully\n",
327
+ "\n",
328
+ "Processing result 3/3:\n",
329
+ " Video ID: eFgkZKhNUdM\n",
330
+ " Timestamps: 874.0s - 882.0s\n",
331
+ " Score: 0.8379\n",
332
+ " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
333
+ " Creating clip to: clips/clip_eFgkZKhNUdM_874_0.838.mp4\n",
334
+ " Clip created successfully\n",
335
+ "\n",
336
+ "Processed 3 clips successfully\n"
337
+ ]
338
+ },
339
+ {
340
+ "data": {
341
+ "text/plain": [
342
+ "[('clips/clip_eFgkZKhNUdM_1270_0.847.mp4', 'Relevance: 0.85'),\n",
343
+ " ('clips/clip_eFgkZKhNUdM_642_0.847.mp4', 'Relevance: 0.85'),\n",
344
+ " ('clips/clip_eFgkZKhNUdM_874_0.838.mp4', 'Relevance: 0.84')]"
345
+ ]
346
+ },
347
+ "execution_count": 5,
348
+ "metadata": {},
349
+ "output_type": "execute_result"
350
+ }
351
+ ],
352
+ "source": [
353
+ "respond(question)"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "markdown",
358
+ "metadata": {},
359
+ "source": [
360
+ "## Gradio App"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": 4,
366
+ "metadata": {},
367
+ "outputs": [
368
+ {
369
+ "name": "stdout",
370
+ "output_type": "stream",
371
+ "text": [
372
+ "\u001b[1;35mHTTP Request: GET \u001b[0m\u001b[34mhttps://api.gradio.app/pkg-version\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n"
373
+ ]
374
+ }
375
+ ],
376
+ "source": [
377
+ "import gradio as gr\n",
378
+ "\n",
379
+ "interface = gr.Interface(\n",
380
+ " fn=respond,\n",
381
+ " inputs=gr.Textbox(label=\"Ask about the video content\"),\n",
382
+ " outputs=gr.Gallery(label=\"Relevant Video Clips\"),\n",
383
+ " examples=[\n",
384
+ " [\"Using only the videos, explain how ResNets work.\"],\n",
385
+ " [\"Using only the videos, explain the advantages of CNNs over fully connected networks.\"],\n",
386
+ " [\"Using only the videos, explain the the binary cross entropy loss function.\"]\n",
387
+ " ]\n",
388
+ ")"
389
+ ]
390
+ },
391
+ {
392
+ "cell_type": "code",
393
+ "execution_count": 5,
394
+ "metadata": {},
395
+ "outputs": [
396
+ {
397
+ "name": "stdout",
398
+ "output_type": "stream",
399
+ "text": [
400
+ "* Running on local URL: http://127.0.0.1:7860\n",
401
+ "\u001b[1;35mHTTP Request: GET \u001b[0m\u001b[34mhttp://127.0.0.1:7860/gradio_api/startup-events\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n",
402
+ "\u001b[1;35mHTTP Request: HEAD \u001b[0m\u001b[34mhttp://127.0.0.1:7860/\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n",
403
+ "\u001b[1;35mHTTP Request: GET \u001b[0m\u001b[34mhttps://api.gradio.app/v3/tunnel-request\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n",
404
+ "* Running on public URL: https://382d4d0bacff86ee02.gradio.live\n",
405
+ "\n",
406
+ "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n",
407
+ "\u001b[1;35mHTTP Request: HEAD \u001b[0m\u001b[34mhttps://382d4d0bacff86ee02.gradio.live\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n"
408
+ ]
409
+ },
410
+ {
411
+ "data": {
412
+ "text/html": [
413
+ "<div><iframe src=\"https://382d4d0bacff86ee02.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
414
+ ],
415
+ "text/plain": [
416
+ "<IPython.core.display.HTML object>"
417
+ ]
418
+ },
419
+ "metadata": {},
420
+ "output_type": "display_data"
421
+ },
422
+ {
423
+ "data": {
424
+ "text/plain": []
425
+ },
426
+ "execution_count": 5,
427
+ "metadata": {},
428
+ "output_type": "execute_result"
429
+ },
430
+ {
431
+ "name": "stdout",
432
+ "output_type": "stream",
433
+ "text": [
434
+ "\n",
435
+ "--- Processing query: 'Using only the videos, explain the the binary cross entropy loss function.' ---\n",
436
+ "Retrieving relevant video segments...\n",
437
+ "Encoding query with CLIP: 'Using only the videos, explain the the binary cros...'\n",
438
+ "Cleaned text for CLIP: Using only the videos, explain the the binary cross entropy loss function....\n",
439
+ "Cleaned text for CLIP: Using only the videos, explain the the binary cros...\n",
440
+ "Query embedded successfully\n",
441
+ "Sending search request to Qdrant (attempt 1/5)\n",
442
+ "Search successful, found 3 results\n",
443
+ "Retrieval completed in 0.34 seconds\n",
444
+ "Found 3 relevant video segments\n",
445
+ "\n",
446
+ "Processing result 1/3:\n",
447
+ " Video ID: eFgkZKhNUdM\n",
448
+ " Timestamps: 1270.0s - 1302.0s\n",
449
+ " Score: 0.8472\n",
450
+ " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
451
+ " Creating clip to: clips/clip_eFgkZKhNUdM_1270_0.847.mp4\n",
452
+ " Clip created successfully\n",
453
+ "\n",
454
+ "Processing result 2/3:\n",
455
+ " Video ID: eFgkZKhNUdM\n",
456
+ " Timestamps: 642.0s - 647.0s\n",
457
+ " Score: 0.8467\n",
458
+ " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
459
+ " Creating clip to: clips/clip_eFgkZKhNUdM_642_0.847.mp4\n",
460
+ " Clip created successfully\n",
461
+ "\n",
462
+ "Processing result 3/3:\n",
463
+ " Video ID: eFgkZKhNUdM\n",
464
+ " Timestamps: 874.0s - 882.0s\n",
465
+ " Score: 0.8379\n",
466
+ " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
467
+ " Creating clip to: clips/clip_eFgkZKhNUdM_874_0.838.mp4\n",
468
+ " Clip created successfully\n",
469
+ "\n",
470
+ "Processed 3 clips successfully\n"
471
+ ]
472
+ }
473
+ ],
474
+ "source": [
475
+ "interface.launch(share=True)"
476
+ ]
477
+ },
478
+ {
479
+ "cell_type": "code",
480
+ "execution_count": 9,
481
+ "metadata": {},
482
+ "outputs": [
483
+ {
484
+ "name": "stdout",
485
+ "output_type": "stream",
486
+ "text": [
487
+ "Initializing VideoQAEngine\n",
488
+ "Video root: /Users/yufeizhen/Desktop/project/videos\n",
489
+ "Qdrant storage path: /Users/yufeizhen/Desktop/project/qdrant_storage\n",
490
+ "Connected to Qdrant storage at: /Users/yufeizhen/Desktop/project/qdrant_storage\n",
491
+ "Available collections: collections=[CollectionDescription(name='video_chunks')]\n",
492
+ "Found video_chunks collection with 403 points\n",
493
+ "Initializing fallback TextEmbedder\n",
494
+ "\u001b[1;35mLoad pretrained SentenceTransformer: all-MiniLM-L6-v2\u001b[0m\n",
495
+ "Loading CLIP model: openai/clip-vit-base-patch32\n",
496
+ "CLIP model loaded successfully\n",
497
+ "VideoQAEngine initialized successfully\n"
498
+ ]
499
+ },
500
+ {
501
+ "name": "stdout",
502
+ "output_type": "stream",
503
+ "text": [
504
+ "\u001b[1;35mHTTP Request: GET \u001b[0m\u001b[34mhttps://api.gradio.app/pkg-version\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n"
505
+ ]
506
+ }
507
+ ],
508
+ "source": [
509
+ "import gradio as gr\n",
510
+ "from llm_engineering.application.rag.pipeline import VideoQAEngine\n",
511
+ "\n",
512
+ "# Initialize the VideoQAEngine with the video database\n",
513
+ "video_db = \"/Users/yufeizhen/Desktop/project/videos\"\n",
514
+ "engine = VideoQAEngine(video_root=video_db)\n",
515
+ "\n",
516
+ "# Define the chat function that processes messages and returns relevant video clips\n",
517
+ "def chat(message, history):\n",
518
+ " # Process message to get relevant clips\n",
519
+ " clips = engine.ask(message)\n",
520
+ " \n",
521
+ " # Format for display\n",
522
+ " clips_gallery = [(str(clip[\"path\"]), \"Relevance: {:.2f}\".format(clip['score'])) for clip in clips]\n",
523
+ " \n",
524
+ " # Return both a text response and the clips\n",
525
+ " return \"Here are the relevant video clips for: '{}'\".format(message), clips_gallery\n",
526
+ "\n",
527
+ "# Create a more flexible interface using Blocks\n",
528
+ "with gr.Blocks(theme=\"soft\") as demo:\n",
529
+ " gr.Markdown(\"# Chat with your Video Library\")\n",
530
+ " gr.Markdown(\"Ask questions about the video content and get relevant clips. You can continue the conversation with follow-up questions.\")\n",
531
+ " \n",
532
+ " # Create chatbot for conversation history\n",
533
+ " chatbot = gr.Chatbot(height=300)\n",
534
+ " \n",
535
+ " # Create gallery to display video clips\n",
536
+ " gallery = gr.Gallery(label=\"Relevant Video Clips\", show_label=True)\n",
537
+ " \n",
538
+ " # Create message input\n",
539
+ " msg = gr.Textbox(\n",
540
+ " placeholder=\"Ask about the video content...\", \n",
541
+ " label=\"Your Question\",\n",
542
+ " show_label=False\n",
543
+ " )\n",
544
+ " \n",
545
+ " # Define clear button\n",
546
+ " clear = gr.Button(\"Clear\")\n",
547
+ " \n",
548
+ " # Example questions\n",
549
+ " gr.Examples(\n",
550
+ " examples=[\n",
551
+ " \"Using only the videos, explain how ResNets work.\",\n",
552
+ " \"Using only the videos, explain the advantages of CNNs over fully connected networks.\",\n",
553
+ " \"Using only the videos, explain the the binary cross entropy loss function.\"\n",
554
+ " ],\n",
555
+ " inputs=msg\n",
556
+ " )\n",
557
+ " \n",
558
+ " # Define the chat function that updates both chatbot and gallery\n",
559
+ " def respond(message, chat_history):\n",
560
+ " # Get text response and clips\n",
561
+ " response, clips = chat(message, chat_history)\n",
562
+ " \n",
563
+ " # Update chat history\n",
564
+ " chat_history.append((message, response))\n",
565
+ " \n",
566
+ " # Return updated chat history and gallery\n",
567
+ " return \"\", chat_history, clips\n",
568
+ " \n",
569
+ " # Set up the event handlers\n",
570
+ " msg.submit(respond, [msg, chatbot], [msg, chatbot, gallery])\n",
571
+ " clear.click(lambda: ([], [], None), None, [chatbot, gallery, msg])"
572
+ ]
573
+ },
574
+ {
575
+ "cell_type": "code",
576
+ "execution_count": 10,
577
+ "metadata": {},
578
+ "outputs": [
579
+ {
580
+ "name": "stdout",
581
+ "output_type": "stream",
582
+ "text": [
583
+ "* Running on local URL: http://127.0.0.1:7861\n",
584
+ "\u001b[1;35mHTTP Request: GET \u001b[0m\u001b[34mhttp://127.0.0.1:7861/gradio_api/startup-events\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n",
585
+ "\u001b[1;35mHTTP Request: HEAD \u001b[0m\u001b[34mhttp://127.0.0.1:7861/\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n",
586
+ "\u001b[1;35mHTTP Request: GET \u001b[0m\u001b[34mhttps://api.gradio.app/v3/tunnel-request\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n",
587
+ "* Running on public URL: https://48d861a2319613eb9b.gradio.live\n",
588
+ "\n",
589
+ "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n",
590
+ "\u001b[1;35mHTTP Request: HEAD \u001b[0m\u001b[34mhttps://48d861a2319613eb9b.gradio.live\u001b[1;35m \"HTTP/1.1 200 OK\"\u001b[0m\n"
591
+ ]
592
+ },
593
+ {
594
+ "data": {
595
+ "text/html": [
596
+ "<div><iframe src=\"https://48d861a2319613eb9b.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
597
+ ],
598
+ "text/plain": [
599
+ "<IPython.core.display.HTML object>"
600
+ ]
601
+ },
602
+ "metadata": {},
603
+ "output_type": "display_data"
604
+ },
605
+ {
606
+ "data": {
607
+ "text/plain": []
608
+ },
609
+ "execution_count": 10,
610
+ "metadata": {},
611
+ "output_type": "execute_result"
612
+ },
613
+ {
614
+ "name": "stdout",
615
+ "output_type": "stream",
616
+ "text": [
617
+ "\n",
618
+ "--- Processing query: 'Using only the videos, explain the the binary cross entropy loss function.' ---\n",
619
+ "Retrieving relevant video segments...\n",
620
+ "Encoding query with CLIP: 'Using only the videos, explain the the binary cros...'\n",
621
+ "Cleaned text for CLIP: Using only the videos, explain the the binary cros...\n",
622
+ "Query embedded successfully\n",
623
+ "Sending search request to Qdrant (attempt 1/5)\n",
624
+ "Creating fresh connection to Qdrant...\n",
625
+ "Search successful, found 3 results\n",
626
+ "Retrieval completed in 0.07 seconds\n",
627
+ "Found 3 relevant video segments\n",
628
+ "\n",
629
+ "Processing result 1/3:\n",
630
+ " Video ID: eFgkZKhNUdM\n",
631
+ " Timestamps: 1270.0s - 1302.0s\n",
632
+ " Score: 0.8472\n",
633
+ " Found alternative video path: /Users/yufeizhen/Desktop/project/videos/eFgkZKhNUdM/eFgkZKhNUdM.mp4\n",
634
+ " Creating clip to: clips/clip_eFgkZKhNUdM_1270_0.847.mp4\n"
635
+ ]
636
+ }
637
+ ],
638
+ "source": [
639
+ "demo.launch(share=True)"
640
+ ]
641
+ }
642
+ ],
643
+ "metadata": {
644
+ "accelerator": "GPU",
645
+ "colab": {
646
+ "collapsed_sections": [
647
+ "gFdZON-DKOlx",
648
+ "KVkt7HBUKTig",
649
+ "d9KkwKhjKXgr",
650
+ "6NQUu-rUKfln"
651
+ ],
652
+ "gpuType": "A100",
653
+ "machine_shape": "hm",
654
+ "provenance": []
655
+ },
656
+ "kernelspec": {
657
+ "display_name": "llm-engineering---hH_lZm-py3.11",
658
+ "language": "python",
659
+ "name": "python3"
660
+ },
661
+ "language_info": {
662
+ "codemirror_mode": {
663
+ "name": "ipython",
664
+ "version": 3
665
+ },
666
+ "file_extension": ".py",
667
+ "mimetype": "text/x-python",
668
+ "name": "python",
669
+ "nbconvert_exporter": "python",
670
+ "pygments_lexer": "ipython3",
671
+ "version": "3.11.2"
672
+ },
673
+ "vscode": {
674
+ "interpreter": {
675
+ "hash": "b62b85bf5dbe6dfb396652aed826e60c089a2288076109f466cd63de941fd51e"
676
+ }
677
+ },
678
+ "widgets": {
679
+ "application/vnd.jupyter.widget-state+json": {
680
+ "01c5ce2d579c4b7d8ba2607fc6d76328": {
681
+ "model_module": "@jupyter-widgets/controls",
682
+ "model_module_version": "1.5.0",
683
+ "model_name": "HBoxModel",
684
+ "state": {
685
+ "_dom_classes": [],
686
+ "_model_module": "@jupyter-widgets/controls",
687
+ "_model_module_version": "1.5.0",
688
+ "_model_name": "HBoxModel",
689
+ "_view_count": null,
690
+ "_view_module": "@jupyter-widgets/controls",
691
+ "_view_module_version": "1.5.0",
692
+ "_view_name": "HBoxView",
693
+ "box_style": "",
694
+ "children": [
695
+ "IPY_MODEL_15bf9c9d1cf44b5abee88dddc74a927b",
696
+ "IPY_MODEL_05fb5c50098f4e5997c07afed3b9383e",
697
+ "IPY_MODEL_cf25f66f1ca943be8af0c7ce14727f21"
698
+ ],
699
+ "layout": "IPY_MODEL_d3f93597cb6640a29656daa95153f07f"
700
+ }
701
+ },
702
+ "05fb5c50098f4e5997c07afed3b9383e": {
703
+ "model_module": "@jupyter-widgets/controls",
704
+ "model_module_version": "1.5.0",
705
+ "model_name": "FloatProgressModel",
706
+ "state": {
707
+ "_dom_classes": [],
708
+ "_model_module": "@jupyter-widgets/controls",
709
+ "_model_module_version": "1.5.0",
710
+ "_model_name": "FloatProgressModel",
711
+ "_view_count": null,
712
+ "_view_module": "@jupyter-widgets/controls",
713
+ "_view_module_version": "1.5.0",
714
+ "_view_name": "ProgressView",
715
+ "bar_style": "success",
716
+ "description": "",
717
+ "description_tooltip": null,
718
+ "layout": "IPY_MODEL_7d39d74185e549e483dd3bc9a5fe8c76",
719
+ "max": 1,
720
+ "min": 0,
721
+ "orientation": "horizontal",
722
+ "style": "IPY_MODEL_c66b78a449c6451fbe757d1082304de5",
723
+ "value": 1
724
+ }
725
+ },
726
+ "15bf9c9d1cf44b5abee88dddc74a927b": {
727
+ "model_module": "@jupyter-widgets/controls",
728
+ "model_module_version": "1.5.0",
729
+ "model_name": "HTMLModel",
730
+ "state": {
731
+ "_dom_classes": [],
732
+ "_model_module": "@jupyter-widgets/controls",
733
+ "_model_module_version": "1.5.0",
734
+ "_model_name": "HTMLModel",
735
+ "_view_count": null,
736
+ "_view_module": "@jupyter-widgets/controls",
737
+ "_view_module_version": "1.5.0",
738
+ "_view_name": "HTMLView",
739
+ "description": "",
740
+ "description_tooltip": null,
741
+ "layout": "IPY_MODEL_2aa0d8f2c2bc4b3083edfcc5dc7ccc10",
742
+ "placeholder": "​",
743
+ "style": "IPY_MODEL_a0e715cd015d499ba015c54e810bf81d",
744
+ "value": "Batches: 100%"
745
+ }
746
+ },
747
+ "2aa0d8f2c2bc4b3083edfcc5dc7ccc10": {
748
+ "model_module": "@jupyter-widgets/base",
749
+ "model_module_version": "1.2.0",
750
+ "model_name": "LayoutModel",
751
+ "state": {
752
+ "_model_module": "@jupyter-widgets/base",
753
+ "_model_module_version": "1.2.0",
754
+ "_model_name": "LayoutModel",
755
+ "_view_count": null,
756
+ "_view_module": "@jupyter-widgets/base",
757
+ "_view_module_version": "1.2.0",
758
+ "_view_name": "LayoutView",
759
+ "align_content": null,
760
+ "align_items": null,
761
+ "align_self": null,
762
+ "border": null,
763
+ "bottom": null,
764
+ "display": null,
765
+ "flex": null,
766
+ "flex_flow": null,
767
+ "grid_area": null,
768
+ "grid_auto_columns": null,
769
+ "grid_auto_flow": null,
770
+ "grid_auto_rows": null,
771
+ "grid_column": null,
772
+ "grid_gap": null,
773
+ "grid_row": null,
774
+ "grid_template_areas": null,
775
+ "grid_template_columns": null,
776
+ "grid_template_rows": null,
777
+ "height": null,
778
+ "justify_content": null,
779
+ "justify_items": null,
780
+ "left": null,
781
+ "margin": null,
782
+ "max_height": null,
783
+ "max_width": null,
784
+ "min_height": null,
785
+ "min_width": null,
786
+ "object_fit": null,
787
+ "object_position": null,
788
+ "order": null,
789
+ "overflow": null,
790
+ "overflow_x": null,
791
+ "overflow_y": null,
792
+ "padding": null,
793
+ "right": null,
794
+ "top": null,
795
+ "visibility": null,
796
+ "width": null
797
+ }
798
+ },
799
+ "5072655a4a724daa9d7b660e1709fa11": {
800
+ "model_module": "@jupyter-widgets/controls",
801
+ "model_module_version": "1.5.0",
802
+ "model_name": "DescriptionStyleModel",
803
+ "state": {
804
+ "_model_module": "@jupyter-widgets/controls",
805
+ "_model_module_version": "1.5.0",
806
+ "_model_name": "DescriptionStyleModel",
807
+ "_view_count": null,
808
+ "_view_module": "@jupyter-widgets/base",
809
+ "_view_module_version": "1.2.0",
810
+ "_view_name": "StyleView",
811
+ "description_width": ""
812
+ }
813
+ },
814
+ "7d39d74185e549e483dd3bc9a5fe8c76": {
815
+ "model_module": "@jupyter-widgets/base",
816
+ "model_module_version": "1.2.0",
817
+ "model_name": "LayoutModel",
818
+ "state": {
819
+ "_model_module": "@jupyter-widgets/base",
820
+ "_model_module_version": "1.2.0",
821
+ "_model_name": "LayoutModel",
822
+ "_view_count": null,
823
+ "_view_module": "@jupyter-widgets/base",
824
+ "_view_module_version": "1.2.0",
825
+ "_view_name": "LayoutView",
826
+ "align_content": null,
827
+ "align_items": null,
828
+ "align_self": null,
829
+ "border": null,
830
+ "bottom": null,
831
+ "display": null,
832
+ "flex": null,
833
+ "flex_flow": null,
834
+ "grid_area": null,
835
+ "grid_auto_columns": null,
836
+ "grid_auto_flow": null,
837
+ "grid_auto_rows": null,
838
+ "grid_column": null,
839
+ "grid_gap": null,
840
+ "grid_row": null,
841
+ "grid_template_areas": null,
842
+ "grid_template_columns": null,
843
+ "grid_template_rows": null,
844
+ "height": null,
845
+ "justify_content": null,
846
+ "justify_items": null,
847
+ "left": null,
848
+ "margin": null,
849
+ "max_height": null,
850
+ "max_width": null,
851
+ "min_height": null,
852
+ "min_width": null,
853
+ "object_fit": null,
854
+ "object_position": null,
855
+ "order": null,
856
+ "overflow": null,
857
+ "overflow_x": null,
858
+ "overflow_y": null,
859
+ "padding": null,
860
+ "right": null,
861
+ "top": null,
862
+ "visibility": null,
863
+ "width": null
864
+ }
865
+ },
866
+ "a0e715cd015d499ba015c54e810bf81d": {
867
+ "model_module": "@jupyter-widgets/controls",
868
+ "model_module_version": "1.5.0",
869
+ "model_name": "DescriptionStyleModel",
870
+ "state": {
871
+ "_model_module": "@jupyter-widgets/controls",
872
+ "_model_module_version": "1.5.0",
873
+ "_model_name": "DescriptionStyleModel",
874
+ "_view_count": null,
875
+ "_view_module": "@jupyter-widgets/base",
876
+ "_view_module_version": "1.2.0",
877
+ "_view_name": "StyleView",
878
+ "description_width": ""
879
+ }
880
+ },
881
+ "c66b78a449c6451fbe757d1082304de5": {
882
+ "model_module": "@jupyter-widgets/controls",
883
+ "model_module_version": "1.5.0",
884
+ "model_name": "ProgressStyleModel",
885
+ "state": {
886
+ "_model_module": "@jupyter-widgets/controls",
887
+ "_model_module_version": "1.5.0",
888
+ "_model_name": "ProgressStyleModel",
889
+ "_view_count": null,
890
+ "_view_module": "@jupyter-widgets/base",
891
+ "_view_module_version": "1.2.0",
892
+ "_view_name": "StyleView",
893
+ "bar_color": null,
894
+ "description_width": ""
895
+ }
896
+ },
897
+ "cb41643480ca4834bcc03611c4783326": {
898
+ "model_module": "@jupyter-widgets/base",
899
+ "model_module_version": "1.2.0",
900
+ "model_name": "LayoutModel",
901
+ "state": {
902
+ "_model_module": "@jupyter-widgets/base",
903
+ "_model_module_version": "1.2.0",
904
+ "_model_name": "LayoutModel",
905
+ "_view_count": null,
906
+ "_view_module": "@jupyter-widgets/base",
907
+ "_view_module_version": "1.2.0",
908
+ "_view_name": "LayoutView",
909
+ "align_content": null,
910
+ "align_items": null,
911
+ "align_self": null,
912
+ "border": null,
913
+ "bottom": null,
914
+ "display": null,
915
+ "flex": null,
916
+ "flex_flow": null,
917
+ "grid_area": null,
918
+ "grid_auto_columns": null,
919
+ "grid_auto_flow": null,
920
+ "grid_auto_rows": null,
921
+ "grid_column": null,
922
+ "grid_gap": null,
923
+ "grid_row": null,
924
+ "grid_template_areas": null,
925
+ "grid_template_columns": null,
926
+ "grid_template_rows": null,
927
+ "height": null,
928
+ "justify_content": null,
929
+ "justify_items": null,
930
+ "left": null,
931
+ "margin": null,
932
+ "max_height": null,
933
+ "max_width": null,
934
+ "min_height": null,
935
+ "min_width": null,
936
+ "object_fit": null,
937
+ "object_position": null,
938
+ "order": null,
939
+ "overflow": null,
940
+ "overflow_x": null,
941
+ "overflow_y": null,
942
+ "padding": null,
943
+ "right": null,
944
+ "top": null,
945
+ "visibility": null,
946
+ "width": null
947
+ }
948
+ },
949
+ "cf25f66f1ca943be8af0c7ce14727f21": {
950
+ "model_module": "@jupyter-widgets/controls",
951
+ "model_module_version": "1.5.0",
952
+ "model_name": "HTMLModel",
953
+ "state": {
954
+ "_dom_classes": [],
955
+ "_model_module": "@jupyter-widgets/controls",
956
+ "_model_module_version": "1.5.0",
957
+ "_model_name": "HTMLModel",
958
+ "_view_count": null,
959
+ "_view_module": "@jupyter-widgets/controls",
960
+ "_view_module_version": "1.5.0",
961
+ "_view_name": "HTMLView",
962
+ "description": "",
963
+ "description_tooltip": null,
964
+ "layout": "IPY_MODEL_cb41643480ca4834bcc03611c4783326",
965
+ "placeholder": "​",
966
+ "style": "IPY_MODEL_5072655a4a724daa9d7b660e1709fa11",
967
+ "value": " 1/1 [00:00<00:00,  3.12it/s]"
968
+ }
969
+ },
970
+ "d3f93597cb6640a29656daa95153f07f": {
971
+ "model_module": "@jupyter-widgets/base",
972
+ "model_module_version": "1.2.0",
973
+ "model_name": "LayoutModel",
974
+ "state": {
975
+ "_model_module": "@jupyter-widgets/base",
976
+ "_model_module_version": "1.2.0",
977
+ "_model_name": "LayoutModel",
978
+ "_view_count": null,
979
+ "_view_module": "@jupyter-widgets/base",
980
+ "_view_module_version": "1.2.0",
981
+ "_view_name": "LayoutView",
982
+ "align_content": null,
983
+ "align_items": null,
984
+ "align_self": null,
985
+ "border": null,
986
+ "bottom": null,
987
+ "display": null,
988
+ "flex": null,
989
+ "flex_flow": null,
990
+ "grid_area": null,
991
+ "grid_auto_columns": null,
992
+ "grid_auto_flow": null,
993
+ "grid_auto_rows": null,
994
+ "grid_column": null,
995
+ "grid_gap": null,
996
+ "grid_row": null,
997
+ "grid_template_areas": null,
998
+ "grid_template_columns": null,
999
+ "grid_template_rows": null,
1000
+ "height": null,
1001
+ "justify_content": null,
1002
+ "justify_items": null,
1003
+ "left": null,
1004
+ "margin": null,
1005
+ "max_height": null,
1006
+ "max_width": null,
1007
+ "min_height": null,
1008
+ "min_width": null,
1009
+ "object_fit": null,
1010
+ "object_position": null,
1011
+ "order": null,
1012
+ "overflow": null,
1013
+ "overflow_x": null,
1014
+ "overflow_y": null,
1015
+ "padding": null,
1016
+ "right": null,
1017
+ "top": null,
1018
+ "visibility": null,
1019
+ "width": null
1020
+ }
1021
+ }
1022
+ }
1023
+ }
1024
+ },
1025
+ "nbformat": 4,
1026
+ "nbformat_minor": 0
1027
+ }
docker-compose.yml ADDED
@@ -0,0 +1,40 @@
1
+ services:
2
+ mongo:
3
+ image: mongo:latest
4
+ container_name: "llm_engineering_mongo"
5
+ logging:
6
+ options:
7
+ max-size: 1g
8
+ environment:
9
+ MONGO_INITDB_ROOT_USERNAME: "llm_engineering"
10
+ MONGO_INITDB_ROOT_PASSWORD: "llm_engineering"
11
+ ports:
12
+ - 27017:27017
13
+ volumes:
14
+ - mongo_data:/data/db
15
+ networks:
16
+ - local
17
+ restart: always
18
+
19
+ qdrant:
20
+ image: qdrant/qdrant:latest
21
+ container_name: "llm_engineering_qdrant"
22
+ ports:
23
+ - 6333:6333
24
+ - 6334:6334
25
+ expose:
26
+ - 6333
27
+ - 6334
28
+ volumes:
29
+ - qdrant_data:/qdrant/storage
30
+ networks:
31
+ - local
32
+ restart: always
33
+
34
+ volumes:
35
+ mongo_data:
36
+ qdrant_data:
37
+
38
+ networks:
39
+ local:
40
+ driver: bridge
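The compose file exposes MongoDB on port 27017 and Qdrant on 6333/6334, matching the defaults in `.env.example`. A minimal connectivity check, assuming the stack was started with `docker compose up -d` and that `pymongo` and `qdrant-client` are installed:

```python
from pymongo import MongoClient
from qdrant_client import QdrantClient

# Sketch: verify the services defined in docker-compose.yml are reachable.
# Credentials match MONGO_INITDB_ROOT_USERNAME / MONGO_INITDB_ROOT_PASSWORD above.
mongo = MongoClient("mongodb://llm_engineering:llm_engineering@127.0.0.1:27017")
print(mongo.server_info()["version"])  # the MongoDB server version string

qdrant = QdrantClient(url="http://localhost:6333")
print(qdrant.get_collections())  # empty on a fresh volume, until the pipelines write to it
```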
en_core_web_sm-3.7.0-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6215d71a3212690e9aec49408a27e3fe6ad7cd6c715476e93d70dc784041e93e
3
+ size 12803377
images/cover_plus.png ADDED

Git LFS Details

  • SHA256: 6c1fb593200bd1869523b5a0f98fa3a6347e767bb857d9fbdfc2ca2d5b506589
  • Pointer size: 131 Bytes
  • Size of remote file: 453 kB
images/crazy_cat.jpg ADDED

Git LFS Details

  • SHA256: dbbfa6caf6fd4653be260ecb7863c485d092c23e4ca60d4fc6f1bf6b8e8340ec
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB
llm_engineering/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from llm_engineering import application, domain, infrastructure
2
+ from llm_engineering.settings import settings
3
+
4
+ __all__ = ["settings", "application", "domain", "infrastructure"]
llm_engineering/application/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from . import utils
+
+ __all__ = ["utils"]
llm_engineering/application/crawlers/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .dispatcher import CrawlerDispatcher
+ from .github import GithubCrawler
+ from .linkedin import LinkedInCrawler
+ from .medium import MediumCrawler
+
+ __all__ = ["CrawlerDispatcher", "GithubCrawler", "LinkedInCrawler", "MediumCrawler"]
llm_engineering/application/crawlers/base.py ADDED
@@ -0,0 +1,66 @@
+ import time
+ from abc import ABC, abstractmethod
+ from tempfile import mkdtemp
+
+ import chromedriver_autoinstaller
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+
+ from llm_engineering.domain.documents import NoSQLBaseDocument
+
+ # Check if the current version of chromedriver exists
+ # and if it doesn't exist, download it automatically,
+ # then add chromedriver to path
+ chromedriver_autoinstaller.install()
+
+
+ class BaseCrawler(ABC):
+     model: type[NoSQLBaseDocument]
+
+     @abstractmethod
+     def extract(self, link: str, **kwargs) -> None: ...
+
+
+ class BaseSeleniumCrawler(BaseCrawler, ABC):
+     def __init__(self, scroll_limit: int = 5) -> None:
+         options = webdriver.ChromeOptions()
+
+         options.add_argument("--no-sandbox")
+         options.add_argument("--headless=new")
+         options.add_argument("--disable-dev-shm-usage")
+         options.add_argument("--log-level=3")
+         options.add_argument("--disable-popup-blocking")
+         options.add_argument("--disable-notifications")
+         options.add_argument("--disable-extensions")
+         options.add_argument("--disable-background-networking")
+         options.add_argument("--ignore-certificate-errors")
+         options.add_argument(f"--user-data-dir={mkdtemp()}")
+         options.add_argument(f"--data-path={mkdtemp()}")
+         options.add_argument(f"--disk-cache-dir={mkdtemp()}")
+         options.add_argument("--remote-debugging-port=9226")
+
+         self.set_extra_driver_options(options)
+
+         self.scroll_limit = scroll_limit
+         self.driver = webdriver.Chrome(
+             options=options,
+         )
+
+     def set_extra_driver_options(self, options: Options) -> None:
+         pass
+
+     def login(self) -> None:
+         pass
+
+     def scroll_page(self) -> None:
+         """Scroll through the page based on the scroll limit."""
+         current_scroll = 0
+         last_height = self.driver.execute_script("return document.body.scrollHeight")
+         while True:
+             self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+             time.sleep(5)
+             new_height = self.driver.execute_script("return document.body.scrollHeight")
+             if new_height == last_height or (self.scroll_limit and current_scroll >= self.scroll_limit):
+                 break
+             last_height = new_height
+             current_scroll += 1
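
BaseSeleniumCrawler is meant to be subclassed by the site-specific Selenium crawlers. Purely as an illustration (not part of this commit), a hypothetical subclass could hook into set_extra_driver_options() and reuse scroll_page(); the class name, platform string, and URL below are made up, while the ArticleDocument fields mirror the ones used by the other crawlers in this commit.

# Hypothetical subclass of BaseSeleniumCrawler; names and selectors are illustrative only.
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

from llm_engineering.domain.documents import ArticleDocument

from .base import BaseSeleniumCrawler


class ExampleBlogCrawler(BaseSeleniumCrawler):
    model = ArticleDocument

    def set_extra_driver_options(self, options: Options) -> None:
        # Hook exposed by the base class for per-site Chrome flags.
        options.add_argument("--window-size=1920,1080")

    def extract(self, link: str, **kwargs) -> None:
        self.driver.get(link)
        self.scroll_page()  # uses scroll_limit from the base constructor

        body = self.driver.find_element(By.TAG_NAME, "body").text
        self.driver.quit()

        user = kwargs["user"]
        self.model(
            content={"Content": body},
            link=link,
            platform="example.com",  # illustrative platform value
            author_id=user.id,
            author_full_name=user.full_name,
        ).save()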
llm_engineering/application/crawlers/custom_article.py ADDED
@@ -0,0 +1,54 @@
+ from urllib.parse import urlparse
+
+ from langchain_community.document_loaders import AsyncHtmlLoader
+ from langchain_community.document_transformers.html2text import Html2TextTransformer
+ from loguru import logger
+
+ from llm_engineering.domain.documents import ArticleDocument
+
+ from .base import BaseCrawler
+
+
+ class CustomArticleCrawler(BaseCrawler):
+     model = ArticleDocument
+
+     def __init__(self) -> None:
+         super().__init__()
+
+     def extract(self, link: str, **kwargs) -> None:
+         old_model = self.model.find(link=link)
+         if old_model is not None:
+             logger.info(f"Article already exists in the database: {link}")
+
+             return
+
+         logger.info(f"Starting scraping article: {link}")
+
+         loader = AsyncHtmlLoader([link])
+         docs = loader.load()
+
+         html2text = Html2TextTransformer()
+         docs_transformed = html2text.transform_documents(docs)
+         doc_transformed = docs_transformed[0]
+
+         content = {
+             "Title": doc_transformed.metadata.get("title"),
+             "Subtitle": doc_transformed.metadata.get("description"),
+             "Content": doc_transformed.page_content,
+             "language": doc_transformed.metadata.get("language"),
+         }
+
+         parsed_url = urlparse(link)
+         platform = parsed_url.netloc
+
+         user = kwargs["user"]
+         instance = self.model(
+             content=content,
+             link=link,
+             platform=platform,
+             author_id=user.id,
+             author_full_name=user.full_name,
+         )
+         instance.save()
+
+         logger.info(f"Finished scraping custom article: {link}")
llm_engineering/application/crawlers/dispatcher.py ADDED
@@ -0,0 +1,51 @@
+ import re
+ from urllib.parse import urlparse
+
+ from loguru import logger
+
+ from .base import BaseCrawler
+ from .custom_article import CustomArticleCrawler
+ from .github import GithubCrawler
+ from .linkedin import LinkedInCrawler
+ from .medium import MediumCrawler
+
+
+ class CrawlerDispatcher:
+     def __init__(self) -> None:
+         self._crawlers = {}
+
+     @classmethod
+     def build(cls) -> "CrawlerDispatcher":
+         dispatcher = cls()
+
+         return dispatcher
+
+     def register_medium(self) -> "CrawlerDispatcher":
+         self.register("https://medium.com", MediumCrawler)
+
+         return self
+
+     def register_linkedin(self) -> "CrawlerDispatcher":
+         self.register("https://linkedin.com", LinkedInCrawler)
+
+         return self
+
+     def register_github(self) -> "CrawlerDispatcher":
+         self.register("https://github.com", GithubCrawler)
+
+         return self
+
+     def register(self, domain: str, crawler: type[BaseCrawler]) -> None:
+         parsed_domain = urlparse(domain)
+         domain = parsed_domain.netloc
+
+         self._crawlers[r"https://(www\.)?{}/*".format(re.escape(domain))] = crawler
+
+     def get_crawler(self, url: str) -> BaseCrawler:
+         for pattern, crawler in self._crawlers.items():
+             if re.match(pattern, url):
+                 return crawler()
+         else:
+             logger.warning(f"No crawler found for {url}. Defaulting to CustomArticleCrawler.")
+
+             return CustomArticleCrawler()
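
Typical usage chains the register_* helpers and then resolves one crawler per URL; any domain without a registered pattern falls through to CustomArticleCrawler. A short sketch (the URLs are illustrative):

from llm_engineering.application.crawlers.dispatcher import CrawlerDispatcher

dispatcher = CrawlerDispatcher.build().register_linkedin().register_medium().register_github()

crawler = dispatcher.get_crawler("https://github.com/some-org/some-repo")
print(type(crawler).__name__)  # GithubCrawler

crawler = dispatcher.get_crawler("https://example.com/2024/some-post")
print(type(crawler).__name__)  # CustomArticleCrawler, via the fallback branch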
llm_engineering/application/crawlers/github.py ADDED
@@ -0,0 +1,68 @@
+ import os
+ import shutil
+ import subprocess
+ import tempfile
+
+ from loguru import logger
+
+ from llm_engineering.domain.documents import RepositoryDocument
+
+ from .base import BaseCrawler
+
+
+ class GithubCrawler(BaseCrawler):
+     model = RepositoryDocument
+
+     def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
+         super().__init__()
+         self._ignore = ignore
+
+     def extract(self, link: str, **kwargs) -> None:
+         old_model = self.model.find(link=link)
+         if old_model is not None:
+             logger.info(f"Repository already exists in the database: {link}")
+
+             return
+
+         logger.info(f"Starting scraping GitHub repository: {link}")
+
+         repo_name = link.rstrip("/").split("/")[-1]
+
+         local_temp = tempfile.mkdtemp()
+
+         try:
+             os.chdir(local_temp)
+             subprocess.run(["git", "clone", link])
+
+             repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])  # noqa: PTH118
+
+             tree = {}
+             for root, _, files in os.walk(repo_path):
+                 dir = root.replace(repo_path, "").lstrip("/")
+                 if dir.startswith(self._ignore):
+                     continue
+
+                 for file in files:
+                     if file.endswith(self._ignore):
+                         continue
+                     file_path = os.path.join(dir, file)  # noqa: PTH118
+                     with open(os.path.join(root, file), "r", errors="ignore") as f:  # noqa: PTH123, PTH118
+                         tree[file_path] = f.read().replace(" ", "")
+
+             user = kwargs["user"]
+             instance = self.model(
+                 content=tree,
+                 name=repo_name,
+                 link=link,
+                 platform="github",
+                 author_id=user.id,
+                 author_full_name=user.full_name,
+             )
+             instance.save()
+
+         except Exception:
+             raise
+         finally:
+             shutil.rmtree(local_temp)
+
+         logger.info(f"Finished scraping GitHub repository: {link}")