feat: upgrade to streamlit, lcel, langsmith, and huggingface deployment
- .chainlit/config.toml +0 -78
- .devcontainer/devcontainer.json +3 -4
- .devcontainer/setup.sh +95 -0
- .env.sample +10 -1
- .github/workflows/deploy-to-hf.yml +23 -0
- .streamlit/config.toml +3 -0
- .vscode/settings.json +8 -6
- README.md +34 -5
- app/app.py +287 -161
- app/prompt.py +20 -22
- app/utils.py +106 -0
- chainlit.md +0 -8
- requirements.txt +8 -8
.chainlit/config.toml
DELETED
@@ -1,78 +0,0 @@
-[project]
-# Whether to enable telemetry (default: true). No personal data is collected.
-enable_telemetry = true
-
-# List of environment variables to be provided by each user to use the app.
-user_env = []
-
-# Duration (in seconds) during which the session is saved when the connection is lost
-session_timeout = 3600
-
-# Enable third parties caching (e.g LangChain cache)
-cache = false
-
-# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
-# follow_symlink = false
-
-[features]
-# Show the prompt playground
-prompt_playground = true
-
-# Authorize users to upload files with messages
-multi_modal = true
-
-# Allows user to use speech to text
-[features.speech_to_text]
-enabled = false
-# See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
-# language = "en-US"
-
-[UI]
-# Name of the app and chatbot.
-name = "Chatbot"
-
-# Show the readme while the conversation is empty.
-show_readme_as_default = true
-
-# Description of the app and chatbot. This is used for HTML tags.
-# description = ""
-
-# Large size content are by default collapsed for a cleaner ui
-default_collapse_content = true
-
-# The default value for the expand messages settings.
-default_expand_messages = false
-
-# Hide the chain of thought details from the user in the UI.
-hide_cot = false
-
-# Link to your github repo. This will add a github button in the UI's header.
-github = "https://github.com/LinkedInLearning/hands-on-ai-building-and-deploying-llm-powered-apps-4511409"
-
-# Specify a CSS file that can be used to customize the user interface.
-# The CSS file can be served from the public directory or via an external link.
-# custom_css = "/public/test.css"
-
-# Override default MUI light theme. (Check theme.ts)
-[UI.theme.light]
-#background = "#FAFAFA"
-#paper = "#FFFFFF"
-
-[UI.theme.light.primary]
-#main = "#F80061"
-#dark = "#980039"
-#light = "#FFE7EB"
-
-# Override default MUI dark theme. (Check theme.ts)
-[UI.theme.dark]
-#background = "#FAFAFA"
-#paper = "#FFFFFF"
-
-[UI.theme.dark.primary]
-#main = "#F80061"
-#dark = "#980039"
-#light = "#FFE7EB"
-
-[meta]
-generated_by = "0.7.501"
.devcontainer/devcontainer.json
CHANGED
@@ -1,11 +1,10 @@
 {
+  "image": "mcr.microsoft.com/devcontainers/python:3.11",
   "extensions": [
     "GitHub.github-vscode-theme",
     "ms-toolsai.jupyter",
     "ms-python.python"
-    // Additional Extensions Here
   ],
-  "onCreateCommand"
+  "onCreateCommand": "bash .devcontainer/setup.sh"
 }
-
-// DevContainer Reference: https://code.visualstudio.com/docs/remote/devcontainerjson-reference
+// DevContainer Reference: https://code.visualstudio.com/docs/remote/devcontainerjson-reference
.devcontainer/setup.sh
ADDED
@@ -0,0 +1,95 @@
+#!/bin/bash
+set -euo pipefail
+
+echo "Upgrading pip..."
+pip install --upgrade pip || {
+    echo "Failed to upgrade pip"
+    exit 1
+}
+
+echo "🔧 Installing NVM..."
+export NVM_DIR="$HOME/.nvm"
+mkdir -p "$NVM_DIR"
+
+# Download and install NVM
+curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.7/install.sh | bash || {
+    echo "Failed to download NVM installer"
+    exit 1
+}
+
+# Add NVM to bashrc for future sessions
+echo 'export NVM_DIR="$HOME/.nvm"' >> ~/.bashrc
+echo '[ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh"' >> ~/.bashrc
+echo '[ -s "$NVM_DIR/bash_completion" ] && \. "$NVM_DIR/bash_completion"' >> ~/.bashrc
+
+# Load NVM for current session
+if [ -s "$NVM_DIR/nvm.sh" ]; then
+    \. "$NVM_DIR/nvm.sh"
+    echo "NVM loaded successfully"
+else
+    echo "NVM script not found at $NVM_DIR/nvm.sh"
+    exit 1
+fi
+
+# Verify NVM is available
+if ! command -v nvm &> /dev/null; then
+    echo "NVM command not found after sourcing. Trying alternative approach..."
+    # Try to source it with bash explicitly
+    bash -c "source $NVM_DIR/nvm.sh && nvm --version" || {
+        echo "Failed to verify NVM installation"
+        exit 1
+    }
+fi
+
+echo "📦 Installing Node.js LTS..."
+# Run nvm commands in a bash subshell to ensure proper environment
+bash -c "source $NVM_DIR/nvm.sh && nvm install --lts" || {
+    echo "Failed to install Node.js"
+    exit 1
+}
+
+# Run nvm use in a bash subshell
+bash -c "source $NVM_DIR/nvm.sh && nvm use --lts" || {
+    echo "Failed to use Node.js LTS"
+    exit 1
+}
+
+echo "🧰 Installing latest npm..."
+# Run npm in a bash subshell to ensure node is available
+bash -c "source $NVM_DIR/nvm.sh && nvm use --lts && npm install -g npm@latest" || {
+    echo "Failed to update npm"
+    exit 1
+}
+
+echo "✅ NVM, Node.js, and npm installed successfully."
+
+if [ -f requirements.txt ]; then
+    echo "Installing requirements..."
+    pip install -r requirements.txt || {
+        echo "Failed to install requirements"
+        exit 1
+    }
+else
+    echo "No requirements.txt found, skipping package installation"
+fi
+
+echo "Setting up terminal prompt..."
+cat << 'EOF' >> ~/.bashrc
+# Function to get git branch
+parse_git_branch() {
+    git branch 2> /dev/null | sed -e '/^[^*]/d' -e 's/* \(.*\)/ (\1)/'
+}
+
+# Color definitions
+BLUE='\[\033[34m\]'
+GREEN='\[\033[32m\]'
+YELLOW='\[\033[33m\]'
+RESET='\[\033[00m\]'
+
+# Set prompt with current directory and git branch
+export PS1="${BLUE}\W${RESET}${YELLOW}\$(parse_git_branch)${RESET}${GREEN} $ ${RESET}"
+EOF
+
+export ENABLE_BACKGROUND_TASKS=1
+
+echo "Setup completed successfully!"
.env.sample
CHANGED
@@ -1,2 +1,11 @@
 ALLOW_RESET=TRUE
-OPENAI_API_KEY="sk-your-openai-api-key"
+OPENAI_API_KEY="sk-your-openai-api-key"
+LANGSMITH_TRACING=true
+LANGSMITH_ENDPOINT="https://api.smith.langchain.com"
+##########################################################################
+# Exercise 3:
+# Remember to put in your LANGSMITH_API_KEY
+#
+##########################################################################
+LANGSMITH_API_KEY="<your-api-key>"
+LANGSMITH_PROJECT="linkedin_learning"
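Aside: a quick way to confirm these variables are wired up before launching the app — a minimal sketch, assuming you have copied `.env.sample` to `.env` in the working directory (python-dotenv is already in requirements.txt):

import os
from dotenv import load_dotenv

# Read .env into the process environment; LangSmith tracing is
# configured entirely through these variables.
load_dotenv()

for key in ("LANGSMITH_TRACING", "LANGSMITH_ENDPOINT",
            "LANGSMITH_API_KEY", "LANGSMITH_PROJECT"):
    print(key, "->", "set" if os.getenv(key) else "MISSING")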
.github/workflows/deploy-to-hf.yml
ADDED
@@ -0,0 +1,23 @@
+name: Deploy to HuggingFace Spaces
+
+on:
+  push:
+    branches: [ main ]
+
+  workflow_dispatch:
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          lfs: true
+
+      - name: Push to HuggingFace Spaces
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          git push https://HF_USERNAME:[email protected]/spaces/HF_USERNAME/SPACE_NAME main
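Note that `HF_USERNAME` and `SPACE_NAME` are literal placeholders the learner must substitute. For the manual deployment route the README recommends, an alternative to a raw `git push` is the `huggingface_hub` client — a rough sketch only, as an assumption: the package is not pinned in requirements.txt, and the Space id and token below are placeholders:

from huggingface_hub import HfApi

# Placeholder token and Space id - substitute your own
api = HfApi(token="hf_your_token_here")

# Upload the working tree to a Streamlit Space
api.upload_folder(
    folder_path=".",
    repo_id="your-username/your-space",
    repo_type="space",
)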
.streamlit/config.toml
ADDED
@@ -0,0 +1,3 @@
+[server]
+runOnSave = true
+fileWatcherType = "auto"
.vscode/settings.json
CHANGED
@@ -3,21 +3,23 @@
   "editor.cursorBlinking": "solid",
   "editor.fontFamily": "ui-monospace, Menlo, Monaco, 'Cascadia Mono', 'Segoe UI Mono', 'Roboto Mono', 'Oxygen Mono', 'Ubuntu Monospace', 'Source Code Pro', 'Fira Mono', 'Droid Sans Mono', 'Courier New', monospace",
   "editor.fontLigatures": false,
-  "editor.fontSize":
+  "editor.fontSize": 14,
   "editor.formatOnPaste": true,
   "editor.formatOnSave": true,
   "editor.lineNumbers": "on",
   "editor.matchBrackets": "always",
   "editor.minimap.enabled": false,
   "editor.smoothScrolling": true,
-  "editor.tabSize":
+  "editor.tabSize": 4,
   "editor.useTabStops": true,
   "emmet.triggerExpansionOnTab": true,
-  "explorer.openEditors.visible": 0,
   "files.autoSave": "afterDelay",
   "screencastMode.onlyKeyboardShortcuts": true,
-  "terminal.integrated.fontSize":
+  "terminal.integrated.fontSize": 14,
   "workbench.colorTheme": "Visual Studio Dark",
   "workbench.fontAliasing": "antialiased",
-  "workbench.statusBar.visible": true
-
+  "workbench.statusBar.visible": true,
+  "workbench.tree.indent": 8,
+  "workbench.tree.renderIndentGuides": "always",
+  "workbench.fontSize": 14
+}
README.md
CHANGED
@@ -1,16 +1,45 @@
+---
+license: other
+title: Test
+sdk: streamlit
+sdk_version: 1.46.0
+emoji: 📚
+colorFrom: green
+colorTo: green
+pinned: false
+app_file: app/app.py
+---
 # Hands-On AI: Building and Deploying LLM-Powered Apps
 This is the repository for the LinkedIn Learning course `Hands-On AI: Building and Deploying LLM-Powered Apps`. The full course is available from [LinkedIn Learning][lil-course-url].
 
 _See the readme file in the main branch for updated instructions and information._
-## Lab6: Prompt Engineering
-With the prompt templates extracted from the code, we can iterate on the prompts to fix the problem that we have observed!
 
+## Lab 7: Deploying the Application to Hugging Face and Tracing Outputs on LangSmith
+
+Now that we have the application up and running, let's deploy it to Hugging Face Spaces and trace the application outputs on LangSmith. This way we can proudly show our work and keep a record of what our users are doing with our application!
+
+Before that happens, please register accounts on Hugging Face and LangSmith.
+
+> NOTE: [Hugging Face Spaces](https://huggingface.co/pricing) provides free tier access starting at $0.
+> NOTE: [LangSmith](https://www.langchain.com/pricing-langsmith) is a paid application with a Developer tier license that gives access to 1 user and 5k traces per month.
 
 ## Exercises
 
+After registering accounts on Hugging Face and LangSmith, grab the API keys and let's get to work.
+
+Oh, and currently we have the OpenAI API key baked into our application. Let's make sure that users of our application need to input their own key to use it!
+
+And then we will set up CI/CD for automated deployment.
+
+Complete the exercises in `app/app.py` and `.env` (see `.env.sample`). Make sure you follow the instructions here: [https://huggingface.co/docs/hub/en/spaces-github-actions](https://huggingface.co/docs/hub/en/spaces-github-actions) and here: [Hugging Face Hub: Important Git Authentication Changes](https://huggingface.co/blog/password-git-deprecation).
+
+After deployment, please remember to go into the Hugging Face Space settings and set up environment variables such as `LANGSMITH_API_KEY`.
+
+> NOTE: To reduce the scope, we will deploy manually to Hugging Face only. We prepared the `.github/workflows/deploy-to-hf.yml` workflow as an extracurricular exercise for the learner.
 
 ## References
 
-- [
+- [Hugging Face Spaces](https://huggingface.co/pricing)
+- [LangSmith](https://www.langchain.com/pricing-langsmith)
+- [Hugging Face Spaces GitHub Actions](https://huggingface.co/docs/hub/en/spaces-github-actions)
+- [Hugging Face Hub: Important Git Authentication Changes](https://huggingface.co/blog/password-git-deprecation)
app/app.py
CHANGED
@@ -1,196 +1,322 @@
-import
-
-sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
-
-from tempfile import NamedTemporaryFile
-from typing import List
-
-import chainlit as cl
-from chainlit.types import AskFileResponse
-import chromadb
-from chromadb.config import Settings
-from langchain.chains import RetrievalQAWithSourcesChain
-from langchain.chat_models import ChatOpenAI
-from langchain.document_loaders import PDFPlumberLoader
-from langchain.embeddings.openai import OpenAIEmbeddings
+import logging
+from typing import List, Dict, Any, Tuple
+
+from dotenv import load_dotenv
+from langchain.callbacks.base import BaseCallbackHandler
 from langchain.schema import Document
-from
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import Chroma
+from langchain_openai import ChatOpenAI
 from langchain.vectorstores.base import VectorStore
 
-from
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+from langchain.memory import ConversationBufferWindowMemory
+from langchain_core.messages import HumanMessage, AIMessage
+import streamlit as st
+
+from utils import process_file, create_search_engine
+from prompt import PROMPT, WELCOME_MESSAGE
+
+load_dotenv()
 
-def process_file(*, file: AskFileResponse) -> List[Document]:
-    """Processes one PDF file from a Chainlit AskFileResponse object by first
-    loading the PDF document and then chunk it into sub documents. Only
-    supports PDF files.
-
-        file (AskFileResponse): input file to be processed
-
-    Raises:
-        ValueError: when we fail to process PDF files. We consider PDF file
-            processing failure when there's no text returned. For example, PDFs
-            with only image contents, corrupted PDFs, etc.
-    ""
-
-    loader = PDFPlumberLoader(tempfile.name)
-    documents = loader.load()
-
-def
-) -> VectorStore:
-    """Takes a list of Langchain Documents and an embedding model API wrapper
-    and build a search index using a VectorStore.
-
-    Args:
-        the
-        embeddings (Embeddings): encoder model API used to calculate embedding
-
-    Returns:
-    """
-    client_settings = Settings(allow_reset=True, anonymized_telemetry=False)
-
-    # Reset the search engine to ensure we don't use old copies.
-    # NOTE: we do not need this for production
-    search_engine = Chroma(client=client, client_settings=client_settings)
-    search_engine._client.reset()
-    search_engine = Chroma.from_documents(
-        client=client,
-        documents=docs,
-        embedding=embeddings,
-        client_settings=client_settings,
-    )
-
-    None
-    """
-    # Asking user to to upload a PDF to chat with
-    files = None
-    while files is None:
-        files = await cl.AskFileMessage(
-            content="Please Upload the PDF file you want to chat with...",
-            accept=["application/pdf"],
-            max_size_mb=20,
-        ).send()
-    file = files[0]
-
-    # Process and save data in the user session
-    msg = cl.Message(content=f"Processing `{file.name}`...")
-    await msg.send()
-
-    docs = process_file(file=file)
-    cl.user_session.set("docs", docs)
-    msg.content = f"`{file.name}` processed. Loading ..."
-    await msg.update()
-
-    # Indexing documents into our search engine
-    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
-    try:
-        search_engine = await cl.make_async(create_search_engine)(
-            docs=docs, embeddings=embeddings
-        )
-    except Exception as e:
-        await cl.Message(content=f"Error: {e}").send()
-        raise SystemError
-    msg.content = f"`{file.name}` loaded. You can now ask questions!"
-    await msg.update()
-
-    model = ChatOpenAI(
-        model="gpt-3.5-turbo-16k-0613", temperature=0, streaming=True
-    )
-
-    chain = RetrievalQAWithSourcesChain.from_chain_type(
-        llm=model,
-        chain_type="stuff",
-        retriever=search_engine.as_retriever(max_tokens_limit=4097),
-        chain_type_kwargs={"prompt": PROMPT, "document_prompt": EXAMPLE_PROMPT},
-    )
-
-    cl.user_session.set("chain", chain)
-
-    response = await chain.acall(
-        message.content,
-        callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)],
-    )
-    answer = response["answer"]
-    sources = response["sources"].strip()
-
-    metadatas = [doc.metadata for doc in docs]
-    all_sources = [m["source"] for m in metadatas]
-
-        # Get the index of the source
-        try:
-            index = all_sources.index(source_name)
-        except ValueError:
-            continue
-        text = docs[index].page_content
-        found_sources.append(source_name)
-        # Create the text element referenced in the message
-        source_elements.append(cl.Text(content=text, name=source_name))
-
-    if found_sources:
-        answer += f"\nSources: {', '.join(found_sources)}"
-    else:
+# Page configuration
+st.set_page_config(
+    page_title="PDF Q&A Assistant",
+    page_icon="📚",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
+
+# Initialize session state
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+if "chain" not in st.session_state:
+    st.session_state.chain = None
+if "vector_store" not in st.session_state:
+    st.session_state.vector_store = None
+if "retriever" not in st.session_state:
+    st.session_state.retriever = None
+if "docs" not in st.session_state:
+    st.session_state.docs = None
+if "processed_file" not in st.session_state:
+    st.session_state.processed_file = None
+if "openai_api_key" not in st.session_state:
+    st.session_state.openai_api_key = None
+
+
+def create_qa_chain(vector_store: VectorStore, api_key: str) -> Tuple[Any, Any]:
+    """Create the QA chain with the vector store using LCEL.
+
+    Args:
+        vector_store: The vector store containing document embeddings
+        api_key: OpenAI API key
+
+    Returns:
+        Tuple containing:
+        - chain: The LCEL chain for question answering
+        - retriever: The document retriever
+    """
+    llm = ChatOpenAI(
+        model='gpt-4.1-mini',
+        temperature=0,
+        streaming=True,
+        max_tokens=8192,
+        api_key=api_key
+    )
+
+    # Create retriever
+    retriever = vector_store.as_retriever(search_kwargs={"k": 5})
+
+    def format_docs(docs: List[Document]) -> str:
+        """Format retrieved documents for the prompt.
+
+        Args:
+            docs: List of retrieved documents
+
+        Returns:
+            Formatted string containing document content and sources
+        """
+        formatted = []
+        for doc in docs:
+            content = doc.page_content
+            source = doc.metadata.get("source", "unknown")
+            formatted.append(f"Content: {content}\nSource: {source}")
+        return "\n\n".join(formatted)
+
+    def get_question(inputs: Dict[str, Any]) -> str:
+        return inputs["question"]
+
+    def get_chat_history(inputs: Dict[str, Any]) -> List[Any]:
+        return inputs["chat_history"]
+
+    chain = (
+        {
+            "context": get_question | retriever | format_docs,
+            "question": get_question,
+            "chat_history": get_chat_history
+        }
+        | PROMPT
+        | llm
+        | StrOutputParser()
+    )
+
+    return chain, retriever
+
+
+def format_answer_with_sources(response: str, retrieved_docs: List[Document]) -> Tuple[str, List[Dict[str, str]]]:
+    """Format the answer with source information.
+
+    Args:
+        response: The LLM response containing the answer
+        retrieved_docs: List of documents retrieved from the vector store
+
+    Returns:
+        Tuple containing:
+        - answer: The formatted answer string
+        - source_contents: List of source dictionaries with name and content
+    """
+    answer = response
+    source_contents = []
+
+    sources_text = ""
+    if "SOURCES:" in answer:
+        parts = answer.split("SOURCES:")
+        if len(parts) > 1:
+            sources_text = parts[1].strip()
+
+    if sources_text and retrieved_docs:
+        source_map = {}
+        for doc in retrieved_docs:
+            source_name = doc.metadata.get("source", "unknown")
+            source_map[source_name] = doc.page_content
+
+        found_sources = []
+        for source in sources_text.split(","):
+            source_name = source.strip().replace(".", "")
+            if source_name in source_map:
+                found_sources.append(source_name)
+                source_contents.append({
+                    "name": source_name,
+                    "content": source_map[source_name]
+                })
+
+    return answer, source_contents
+
+
+def get_chat_history_messages(messages: List[Dict[str, str]]) -> List[Any]:
+    """Convert Streamlit messages to LangChain message format.
+
+    Args:
+        messages: List of Streamlit message dictionaries with 'role' and 'content' keys
+
+    Returns:
+        List of LangChain message objects (HumanMessage or AIMessage)
+    """
+    chat_history = []
+    for msg in messages:
+        if msg["role"] == "user":
+            chat_history.append(HumanMessage(content=msg["content"]))
+        elif msg["role"] == "assistant":
+            chat_history.append(AIMessage(content=msg["content"]))
+    return chat_history
+
+
+def main() -> None:
+    """Main Streamlit application function for PDF Q&A Assistant.
+
+    Handles file upload, processing, and chat interface for asking questions
+    about uploaded PDF documents using RAG (Retrieval Augmented Generation).
+    """
+    st.title("📚 PDF Q&A Assistant")
+    st.markdown(WELCOME_MESSAGE)
+
+    # Sidebar for file upload
+    with st.sidebar:
+        st.header("🔑 API Configuration")
+        ##########################################################################
+        # Exercise 1:
+        # Let's make sure we have users input their OpenAI API key.
+        # Remember to store it in st.session_state.openai_api_key so
+        # that we can use it later in the application.
+        ##########################################################################
+        api_key = st.text_input(
+            "OpenAI API Key",
+            type="password",
+            value=st.session_state.openai_api_key if st.session_state.openai_api_key else "",
+            help="Enter your OpenAI API key to use the application"
+        )
+
+        if api_key:
+            st.session_state.openai_api_key = api_key
+            st.success("✅ API Key configured")
+        else:
+            st.warning("⚠️ Please enter your OpenAI API key to continue")
+        ##########################################################################
+        st.divider()
+
+        st.header("📤 Upload PDF")
+        uploaded_file = st.file_uploader(
+            "Choose a PDF file",
+            type=["pdf"],
+            help="Upload a PDF file to ask questions about its content",
+            disabled=not st.session_state.openai_api_key
+        )
+
+        if uploaded_file is not None and st.session_state.openai_api_key:
+            if st.session_state.processed_file != uploaded_file.name:
+                with st.status("Processing PDF...", expanded=True) as status:
+                    st.write("📄 Reading PDF content...")
+
+                    try:
+                        docs = process_file(
+                            uploaded_file.getvalue(), "application/pdf")
+                        st.write(f"✅ Extracted {len(docs)} text chunks")
+
+                        st.write("🔍 Creating vector store...")
+                        vector_store, _ = create_search_engine(
+                            uploaded_file.getvalue(), "application/pdf", api_key=st.session_state.openai_api_key)
+
+                        st.session_state.vector_store = vector_store
+                        st.session_state.docs = docs
+                        st.session_state.processed_file = uploaded_file.name
+
+                        status.update(
+                            label="✅ PDF processed successfully!", state="complete")
+
+                    except Exception as e:
+                        status.update(
+                            label="❌ Error processing PDF", state="error")
+                        st.error(f"Error: {str(e)}")
+                        return
+
+            st.success(f"📄 **{uploaded_file.name}** is ready for questions!")
+
+    if st.session_state.vector_store is not None and st.session_state.openai_api_key:
+        st.write("🔧 Setting up Q&A chain...")
+        chain, retriever = create_qa_chain(
+            st.session_state.vector_store, st.session_state.openai_api_key)
+
+        # Store in session state
+        st.session_state.chain = chain
+        st.session_state.retriever = retriever
+
+    # Chat interface
+    if st.session_state.chain is not None:
+        # Display chat messages
+        for message in st.session_state.messages:
+            with st.chat_message(message["role"]):
+                st.text(message["content"])
+
+                # Display sources if available
+                if "sources" in message and message["sources"]:
+                    for source in message["sources"]:
+                        with st.expander(f"📄 Source: {source['name']}"):
+                            st.text(source["content"])
+
+        # Chat input
+        if prompt := st.chat_input("Ask a question about the PDF..."):
+            # Add user message to chat history
+            st.session_state.messages.append(
+                {"role": "user", "content": prompt})
+
+            # Display user message
+            with st.chat_message("user"):
+                st.text(prompt)
+
+            # Generate response
+            with st.chat_message("assistant"):
+                with st.spinner("Thinking..."):
+                    try:
+                        chat_history = get_chat_history_messages(
+                            st.session_state.messages)
+
+                        # Get retrieved documents for source processing
+                        retrieved_docs = st.session_state.retriever.invoke(
+                            prompt)
+
+                        # Invoke the LCEL chain
+                        response = st.session_state.chain.invoke({
+                            "question": prompt,
+                            "chat_history": chat_history
+                        })
+
+                        answer, source_contents = format_answer_with_sources(
+                            response, retrieved_docs
+                        )
+
+                        st.text(answer)
+
+                        # Display sources
+                        if source_contents:
+                            for source in source_contents:
+                                with st.expander(f"📄 Source: {source['name']}"):
+                                    st.text(source["content"])
+
+                        # Add assistant response to chat history
+                        st.session_state.messages.append({
+                            "role": "assistant",
+                            "content": answer,
+                            "sources": source_contents
+                        })
+
+                    except Exception as e:
+                        error_msg = f"Error generating response: {str(e)}"
+                        logging.error(e, exc_info=True)
+                        st.error(error_msg)
+                        st.session_state.messages.append({
+                            "role": "assistant",
+                            "content": error_msg
+                        })
+
+    else:
+        if not st.session_state.openai_api_key:
+            st.info(
+                "🔑 Please enter your OpenAI API key in the sidebar to get started!")
+        else:
+            st.info("📄 Please upload a PDF file to get started!")
+
+
+if __name__ == "__main__":
+    main()
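The dict at the head of the pipeline in `create_qa_chain` is the core LCEL idiom: each value is invoked with the same chain input, and the merged keys become the prompt variables. A stripped-down sketch of the same pattern — the model name and key here are placeholders, not the app's configuration:

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

prompt = ChatPromptTemplate.from_template("Summarize in one line: {text}")
llm = ChatOpenAI(model="gpt-4.1-mini", api_key="sk-placeholder")

# The dict is coerced into a RunnableParallel: each value runs on the
# chain input, and the resulting keys feed the prompt template.
chain = {"text": lambda x: x["text"]} | prompt | llm | StrOutputParser()

# chain.invoke({"text": "LCEL composes runnables with the | operator."})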
app/prompt.py
CHANGED
@@ -1,28 +1,26 @@
-##############################################################################
-# Exercise 1:
-# Please utilize Chainlit's app playground for prompt engineering and
-# experimentation. Once done, modify the prompts template below with your
-# newly developed prompts.
-##############################################################################
-from langchain.prompts import PromptTemplate
-
-
-
-=========
-{summaries}
-=========
-FINAL ANSWER:"""
-
-    template=template, input_variables=["summaries", "question"]
-)
-
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+
+WELCOME_MESSAGE = """\
+Welcome to Introduction to LLM App Development Sample PDF QA Application!
+To get started:
+1. Upload a PDF or text file
+2. Ask any question about the file!
+"""
+
+PROMPT = ChatPromptTemplate.from_messages(
+    [
+        (
+            "system",
+            """Please act as an expert financial analyst when you answer the questions and pay special attention to the financial statements. Operating margin is also known as op margin and is calculated by dividing operating income by revenue.
+
+Given the following extracted parts of a long document and the conversation history, create a final answer with references ("SOURCES"). If you don't know the answer, just say that you don't know. Don't try to make up an answer.
+
+ALWAYS return a "SOURCES" field in your answer, with the format "SOURCES: <source1>, <source2>, <source3>, ...".
+
+Context from documents:
+{context}"""
+        ),
+        MessagesPlaceholder(variable_name="chat_history"),
+        ("human", "{question}")
+    ]
 )
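To see exactly what the model receives, `ChatPromptTemplate.format_messages` renders the template without calling the LLM. A quick sketch, assuming `PROMPT` above is importable — the context string and history are fabricated for illustration:

from langchain_core.messages import AIMessage, HumanMessage
from prompt import PROMPT

messages = PROMPT.format_messages(
    context="Content: Revenue was $10M...\nSource: source_0",
    chat_history=[HumanMessage(content="Hi"), AIMessage(content="Hello!")],
    question="What was the operating margin?",
)
for m in messages:
    # One system message, the replayed history, then the human question
    print(type(m).__name__, "->", m.content[:60])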
app/utils.py
ADDED
@@ -0,0 +1,106 @@
+import chromadb
+import tempfile
+import os
+from chromadb.config import Settings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PDFPlumberLoader
+from langchain_chroma import Chroma
+from langchain.vectorstores.base import VectorStore
+from langchain_openai import OpenAIEmbeddings
+
+
+def process_file(file_data, file_type: str = None) -> list:
+    """
+    Process a PDF file and split it into documents.
+
+    Args:
+        file_data: Either a file path (str) or file bytes
+        file_type: Optional file type, defaults to checking if PDF
+
+    Returns:
+        List of processed documents
+
+    Raises:
+        TypeError: If file is not a PDF
+        ValueError: If PDF parsing fails
+    """
+    if file_type and file_type != "application/pdf":
+        raise TypeError("Only PDF files are supported")
+
+    # Handle both file path and file bytes
+    if isinstance(file_data, bytes):
+        # Create a temporary file for the PDF bytes
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+            tmp_file.write(file_data)
+            tmp_file_path = tmp_file.name
+
+        try:
+            loader = PDFPlumberLoader(tmp_file_path)
+            documents = loader.load()
+        finally:
+            # Clean up the temporary file
+            os.unlink(tmp_file_path)
+    else:
+        # Assume it's a file path
+        loader = PDFPlumberLoader(file_data)
+        documents = loader.load()
+
+    # Clean up extracted text to fix common PDF extraction issues
+    for doc in documents:
+        # Fix common spacing issues from PDF extraction
+        doc.page_content = doc.page_content.replace('\n', ' ')  # Replace newlines with spaces
+        doc.page_content = ' '.join(doc.page_content.split())  # Normalize whitespace
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=3000,
+        chunk_overlap=100,
+        separators=["\n\n", "\n", " ", ""]
+    )
+    docs = text_splitter.split_documents(documents)
+    for i, doc in enumerate(docs):
+        doc.metadata["source"] = f"source_{i}"
+    if not docs:
+        raise ValueError("PDF file parsing failed.")
+    return docs
+
+
+def create_search_engine(file_data, file_type: str = None, api_key: str = None) -> tuple[VectorStore, list]:
+    """
+    Create a vector store search engine from a PDF file.
+
+    Args:
+        file_data: Either a file path (str) or file bytes
+        file_type: Optional file type for validation
+        api_key: OpenAI API key for embeddings
+
+    Returns:
+        Tuple of (search_engine, docs) where:
+        - search_engine: The Chroma vector store
+        - docs: The processed documents
+    """
+    # Process the file
+    docs = process_file(file_data, file_type)
+
+    encoder = OpenAIEmbeddings(model="text-embedding-3-small", api_key=api_key)
+
+    # Initialize Chromadb client and settings, reset to ensure we get a clean
+    # search engine
+    client = chromadb.EphemeralClient()
+    client_settings = Settings(
+        allow_reset=True,
+        anonymized_telemetry=False
+    )
+    search_engine = Chroma(
+        client=client,
+        client_settings=client_settings
+    )
+    search_engine._client.reset()
+
+    search_engine = Chroma.from_documents(
+        client=client,
+        documents=docs,
+        embedding=encoder,
+        client_settings=client_settings
+    )
+
+    return search_engine, docs
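A local smoke test for these helpers might look like the following sketch — `sample.pdf` and the key are placeholders, and the embedding step does call the OpenAI API:

from utils import create_search_engine

with open("sample.pdf", "rb") as f:
    engine, docs = create_search_engine(f.read(), "application/pdf",
                                        api_key="sk-placeholder")

# Retrieve the two chunks most similar to a query
retriever = engine.as_retriever(search_kwargs={"k": 2})
for doc in retriever.invoke("What is this document about?"):
    print(doc.metadata["source"], doc.page_content[:80])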
chainlit.md
DELETED
@@ -1,8 +0,0 @@
-# Welcome to your PDF QA Sample Application! 🚀🤖
-
-Hi Team! 👋 Congratulations on launching your first LLM Application. This application is build using OpenAI, Langchain, Chainlit, and Chroma. The goal of this application is to provite a quick overview of the most basic archetype of LLM application and the prototyping and debugging environment.
-
-## Useful Links 🔗
-
-- **Langchain Documentation:** Get started with [Langchain Documentation](https://python.langchain.com/) 📚
-- **Chainlit Documentation:** Get started with [Chainlit Documentation](https://docs.chainlit.io) 📚
requirements.txt
CHANGED
@@ -1,9 +1,9 @@
 # Specify Python package requirements for your project here (e.g., Mako==1.1.1). If your project doesn't require these, you can leave this file unchanged or delete it.
-
-langchain
-
-
-pdfplumber
-chromadb
-
-
+langchain>=0.3.25,<1.0.0
+langchain-openai>=0.0.5,<1.0.0
+langchain-chroma>=0.2.4,<1.0.0
+streamlit>=1.31.0
+pdfplumber>=0.11.6
+chromadb>=1.0.10
+ruff==0.11.11
+python-dotenv>=1.0.0