feat: 02_06e
- README.md +5 -7
- app/app.py +49 -20
README.md
CHANGED
@@ -2,11 +2,10 @@
 This is the repository for the LinkedIn Learning course `Hands-On AI: Building and Deploying LLM-Powered Apps`. The full course is available from [LinkedIn Learning][lil-course-url].
 
 _See the readme file in the main branch for updated instructions and information._
 
-## 
-In this lab, we will use the built-in PDF loading and parsing connectors inside Langchain, load the PDF, and chunk the PDF into individual pieces with their associated metadata.
+## Lab4: Indexing Documents into Vector Database
+
+In the previous lab, we enabled document loading and chunked the documents into smaller sub-documents. Now we need to index them into our search engine's vector database so that we can build our Chat with PDF application using the RAG (Retrieval Augmented Generation) pattern.
+
+In this lab, we will add OpenAI's embedding model and index the documents we chunked in the previous section into a vector database. We will be using [Chroma](https://www.trychroma.com/) as the vector database of choice. Chroma is a lightweight embedding database that can live in memory, similar to SQLite.
 
 ## Exercises
 
@@ -33,6 +32,5 @@ chainlit run app/app.py -w
 
 ## References
 
-- [Langchain
-- [Langchain
-- [Chainlit Ask File Message](https://docs.chainlit.io/api-reference/ask/ask-for-file)
+- [Langchain Embedding Models](https://python.langchain.com/docs/modules/data_connection/text_embedding/)
+- [ChromaDB Langchain Integration](https://docs.trychroma.com/integrations/langchain)
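For orientation, the indexing flow this lab builds can be sketched end to end in a few lines. This is a minimal sketch, assuming the langchain and chromadb versions pinned in this repo and an `OPENAI_API_KEY` in the environment; the sample chunks are made up.

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.vectorstores import Chroma

# Made-up chunks standing in for the output of the PDF chunking step.
chunks = [
    Document(page_content="Chroma is a lightweight embedding database.",
             metadata={"source": "source_0"}),
    Document(page_content="RAG retrieves relevant chunks before generation.",
             metadata={"source": "source_1"}),
]

# Encode each chunk with OpenAI's embedding model and index it into Chroma.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
search_engine = Chroma.from_documents(documents=chunks, embedding=embeddings)

# The index can now answer similarity queries, the building block of RAG.
print(search_engine.similarity_search("What is Chroma?", k=1))
```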
app/app.py
CHANGED
@@ -18,6 +18,7 @@ from langchain.document_loaders import PDFPlumberLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.prompts import ChatPromptTemplate
 from langchain.schema import Document, StrOutputParser
+from langchain.schema.embeddings import Embeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.vectorstores.base import VectorStore
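The new `Embeddings` import is the abstract interface that `create_search_engine` will accept, so the function is not tied to OpenAI specifically. Any class implementing `embed_documents` and `embed_query` qualifies; the toy implementation below is purely illustrative and not part of the course code.

```python
from typing import List

from langchain.schema.embeddings import Embeddings


class FakeEmbeddings(Embeddings):
    """Illustrative stand-in that maps every text to a fixed zero vector
    (1536 dimensions, matching text-embedding-ada-002)."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [[0.0] * 1536 for _ in texts]

    def embed_query(self, text: str) -> List[float]:
        return [0.0] * 1536
```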
@@ -55,8 +56,7 @@ def process_file(*, file: AskFileResponse) -> List[Document]:
     )
     docs = text_splitter.split_documents(documents)
 
-    #
-    # source document it is.
+    # Add a source_id into each chunk's metadata to denote which source document it is
     for i, doc in enumerate(docs):
         doc.metadata["source"] = f"source_{i}"
 
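A quick standalone illustration of what the tagging produces (the chunks here are made up); these per-chunk ids are what later let the app point back to the chunk that supported an answer.

```python
from langchain.schema import Document

docs = [Document(page_content="chunk one"), Document(page_content="chunk two")]
for i, doc in enumerate(docs):
    doc.metadata["source"] = f"source_{i}"

print([d.metadata for d in docs])
# [{'source': 'source_0'}, {'source': 'source_1'}]
```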
@@ -66,35 +66,44 @@ def process_file(*, file: AskFileResponse) -> List[Document]:
     return docs
 
 
-def create_search_engine(*,
-    model
+def create_search_engine(*, docs: List[Document], embeddings: Embeddings) -> VectorStore:
+    """Takes a list of Langchain Documents and an embedding model API wrapper
+    and builds a search index using a VectorStore.
+
+    Args:
+        docs (List[Document]): List of Langchain Documents to be indexed into
+            the search engine.
+        embeddings (Embeddings): encoder model API used to calculate embeddings
+
+    Returns:
+        VectorStore: Langchain VectorStore
+    """
+    # Initialize the Chromadb client to enable resetting and disable telemetry
     client = chromadb.EphemeralClient()
     client_settings = Settings(
         allow_reset=True,
         anonymized_telemetry=False
     )
+
+    # Reset the search engine to ensure we don't use old copies.
+    # NOTE: we do not need this for production
     search_engine = Chroma(
         client=client,
         client_settings=client_settings
     )
     search_engine._client.reset()
+    ##########################################################################
+    # Exercise 1b:
+    # Now that we have defined our encoder model and initialized our search
+    # engine client, please create the search engine from documents.
+    ##########################################################################
     search_engine = Chroma.from_documents(
         client=client,
         documents=docs,
-        embedding=
+        embedding=embeddings,
         client_settings=client_settings
     )
+    ##########################################################################
 
     return search_engine
 
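Outside of Chainlit, the finished function would be exercised roughly like this (the sample document is made up, and an `OPENAI_API_KEY` is assumed). Note the design choice: `chromadb.EphemeralClient()` keeps the whole index in memory, and the reset guarantees each session starts from a clean index.

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document

docs = [
    Document(page_content="An example chunk about vector search.",
             metadata={"source": "source_0"}),
]
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
search_engine = create_search_engine(docs=docs, embeddings=embeddings)
print(search_engine.similarity_search("vector search", k=1))
```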
@@ -107,7 +116,7 @@ async def on_chat_start():
     Returns:
         None
     """
-
+    # Ask the user to upload a PDF to chat with
     files = None
     while files is None:
         files = await cl.AskFileMessage(
@@ -117,15 +126,35 @@ async def on_chat_start():
     ).send()
     file = files[0]
 
-    #
+    # Process the file and save the data in the user session
     msg = cl.Message(content=f"Processing `{file.name}`...")
     await msg.send()
-
+
+    docs = process_file(file=file)
+    cl.user_session.set("docs", docs)
+    msg.content = f"`{file.name}` processed. Loading ..."
+    await msg.update()
+
+    # Index the documents into our search engine
+    ##########################################################################
+    # Exercise 1a:
+    # Add OpenAI's embedding model as the encoder. The most standard one to
+    # use is text-embedding-ada-002.
+    ##########################################################################
+    embeddings = OpenAIEmbeddings(
+        model="text-embedding-ada-002"
+    )
+    ##########################################################################
     try:
-        search_engine = await cl.make_async(create_search_engine)(
+        search_engine = await cl.make_async(create_search_engine)(
+            docs=docs,
+            embeddings=embeddings
+        )
     except Exception as e:
         await cl.Message(content=f"Error: {e}").send()
         raise SystemError
+    msg.content = f"`{file.name}` loaded. You can now ask questions!"
+    await msg.update()
 
     model = ChatOpenAI(
         model="gpt-3.5-turbo-16k-0613",
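`cl.make_async` wraps a blocking function so it can be awaited without stalling Chainlit's event loop while the documents are embedded and indexed. A minimal sketch of the pattern, independent of the app code and assuming only Chainlit's documented `make_async` helper:

```python
import asyncio
import time

import chainlit as cl


def slow_indexing(seconds: float) -> str:
    time.sleep(seconds)  # stands in for the embedding + indexing work
    return "indexed"


async def demo() -> None:
    # The wrapped call runs in a worker thread and is awaitable,
    # so the event loop stays responsive while the sync work happens.
    result = await cl.make_async(slow_indexing)(0.5)
    print(result)


asyncio.run(demo())
```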
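With the search engine built and the docs stored in the user session, a natural next step (not part of this commit) is to expose the vector store as a retriever for the RAG chain. A hypothetical sketch, assuming `search_engine` is the `VectorStore` returned by `create_search_engine` and a made-up query:

```python
# Expose the indexed documents as a retriever for the RAG chain.
retriever = search_engine.as_retriever(search_kwargs={"k": 4})
relevant = retriever.get_relevant_documents("What does the PDF say about pricing?")
for doc in relevant:
    print(doc.metadata["source"], doc.page_content[:80])
```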
|