HanLee committed
Commit 36ba8c8 · 1 Parent(s): 2e7a35a

feat: 02_06e

Files changed (2)
  1. README.md +5 -7
  2. app/app.py +49 -20
README.md CHANGED
@@ -2,11 +2,10 @@
 This is the repository for the LinkedIn Learning course `Hands-On AI: Building and Deploying LLM-Powered Apps`. The full course is available from [LinkedIn Learning][lil-course-url].

 _See the readme file in the main branch for updated instructions and information._
-## Lab3: Enabling Load PDF to Chainlit App
-Building on top of the current simplified version of ChatGPT using Chainlit, we now going to add loading PDF capabilities into the application.
-
-In this lab, we will utilize the build in PDF loading and parsing connectors inside Langchain, load the PDF, and chunk the PDFs into individual pieces with their associated metadata.
-
+## Lab4: Indexing Documents into a Vector Database
+In the previous lab, we enabled document loading and chunked the documents into smaller sub-documents. Now we need to index them into our search engine's vector database so that we can build our Chat with PDF application using the RAG (Retrieval Augmented Generation) pattern.
+
+In this lab, we will add OpenAI's embedding model and index the documents we chunked in the previous section into a vector database. We will be using [Chroma](https://www.trychroma.com/) as the vector database of choice. Chroma is a lightweight embedding database that can live in memory, similar to SQLite.

 ## Exercises
@@ -33,6 +32,5 @@ chainlit run app/app.py -w

 ## References

-- [Langchain PDF Loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf)
-- [Langchain Text Splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/#text-splitters)
-- [Chainlit Ask File Message](https://docs.chainlit.io/api-reference/ask/ask-for-file)
+- [Langchain Embedding Models](https://python.langchain.com/docs/modules/data_connection/text_embedding/)
+- [ChromaDB Langchain Integration](https://docs.trychroma.com/integrations/langchain)
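For readers skimming the README without the code, the indexing flow described above boils down to a few Langchain calls. Below is a minimal, self-contained sketch, not the course code itself (the real wiring is in `app/app.py` below); it assumes `OPENAI_API_KEY` is set in the environment, and the two sample `Document` chunks are made up for illustration:

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.vectorstores import Chroma

# Illustrative stand-ins for the chunks produced by process_file()
docs = [
    Document(page_content="Chainlit renders chat UIs for LLM apps.",
             metadata={"source": "source_0"}),
    Document(page_content="Chroma keeps embeddings in memory, like SQLite.",
             metadata={"source": "source_1"}),
]

# Encode each chunk with OpenAI's text-embedding-ada-002 and index it in Chroma
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
search_engine = Chroma.from_documents(documents=docs, embedding=embeddings)

# The RAG retrieval step: embed the query, return the most similar chunks
hits = search_engine.similarity_search("What does Chroma do?", k=1)
print(hits[0].page_content, hits[0].metadata)
```

`Chroma.from_documents` handles both halves of indexing here: it calls the embedding model on every chunk and stores the resulting vectors alongside the chunk metadata, so retrieval later is a single `similarity_search` call.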
 
app/app.py CHANGED
@@ -18,6 +18,7 @@ from langchain.document_loaders import PDFPlumberLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.prompts import ChatPromptTemplate
 from langchain.schema import Document, StrOutputParser
+from langchain.schema.embeddings import Embeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.vectorstores.base import VectorStore
@@ -55,8 +56,7 @@ def process_file(*, file: AskFileResponse) -> List[Document]:
     )
     docs = text_splitter.split_documents(documents)

-    # We are adding source_id into the metadata here to denote which
-    # source document it is.
+    # Add a source_id to each chunk's metadata to denote which chunk it is
     for i, doc in enumerate(docs):
         doc.metadata["source"] = f"source_{i}"

@@ -66,35 +66,44 @@ def process_file(*, file: AskFileResponse) -> List[Document]:
     return docs


-def create_search_engine(*, file: AskFileResponse) -> VectorStore:
-
-    # Process and save data in the user session
-    docs = process_file(file=file)
-    cl.user_session.set("docs", docs)
-
-    encoder = OpenAIEmbeddings(
-        model="text-embedding-ada-002"
-    )
-
-    # Initialize Chromadb client and settings, reset to ensure we get a clean
-    # search engine
+def create_search_engine(*, docs: List[Document], embeddings: Embeddings) -> VectorStore:
+    """Takes a list of Langchain Documents and an embedding model API wrapper
+    and builds a search index using a VectorStore.
+
+    Args:
+        docs (List[Document]): list of Langchain Documents to be indexed into
+            the search engine.
+        embeddings (Embeddings): encoder model API used to calculate the embeddings.
+
+    Returns:
+        VectorStore: Langchain VectorStore
+    """
+    # Initialize the Chromadb client to enable resetting and disable telemetry
     client = chromadb.EphemeralClient()
     client_settings = Settings(
         allow_reset=True,
         anonymized_telemetry=False
     )
+
+    # Reset the search engine to ensure we don't use old copies.
+    # NOTE: we do not need this for production
     search_engine = Chroma(
         client=client,
         client_settings=client_settings
     )
     search_engine._client.reset()
-
+    ##########################################################################
+    # Exercise 1b:
+    # Now that we have defined our encoder model and initialized our search
+    # engine client, please create the search engine from the documents.
+    ##########################################################################
     search_engine = Chroma.from_documents(
         client=client,
         documents=docs,
-        embedding=encoder,
+        embedding=embeddings,
         client_settings=client_settings
     )
+    ##########################################################################

     return search_engine

@@ -107,7 +116,7 @@ async def on_chat_start():
     Returns:
         None
     """
-
+    # Ask the user to upload a PDF to chat with
     files = None
     while files is None:
         files = await cl.AskFileMessage(
@@ -117,15 +126,35 @@ async def on_chat_start():
     ).send()
     file = files[0]

-    # Send message to user to let them know we are processing the file
+    # Process the file and save the data in the user session
     msg = cl.Message(content=f"Processing `{file.name}`...")
     await msg.send()
-
+
+    docs = process_file(file=file)
+    cl.user_session.set("docs", docs)
+    msg.content = f"`{file.name}` processed. Loading..."
+    await msg.update()
+
+    # Index the documents into our search engine
+    ##########################################################################
+    # Exercise 1a:
+    # Add OpenAI's embedding model as the encoder. The most standard one to
+    # use is text-embedding-ada-002.
+    ##########################################################################
+    embeddings = OpenAIEmbeddings(
+        model="text-embedding-ada-002"
+    )
+    ##########################################################################
     try:
-        search_engine = await cl.make_async(create_search_engine)(file=file)
+        search_engine = await cl.make_async(create_search_engine)(
+            docs=docs,
+            embeddings=embeddings
+        )
     except Exception as e:
         await cl.Message(content=f"Error: {e}").send()
         raise SystemError
+    msg.content = f"`{file.name}` loaded. You can now ask questions!"
+    await msg.update()

     model = ChatOpenAI(
         model="gpt-3.5-turbo-16k-0613",