HanLee committed on
Commit
ceaa8ef
·
1 Parent(s): 4a49d79

feat: 01_06 End

Browse files
Files changed (2) hide show
  1. app/app.py +2 -200
  2. app/prompt.py +0 -26
app/app.py CHANGED
@@ -1,207 +1,9 @@
1
- # Chroma compatibility issue resolution
2
- # https://docs.trychroma.com/troubleshooting#sqlite
3
- __import__('pysqlite3')
4
- import sys
5
- sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
6
-
7
- from tempfile import NamedTemporaryFile
8
-
9
  import chainlit as cl
10
- from chainlit.types import AskFileResponse
11
-
12
- import chromadb
13
- from chromadb.config import Settings
14
- from langchain.chains import ConversationalRetrievalChain, RetrievalQAWithSourcesChain
15
- from langchain.chains.base import Chain
16
- from langchain.chat_models import ChatOpenAI
17
- from langchain.document_loaders import PDFPlumberLoader
18
- from langchain.embeddings.openai import OpenAIEmbeddings
19
- from langchain.text_splitter import RecursiveCharacterTextSplitter
20
- from langchain.vectorstores import Chroma
21
- from langchain.vectorstores.base import VectorStore
22
-
23
- from prompt import EXAMPLE_PROMPT, PROMPT, WELCOME_MESSAGE
24
-
25
-
26
- namespaces = set()
27
-
28
-
29
- def process_file(*, file: AskFileResponse) -> list:
30
- if file.type != "application/pdf":
31
- raise TypeError("Only PDF files are supported")
32
-
33
-
34
- with NamedTemporaryFile() as tempfile:
35
- tempfile.write(file.content)
36
-
37
- ######################################################################
38
- #
39
- # 1. Load the PDF
40
- #
41
- ######################################################################
42
- loader = PDFPlumberLoader(tempfile.name)
43
-
44
- ######################################################################
45
- documents = loader.load()
46
-
47
- ######################################################################
48
- #
49
- # 2. Split the text
50
- #
51
- ######################################################################
52
- text_splitter = RecursiveCharacterTextSplitter(
53
- chunk_size=3000,
54
- chunk_overlap=100
55
- )
56
- ######################################################################
57
-
58
- docs = text_splitter.split_documents(documents)
59
-
60
- for i, doc in enumerate(docs):
61
- doc.metadata["source"] = f"source_{i}"
62
-
63
- if not docs:
64
- raise ValueError("PDF file parsing failed.")
65
-
66
- return docs
67
-
68
-
69
- def create_search_engine(*, file: AskFileResponse) -> VectorStore:
70
-
71
- # Process and save data in the user session
72
- docs = process_file(file=file)
73
- cl.user_session.set("docs", docs)
74
-
75
- ##########################################################################
76
- #
77
- # 3. Set the Encoder model for creating embeddings
78
- #
79
- ##########################################################################
80
- encoder = OpenAIEmbeddings(
81
- model="text-embedding-ada-002"
82
- )
83
- ##########################################################################
84
-
85
- # Initialize Chromadb client and settings, reset to ensure we get a clean
86
- # search engine
87
- client = chromadb.EphemeralClient()
88
- client_settings=Settings(
89
- allow_reset=True,
90
- anonymized_telemetry=False
91
- )
92
- search_engine = Chroma(
93
- client=client,
94
- client_settings=client_settings
95
- )
96
- search_engine._client.reset()
97
-
98
- ##########################################################################
99
- #
100
- # 4. Create the document search engine. Remember to add
101
- # client_settings using the above settings.
102
- #
103
- ##########################################################################
104
-
105
- search_engine = Chroma.from_documents(
106
- client=client,
107
- documents=docs,
108
- embedding=encoder,
109
- client_settings=client_settings
110
- )
111
- ##########################################################################
112
-
113
- return search_engine
114
-
115
-
116
- @cl.on_chat_start
117
- async def start():
118
-
119
- files = None
120
- while files is None:
121
- files = await cl.AskFileMessage(
122
- content=WELCOME_MESSAGE,
123
- accept=["application/pdf"],
124
- max_size_mb=20,
125
- ).send()
126
-
127
- file = files[0]
128
- msg = cl.Message(content=f"Processing `{file.name}`...")
129
- await msg.send()
130
-
131
- try:
132
- search_engine = await cl.make_async(create_search_engine)(file=file)
133
- except Exception as e:
134
- await cl.Message(content=f"Error: {e}").send()
135
- raise SystemError
136
-
137
- llm = ChatOpenAI(
138
- model='gpt-3.5-turbo-16k-0613',
139
- temperature=0,
140
- streaming=True
141
- )
142
-
143
- ##########################################################################
144
- #
145
- # 5. Create the chain / tool for RetrievalQAWithSourcesChain.
146
- #
147
- ##########################################################################
148
- chain = RetrievalQAWithSourcesChain.from_chain_type(
149
- llm=llm,
150
- chain_type="stuff",
151
- retriever=search_engine.as_retriever(max_tokens_limit=4097),
152
- ######################################################################
153
- # 6. Customize prompts to improve summarization and question
154
- # answering performance. Perhaps create your own prompt in prompts.py?
155
- ######################################################################
156
- chain_type_kwargs={
157
- "prompt": PROMPT,
158
- "document_prompt": EXAMPLE_PROMPT
159
- },
160
- )
161
- ##########################################################################
162
-
163
- # await msg.update(content=f"`{file.name}` processed. You can now ask questions!")
164
- msg.content = f"`{file.name}` processed. You can now ask questions!"
165
- await msg.update()
166
-
167
- cl.user_session.set("chain", chain)
168
 
169
 
170
  @cl.on_message
171
  async def main(message: cl.Message):
172
 
173
- chain = cl.user_session.get("chain") # type: ConversationalRetrievalChain
174
- cb = cl.AsyncLangchainCallbackHandler()
175
- response = await chain.acall(message.content, callbacks=[cb])
176
- answer = response["answer"]
177
- sources = response["sources"].strip()
178
- source_elements = []
179
-
180
- # Get the documents from the user session
181
- docs = cl.user_session.get("docs")
182
- metadatas = [doc.metadata for doc in docs]
183
- all_sources = [m["source"] for m in metadatas]
184
-
185
- # Adding sources to the answer
186
- if sources:
187
- found_sources = []
188
-
189
- # Add the sources to the message
190
- for source in sources.split(","):
191
- source_name = source.strip().replace(".", "")
192
- # Get the index of the source
193
- try:
194
- index = all_sources.index(source_name)
195
- except ValueError:
196
- continue
197
- text = docs[index].page_content
198
- found_sources.append(source_name)
199
- # Create the text element referenced in the message
200
- source_elements.append(cl.Text(content=text, name=source_name))
201
-
202
- if found_sources:
203
- answer += f"\nSources: {', '.join(found_sources)}"
204
- else:
205
- answer += "\nNo sources found"
206
 
207
- await cl.Message(content=answer, elements=source_elements).send()
 
 
 
 
 
 
 
 
 
1
  import chainlit as cl
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
 
4
  @cl.on_message
5
  async def main(message: cl.Message):
6
 
7
+ response = message.content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+ await cl.Message(content=response).send()
app/prompt.py DELETED
@@ -1,26 +0,0 @@
1
- # flake8: noqa
2
- from langchain.prompts import PromptTemplate
3
-
4
- WELCOME_MESSAGE = """\
5
- Welcome to Introduction to LLM App Development Sample PDF QA Application!
6
- To get started:
7
- 1. Upload a PDF or text file
8
- 2. Ask any question about the file!
9
- """
10
-
11
- template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
12
- If you don't know the answer, just say that you don't know. Don't try to make up an answer.
13
- ALWAYS return a "SOURCES" field in your answer, with the format "SOURCES: <source1>, <source2>, <source3>, ...".
14
-
15
- QUESTION: {question}
16
- =========
17
- {summaries}
18
- =========
19
- FINAL ANSWER:"""
20
-
21
- PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])
22
-
23
- EXAMPLE_PROMPT = PromptTemplate(
24
- template="Content: {page_content}\nSource: {source}",
25
- input_variables=["page_content", "source"],
26
- )