File size: 6,576 Bytes
2e7a35a bb87055 2e7a35a bb87055 2e7a35a 99b7cde 4a49d79 99b7cde 2e7a35a bb87055 324b092 2e7a35a 64fa517 36ba8c8 99b7cde 2e7a35a 99b7cde ce5d5d0 99b7cde bb87055 99b7cde adb96e6 99b7cde bb87055 99b7cde 36ba8c8 99b7cde cb31088 bb87055 36ba8c8 477f1cf bb87055 36ba8c8 bb87055 477f1cf 36ba8c8 bb87055 477f1cf bb87055 477f1cf 324b092 adb96e6 36ba8c8 6c98e1c 36ba8c8 6c98e1c bb87055 36ba8c8 bb87055 477f1cf 36ba8c8 bb87055 36ba8c8 477f1cf 36ba8c8 477f1cf 324b092 bb87055 324b092 64fa517 ce5d5d0 324b092 4a49d79 3a5a4c6 4a49d79 324b092 64fa517 324b092 64fa517 bb87055 324b092 64fa517 9073321 64fa517 9073321 64fa517 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
# Chroma compatibility issue resolution
# https://docs.trychroma.com/troubleshooting#sqlite
__import__("pysqlite3")
import sys
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
from tempfile import NamedTemporaryFile
from typing import List
import chainlit as cl
from chainlit.types import AskFileResponse
import chromadb
from chromadb.config import Settings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PDFPlumberLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.schema.embeddings import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.vectorstores.base import VectorStore
from prompt import EXAMPLE_PROMPT, PROMPT
def process_file(*, file: AskFileResponse) -> List[Document]:
    """Processes one PDF file from a Chainlit AskFileResponse object by first
    loading the PDF document and then chunking it into sub-documents. Only
    supports PDF files.

    Args:
        file (AskFileResponse): input file to be processed

    Raises:
        TypeError: when the uploaded file is not a PDF.
        ValueError: when we fail to process PDF files. We consider PDF file
            processing failure when there's no text returned. For example,
            PDFs with only image contents, corrupted PDFs, etc.

    Returns:
        List[Document]: List of Document(s). Each individual document has two
            fields: page_content(string) and metadata(dict).
    """
    if file.type != "application/pdf":
        raise TypeError("Only PDF files are supported")

    with NamedTemporaryFile() as tempfile:
        tempfile.write(file.content)
        # Flush buffered bytes to disk before PDFPlumberLoader re-opens the
        # file by name; without this, larger PDFs can be read back truncated.
        tempfile.flush()
        loader = PDFPlumberLoader(tempfile.name)
        documents = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=3000, chunk_overlap=100
        )
        docs = text_splitter.split_documents(documents)

        # Fail fast when parsing yielded no text (image-only or corrupted PDF)
        if not docs:
            raise ValueError("PDF file parsing failed.")

        # Adding source_id into the metadata to denote which document it is
        for i, doc in enumerate(docs):
            doc.metadata["source"] = f"source_{i}"

        return docs
def create_search_engine(
    *, docs: List[Document], embeddings: Embeddings
) -> VectorStore:
    """Takes a list of Langchain Documents and an embedding model API wrapper
    and builds a search index using a VectorStore.

    Args:
        docs (List[Document]): List of Langchain Documents to be indexed into
            the search engine.
        embeddings (Embeddings): encoder model API used to calculate embedding

    Returns:
        VectorStore: Langchain VectorStore
    """
    # Initialize Chromadb client to enable resetting and disable telemetry.
    # FIX: the settings must be passed to the client itself — previously they
    # were created but never applied, so allow_reset/anonymized_telemetry
    # had no effect on the underlying client.
    client_settings = Settings(allow_reset=True, anonymized_telemetry=False)
    client = chromadb.EphemeralClient(settings=client_settings)

    # Reset the search engine to ensure we don't use old copies.
    # NOTE: we do not need this for production
    search_engine = Chroma(client=client, client_settings=client_settings)
    search_engine._client.reset()

    search_engine = Chroma.from_documents(
        client=client,
        documents=docs,
        embedding=embeddings,
        client_settings=client_settings,
    )

    return search_engine
@cl.on_chat_start
async def on_chat_start():
    """This function is written to prepare the environments for the chat
    with PDF application. It should be decorated with cl.on_chat_start.

    Steps performed once per chat session:
    1. Ask the user to upload a PDF.
    2. Parse and chunk the PDF into Documents (stored in the user session).
    3. Index the chunks into a Chroma-backed search engine.
    4. Build a RetrievalQAWithSourcesChain and store it in the user session.

    Returns:
        None
    """
    # Asking user to upload a PDF to chat with; loop until a file arrives.
    files = None
    while files is None:
        files = await cl.AskFileMessage(
            content="Please Upload the PDF file you want to chat with...",
            accept=["application/pdf"],
            max_size_mb=20,
        ).send()
    file = files[0]

    # Process and save data in the user session
    msg = cl.Message(content=f"Processing `{file.name}`...")
    await msg.send()
    docs = process_file(file=file)
    cl.user_session.set("docs", docs)
    msg.content = f"`{file.name}` processed. Loading ..."
    await msg.update()

    # Indexing documents into our search engine. Index construction is
    # blocking, so it is pushed off the event loop with make_async.
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    try:
        search_engine = await cl.make_async(create_search_engine)(
            docs=docs, embeddings=embeddings
        )
    except Exception as e:
        await cl.Message(content=f"Error: {e}").send()
        # Chain the original exception so the traceback shows the root
        # cause instead of a bare SystemError.
        raise SystemError from e
    msg.content = f"`{file.name}` loaded. You can now ask questions!"
    await msg.update()

    model = ChatOpenAI(
        model="gpt-3.5-turbo-16k-0613", temperature=0, streaming=True
    )
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        llm=model,
        chain_type="stuff",
        retriever=search_engine.as_retriever(max_tokens_limit=4097),
        chain_type_kwargs={"prompt": PROMPT, "document_prompt": EXAMPLE_PROMPT},
    )
    # We are saving the chain in user_session, so we do not have to rebuild
    # it every single time.
    cl.user_session.set("chain", chain)
@cl.on_message
async def main(message: cl.Message):
    """Answer one user question with the QA chain stored in the session,
    attaching the referenced source chunks as text elements."""
    # Reuse the chain built during on_chat_start.
    chain = cl.user_session.get("chain")  # type: RetrievalQAWithSourcesChain
    response = await chain.acall(
        message.content,
        callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)],
    )
    answer = response["answer"]
    sources = response["sources"].strip()

    # Recover every chunk's source id so cited sources can be resolved
    # back to their original text.
    docs = cl.user_session.get("docs")
    all_sources = [doc.metadata["source"] for doc in docs]

    source_elements = []
    if sources:
        found_sources = []
        # Resolve each cited source; unknown citations are skipped.
        for raw_source in sources.split(","):
            source_name = raw_source.strip().replace(".", "")
            try:
                idx = all_sources.index(source_name)
            except ValueError:
                continue
            found_sources.append(source_name)
            # Attach the cited chunk's text as a referenced element.
            source_elements.append(
                cl.Text(content=docs[idx].page_content, name=source_name)
            )

        if found_sources:
            answer += f"\nSources: {', '.join(found_sources)}"
        else:
            answer += "\nNo sources found"

    await cl.Message(content=answer, elements=source_elements).send()
|