from tempfile import NamedTemporaryFile
from typing import List

import chainlit as cl
from chainlit.types import AskFileResponse
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document, StrOutputParser
from langchain.chains import LLMChain

from langchain.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def process_file(*, file: AskFileResponse) -> List[Document]:
    """Process one PDF file from a Chainlit AskFileResponse object by
    loading the PDF document and then chunking it into sub-documents.
    Only PDF files are supported.

    Args:
        file (AskFileResponse): the input file to be processed.

    Raises:
        TypeError: when the uploaded file is not a PDF.
        ValueError: when PDF processing fails. Processing is considered to
            have failed when no text is returned, e.g. for image-only or
            corrupted PDFs.

    Returns:
        List[Document]: a list of Documents. Each Document has two fields:
            page_content (str) and metadata (dict).
    """
    if file.type != "application/pdf":
        raise TypeError("Only PDF files are supported")
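
    # Persist the upload to a temporary file on disk, since PDFPlumberLoader
    # accepts a file path rather than an in-memory buffer.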
    with NamedTemporaryFile() as temp_file:
        temp_file.write(file.content)
        # Flush so every buffered byte reaches disk before the loader opens
        # the file by name.
        temp_file.flush()

        loader = PDFPlumberLoader(temp_file.name)
        documents = loader.load()
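
        # Split the pages into overlapping chunks. For
        # RecursiveCharacterTextSplitter, chunk_size and chunk_overlap are
        # measured in characters, not tokens.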
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=3000,
            chunk_overlap=100,
        )
        docs = text_splitter.split_documents(documents)

        if not docs:
            raise ValueError("PDF file parsing failed.")

        # Give each chunk a stable identifier so answers can later cite
        # their sources.
        for i, doc in enumerate(docs):
            doc.metadata["source"] = f"source_{i}"

        return docs


@cl.on_chat_start
async def on_chat_start():
    """Prepare the environment for the chat-with-PDF application. This
    function must be decorated with @cl.on_chat_start.

    Returns:
        None
    """
    files = None
    while files is None:
        files = await cl.AskFileMessage(
            content="Please upload the PDF file you want to chat with...",
            accept=["application/pdf"],
            max_size_mb=20,
        ).send()
    file = files[0]

    msg = cl.Message(content=f"Processing `{file.name}`...")
    await msg.send()
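
    # streaming=True lets generated tokens be forwarded to the Chainlit UI
    # as they arrive rather than in one final block.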
    model = ChatOpenAI(
        model="gpt-3.5-turbo-16k-0613",
        streaming=True,
    )

    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are Chainlit GPT, a helpful assistant.",
            ),
            (
                "human",
                "{question}",
            ),
        ]
    )
    chain = LLMChain(llm=model, prompt=prompt, output_parser=StrOutputParser())
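
    # Store the chain in the per-user session so the on_message handler can
    # reuse it on every turn of the conversation.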
    cl.user_session.set("chain", chain)


@cl.on_message
async def main(message: cl.Message):
    """Answer each incoming user message with the LLMChain stored in the
    user session. This function must be decorated with @cl.on_message.
    """
    chain = cl.user_session.get("chain")
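
    # cl.LangchainCallbackHandler streams the chain's intermediate steps and
    # tokens back to the Chainlit UI during the run.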
    response = await chain.arun(
        question=message.content, callbacks=[cl.LangchainCallbackHandler()]
    )

    await cl.Message(content=response).send()