feat: heya
Browse files- README.md +3 -4
- app/app.py +66 -1
README.md
CHANGED
@@ -5,9 +5,8 @@ _See the readme file in the main branch for updated instructions and information
|
|
5 |
## Lab3: Enabling Load PDF to Chainlit App
|
6 |
Building on top of the current simplified version of ChatGPT using Chainlit, we are now going to add loading PDF capabilities into the application.
|
7 |
|
8 |
-
|
9 |
|
10 |
-
In this lab, we will be adding a Chat LLM to our Chainlit app using Langchain.
|
11 |
|
12 |
## Exercises
|
13 |
|
@@ -34,6 +33,6 @@ chainlit run app/app.py -w
|
|
34 |
|
35 |
## References
|
36 |
|
37 |
-
- [Langchain
|
38 |
-
- [Langchain
|
39 |
- [Chainlit's documentation](https://docs.chainlit.io/get-started/pure-python)
|
|
|
5 |
## Lab3: Enabling Load PDF to Chainlit App
|
6 |
Building on top of the current simplified version of ChatGPT using Chainlit, we are now going to add loading PDF capabilities into the application.
|
7 |
|
8 |
+
In this lab, we will utilize the built-in PDF loading and parsing connectors inside Langchain, load the PDF, and chunk the PDFs into individual pieces with their associated metadata.
|
9 |
|
|
|
10 |
|
11 |
## Exercises
|
12 |
|
|
|
33 |
|
34 |
## References
|
35 |
|
36 |
+
- [Langchain PDF Loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf)
|
37 |
+
- [Langchain Text Splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/#text-splitters)
|
38 |
- [Chainlit's documentation](https://docs.chainlit.io/get-started/pure-python)
|
app/app.py
CHANGED
@@ -1,9 +1,74 @@
|
|
|
|
|
|
|
|
1 |
import chainlit as cl
|
|
|
2 |
from langchain.chat_models import ChatOpenAI
|
3 |
from langchain.prompts import ChatPromptTemplate
|
4 |
-
from langchain.schema import StrOutputParser
|
5 |
from langchain.chains import LLMChain
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
@cl.on_chat_start
|
9 |
async def on_chat_start():
|
|
|
1 |
+
from tempfile import NamedTemporaryFile
|
2 |
+
from typing import List
|
3 |
+
|
4 |
import chainlit as cl
|
5 |
+
from chainlit.types import AskFileResponse
|
6 |
from langchain.chat_models import ChatOpenAI
|
7 |
from langchain.prompts import ChatPromptTemplate
|
8 |
+
from langchain.schema import Document, StrOutputParser
|
9 |
from langchain.chains import LLMChain
|
10 |
|
11 |
+
from langchain.document_loaders import PDFPlumberLoader
|
12 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
13 |
+
|
14 |
+
|
15 |
+
def process_file(*, file: AskFileResponse) -> List[Document]:
    """Process one PDF file from a Chainlit AskFileResponse object.

    The PDF is first loaded as a single set of page documents and then
    chunked into sub-documents. Only PDF files are supported.

    Args:
        file (AskFileResponse): input file to be processed.

    Raises:
        TypeError: when the input file is not a PDF.
        ValueError: when we fail to process the PDF file. We consider PDF
            processing a failure when there's no text returned. For example,
            PDFs with only image contents, corrupted PDFs, etc.

    Returns:
        List[Document]: List of Document(s). Each individual document has two
            fields: page_content (string) and metadata (dict).
    """
    # We only support PDF as input.
    if file.type != "application/pdf":
        raise TypeError("Only PDF files are supported")

    with NamedTemporaryFile() as tempfile:
        tempfile.write(file.content)
        # Flush buffered writes to disk before the loader opens the file by
        # name; otherwise the loader may see an empty or truncated PDF.
        tempfile.flush()

        ######################################################################
        # Exercise 1a:
        # We have the input PDF file saved as a temporary file. The name of
        # the file is 'tempfile.name'. Please use one of the PDF loaders in
        # Langchain to load the file.
        ######################################################################
        loader = PDFPlumberLoader(tempfile.name)
        documents = loader.load()
        ######################################################################

        ######################################################################
        # Exercise 1b:
        # We can now chunk the documents now it is loaded. Langchain provides
        # a list of helpful text splitters. Please use one of the splitters
        # to chunk the file.
        ######################################################################
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=3000,
            chunk_overlap=100,
        )
        docs = text_splitter.split_documents(documents)
        ######################################################################

        # Fail early if parsing produced no text (e.g. image-only or
        # corrupted PDFs) before doing any per-chunk work.
        if not docs:
            raise ValueError("PDF file parsing failed.")

        # We are adding source_id into the metadata here to denote which
        # source document it is.
        for i, doc in enumerate(docs):
            doc.metadata["source"] = f"source_{i}"

        return docs
|
71 |
+
|
72 |
|
73 |
@cl.on_chat_start
|
74 |
async def on_chat_start():
|