Spaces:
Runtime error
Runtime error
File size: 4,310 Bytes
59122b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
import io
import os
import boto3
from langchain.document_loaders import PyPDFium2Loader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from pdf2image import convert_from_path
from sllim import chat
# Standard Textract client setup
# Module-level client, created once at import time and reused by
# convert_pdf_to_text for every detect_document_text request.
textract_client = boto3.client("textract")
# Prompt sent to the chat model. The {docs} and {query} placeholders are
# filled in with str.format by ask_question_all and ask_question.
template = """I will give you a couple of paragraphs from a PDF document along with a question about the document. You will provide an answer as accurately as possible and provide citations for why that answer is correct.
DOCUMENTS:
{docs}
---
QUERY:
{query}
"""
# Embedding model shared by every FAISS index this module builds or loads.
embeddings = OpenAIEmbeddings()
def convert_pdf_to_text(pdf_file_path: str):
    """Extract text from a PDF by running Textract on images of it.

    The PDF is turned into images with ``convert_from_path``; each image is
    encoded as JPEG in memory and sent to ``detect_document_text`` on the
    module-level Textract client.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        A list with one string per image returned by ``convert_from_path``.
        Each string is the concatenation of the ``Text`` of every ``LINE``
        block in the response, each followed by a newline.
    """
    page_texts = []
    for page_image in convert_from_path(pdf_file_path):
        # Encode the image as JPEG in memory; nothing is written to disk.
        with io.BytesIO() as buffer:
            page_image.save(buffer, "JPEG")
            jpeg_bytes = buffer.getvalue()

        # Ask Textract to detect text in the encoded image.
        result = textract_client.detect_document_text(
            Document={"Bytes": jpeg_bytes}
        )

        # Keep only blocks of type LINE; every other block type is ignored.
        lines = [
            block["Text"] + "\n"
            for block in result["Blocks"]
            if block["BlockType"] == "LINE"
        ]
        page_texts.append("".join(lines))
    return page_texts
def process_file(file_path):
    """Build and save a FAISS index for a PDF unless one already exists.

    The index location comes from ``get_index_name``. If that path already
    exists the function returns immediately. Otherwise the PDF is loaded
    with ``PyPDFium2Loader``, split into chunks, embedded and saved.

    Args:
        file_path: Path of the PDF file to index.

    Returns:
        None.
    """
    target = get_index_name(file_path)
    if os.path.exists(target):
        # Something already lives at the index path; skip the rebuild.
        return

    pages = PyPDFium2Loader(file_path).load()

    # Split the text into chunks of up to 1000 characters, overlapping by 50.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = splitter.split_documents(pages)
    if not chunks:
        # The loader produced no chunks; fall back to Textract OCR.
        chunks = splitter.create_documents(convert_pdf_to_text(file_path))

    # Embed the chunks and persist the resulting index.
    FAISS.from_documents(chunks, embeddings).save_local(target)
def get_index_name(file_path):
    """Return the FAISS index path used for ``file_path``.

    The directory part and the final extension of ``file_path`` are dropped
    and ``"_faiss_index"`` is appended, so the result is a bare relative
    name.

    Args:
        file_path: Path of the source file.

    Returns:
        The index name as a string, e.g. ``"report_faiss_index"`` for
        ``"/data/report.pdf"``.
    """
    _, filename = os.path.split(file_path)
    stem, _ = os.path.splitext(filename)
    return f"{stem}_faiss_index"
def ask_question_all(history):
    """Answer the pending question in a chat history using stored indexes.

    ``history`` is iterated as ``(user, bot)`` pairs and each pair is
    handled according to its shape:

    * ``user`` is not a string: ``user[0]`` is passed to ``get_index_name``
      and the resulting index path is remembered.
    * ``user`` is a string and ``bot`` is truthy: the pair is replayed as a
      user message followed by an assistant message.
    * ``user`` is a string and ``bot`` is falsy: every remembered index is
      searched for ``user`` and the prompt template is filled with the
      retrieved text and the question.

    Args:
        history: Iterable of ``(user, bot)`` pairs.

    Returns:
        Whatever ``chat`` returns for the assembled message list.
    """
    index_paths = []
    retrieved = []
    conversation = []

    for user, bot in history:
        if not isinstance(user, str):
            index_paths.append(get_index_name(user[0]))
            continue

        if bot:
            conversation.append({"role": "user", "content": user})
            conversation.append({"role": "assistant", "content": bot})
            continue

        # Pending question: search every index remembered so far. The
        # retrieved list is shared across iterations, so hits accumulate.
        for path in index_paths:
            store = FAISS.load_local(path, embeddings)
            retrieved.extend(store.similarity_search(user))

        context = "\n".join(doc.page_content for doc in retrieved)
        conversation.append(
            {
                "role": "user",
                "content": template.format(query=user, docs=context),
            }
        )

    # Send the conversation, including retrieved paragraphs, to the model.
    return chat(conversation, model="gpt-3.5-turbo")
def ask_question(query, upload_file, history=None):
    """Answer a question about a single uploaded PDF.

    The FAISS index for the file is loaded if its path exists; otherwise it
    is built from the PDF and saved. The chunks most similar to ``query``
    are inserted into the prompt template and sent to the chat model.

    Args:
        query: The question to ask.
        upload_file: Object whose ``name`` attribute is the PDF path.
        history: Accepted but not used by this function.

    Returns:
        Whatever ``chat`` returns for the single-message conversation.
    """
    pdf_path = upload_file.name
    index_name = get_index_name(pdf_path)

    if os.path.exists(index_name):
        store = FAISS.load_local(index_name, embeddings)
    else:
        # Same index-building steps as process_file, kept inline here.
        pages = PyPDFium2Loader(pdf_path).load()

        # Split the text into chunks of up to 1000 characters, overlapping by 50.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=50,
            length_function=len,
        )
        chunks = splitter.split_documents(pages)
        if not chunks:
            # The loader produced no chunks; fall back to Textract OCR.
            chunks = splitter.create_documents(convert_pdf_to_text(pdf_path))

        # Embed the chunks and persist the index for later calls.
        store = FAISS.from_documents(chunks, embeddings)
        store.save_local(index_name)

    hits = store.similarity_search(query)
    context = "\n".join(hit.page_content for hit in hits)
    prompt = template.format(query=query, docs=context)

    # Send the retrieved paragraphs together with the question to the model.
    return chat([{"role": "user", "content": prompt}], model="gpt-3.5-turbo")
|