awacke1 committed
Commit 650829b
1 Parent(s): a787e2f

Create app.py

Files changed (1)
  1. app.py +163 -0
app.py ADDED
@@ -0,0 +1,163 @@
+ import os
+ import re
+ import time
+ from io import BytesIO
+ from typing import Any, Dict, List
+ import openai
+ import streamlit as st
+ from langchain import LLMChain
+ from langchain.agents import AgentExecutor, Tool, ZeroShotAgent
+ from langchain.chains import RetrievalQA
+ from langchain.chains.question_answering import load_qa_chain
+ from langchain.docstore.document import Document
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.llms import OpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import VectorStore
+ from langchain.vectorstores.faiss import FAISS
+ from pypdf import PdfReader
+
+ # Define a function to parse a PDF file and extract its text content
+ @st.cache_data
+ def parse_pdf(file: BytesIO) -> List[str]:
+     pdf = PdfReader(file)
+     output = []
+     for page in pdf.pages:
+         text = page.extract_text()
+         # Merge hyphenated words
+         text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
+         # Fix newlines in the middle of sentences
+         text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
+         # Remove multiple newlines
+         text = re.sub(r"\n\s*\n", "\n\n", text)
+         output.append(text)
+     return output
+
+
+ @st.cache_data
+ def text_to_docs(text: str) -> List[Document]:
+     """Converts a string or list of strings to a list of Documents
+     with metadata."""
+     if isinstance(text, str):
+         # Take a single string as one page
+         text = [text]
+     page_docs = [Document(page_content=page) for page in text]
+
+     # Add page numbers as metadata
+     for i, doc in enumerate(page_docs):
+         doc.metadata["page"] = i + 1
+
+     # Split pages into chunks
+     doc_chunks = []
+
+     for doc in page_docs:
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=2000,
+             separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
+             chunk_overlap=0,
+         )
+         chunks = text_splitter.split_text(doc.page_content)
+         for i, chunk in enumerate(chunks):
+             doc = Document(
+                 page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
+             )
+             # Add sources as metadata
+             doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
+             doc_chunks.append(doc)
+     return doc_chunks
+
+
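+ # Note: test_embed reads the module-level `api` and `pages` values assigned below once a PDF is uploaded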
+ @st.cache_data
+ def test_embed():
+     embeddings = OpenAIEmbeddings(openai_api_key=api)
+     # Indexing
+     # Save in a Vector DB
+     with st.spinner("It's indexing..."):
+         index = FAISS.from_documents(pages, embeddings)
+     st.success("Embeddings done.", icon="✅")
+     return index
+
+ # Set up the Streamlit app
+ st.title("🤖 Memory Embeddings Chatbot 🧠")
+ st.markdown("""#### 🗨️ Chat with your PDF files 📜 with Memory """)
+ uploaded_file = st.file_uploader("**Upload PDF**", type=["pdf"])
+
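+ # Once a PDF is uploaded, parse it into per-page text and split it into chunked Documents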
+ if uploaded_file:
+     name_of_file = uploaded_file.name
+     doc = parse_pdf(uploaded_file)
+     pages = text_to_docs(doc)
+     if pages:
+         # Allow the user to select a page and view its content
+         with st.expander("Show Page Content", expanded=False):
+             page_sel = st.number_input(
+                 label="Select Page", min_value=1, max_value=len(pages), step=1
+             )
+             pages[page_sel - 1]  # Streamlit "magic" renders the selected Document
+
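+         # Read the OpenAI API key from the OPENAI_KEY environment variable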
+         api = os.getenv('OPENAI_KEY')
+
+         if api:
+             # Test the embeddings and save the index in a vector database
+             index = test_embed()
+             # Set up the question-answering system
+             qa = RetrievalQA.from_chain_type(
+                 llm=OpenAI(openai_api_key=api),
+                 chain_type="map_reduce",
+                 retriever=index.as_retriever(),
+             )
+             # Set up the conversational agent
+             tools = [
+                 Tool(
+                     name="State of Union QA System",
+                     func=qa.run,
+                     description="Useful for when you need to answer questions about the aspects asked. Input may be a partial or fully formed question.",
+                 )
+             ]
+             prefix = """Have a conversation with a human, answering the following questions as best you can based on the context and memory available.
+             You have access to a single tool:"""
+             suffix = """Begin!
+
+             {chat_history}
+             Question: {input}
+             {agent_scratchpad}"""
+
+             prompt = ZeroShotAgent.create_prompt(
+                 tools,
+                 prefix=prefix,
+                 suffix=suffix,
+                 input_variables=["input", "chat_history", "agent_scratchpad"],
+             )
+
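+             # Keep one ConversationBufferMemory in session_state so chat history survives Streamlit reruns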
+             if "memory" not in st.session_state:
+                 st.session_state.memory = ConversationBufferMemory(
+                     memory_key="chat_history"
+                 )
+
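+             # Chain the prompt into the LLM, then wrap it in a zero-shot agent that can call the QA tool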
+             llm_chain = LLMChain(
+                 llm=OpenAI(
+                     temperature=0, openai_api_key=api, model_name="gpt-3.5-turbo"
+                 ),
+                 prompt=prompt,
+             )
+             agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
+             agent_chain = AgentExecutor.from_agent_and_tools(
+                 agent=agent, tools=tools, verbose=True, memory=st.session_state.memory
+             )
+
+             # Allow the user to enter a query and generate a response
+             query = st.text_input(
+                 "**What's on your mind?**",
+                 placeholder="Ask me anything from {}".format(name_of_file),
+             )
+
+             if query:
+                 with st.spinner(
+                     "Generating answer to your query: `{}`".format(query)
+                 ):
+                     res = agent_chain.run(query)
+                     st.info(res, icon="🤖")
+
+             # Allow the user to view the conversation history and other information stored in the agent's memory
+             with st.expander("History/Memory"):
+                 st.session_state.memory