awacke1 committed on
Commit
4cfa7cf
·
1 Parent(s): 2e540a0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -67
app.py CHANGED
@@ -1,17 +1,16 @@
 
1
  import streamlit as st
2
  from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
5
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
  from langchain.vectorstores import FAISS
7
  from langchain.chat_models import ChatOpenAI
8
  from langchain.memory import ConversationBufferMemory
9
  from langchain.chains import ConversationalRetrievalChain
10
  from htmlTemplates import css, bot_template, user_template
11
- from langchain.llms import HuggingFaceHub
12
- import os
13
 
14
- def get_pdf_text(pdf_docs):
15
  text = ""
16
  for pdf in pdf_docs:
17
  pdf_reader = PdfReader(pdf)
@@ -19,91 +18,47 @@ def get_pdf_text(pdf_docs):
19
  text += page.extract_text()
20
  return text
21
 
 
 
 
22
 
23
def get_text_chunks(text):
    """Split raw text into overlapping chunks.

    The text is split on newline characters by a ``CharacterTextSplitter``
    configured with a chunk size of 1000 and an overlap of 200, both
    measured with ``len``.

    Args:
        text: The text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
32
-
33
-
34
def get_vectorstore(text_chunks):
    """Embed text chunks with OpenAI embeddings and index them in FAISS.

    The OpenAI API key is read from the ``OPENAI_KEY`` environment variable
    and passed to ``OpenAIEmbeddings``.

    Args:
        text_chunks: The text chunks to embed and index.

    Returns:
        The vector store produced by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    # Fail early with a clear message instead of an opaque error from the
    # index construction when there is nothing to embed.
    if not text_chunks:
        raise ValueError("text_chunks is empty; there is no text to index")

    key = os.getenv('OPENAI_KEY')
    embeddings = OpenAIEmbeddings(openai_api_key=key)
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
43
 
44
-
45
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over a vector store.

    Args:
        vectorstore: Object whose ``as_retriever()`` supplies the retriever.

    Returns:
        The chain returned by ``ConversationalRetrievalChain.from_llm``,
        wired to a ``ChatOpenAI`` model and a ``ConversationBufferMemory``
        stored under the ``chat_history`` key.
    """
    chain_kwargs = {"llm": ChatOpenAI()}
    chain_kwargs["memory"] = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    chain_kwargs["retriever"] = vectorstore.as_retriever()
    return ConversationalRetrievalChain.from_llm(**chain_kwargs)
57
-
58
 
59
def handle_userinput(user_question):
    """Send a question to the conversation chain and render the chat history.

    The chain's ``chat_history`` is stored in ``st.session_state`` and each
    message is rendered through the user template (even indices) or the bot
    template (odd indices).

    Args:
        user_question: The question text entered by the user.
    """
    import html  # local import so this block does not depend on file-level edits

    conversation = st.session_state.get("conversation")
    if conversation is None:
        # No documents have been processed yet, so there is no chain to call.
        st.warning("Please upload and process your documents before asking a question.")
        return

    response = conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        # The template is rendered as raw HTML, so escape the message text to
        # keep user or model output from injecting markup.
        safe_content = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_content), unsafe_allow_html=True)
70
-
71
 
72
def main():
    """Run the "Chat with multiple PDFs" Streamlit page.

    Loads environment variables, renders the question box and the sidebar
    uploader, and on "Process" builds the conversation chain from the
    uploaded PDFs and stores it in ``st.session_state.conversation``.
    """
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        if st.session_state.conversation is None:
            # Nothing has been processed yet; calling the chain would fail.
            st.warning("Please upload and process your documents before asking a question.")
        else:
            handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            if not pdf_docs:
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing"):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    if not text_chunks:
                        # No chunks means there is nothing to embed or index.
                        st.error("No extractable text was found in the uploaded PDFs.")
                    else:
                        vectorstore = get_vectorstore(text_chunks)
                        st.session_state.conversation = get_conversation_chain(
                            vectorstore)
106
-
107
 
108
# Start the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
 
1
+ import os
2
  import streamlit as st
3
  from dotenv import load_dotenv
4
  from PyPDF2 import PdfReader
5
  from langchain.text_splitter import CharacterTextSplitter
6
+ from langchain.embeddings import OpenAIEmbeddings
7
  from langchain.vectorstores import FAISS
8
  from langchain.chat_models import ChatOpenAI
9
  from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
11
  from htmlTemplates import css, bot_template, user_template
 
 
12
 
13
+ def extract_text_from_pdfs(pdf_docs):
14
  text = ""
15
  for pdf in pdf_docs:
16
  pdf_reader = PdfReader(pdf)
 
18
  text += page.extract_text()
19
  return text
20
 
21
def split_text_into_chunks(text):
    """Split text on newlines into overlapping chunks.

    Uses a ``CharacterTextSplitter`` with a chunk size of 1000 and a chunk
    overlap of 200, with lengths measured by ``len``.

    Args:
        text: The text to split.

    Returns:
        The result of ``CharacterTextSplitter.split_text`` for ``text``.
    """
    newline_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = newline_splitter.split_text(text)
    return chunks
24
 
25
def create_vector_store_from_text_chunks(text_chunks):
    """Embed text chunks with OpenAI embeddings and index them in FAISS.

    The OpenAI API key is read from the ``OPENAI_KEY`` environment variable
    and passed to ``OpenAIEmbeddings``.

    Args:
        text_chunks: The text chunks to embed and index.

    Returns:
        The vector store produced by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    # Reject empty input up front so the caller gets a clear error rather
    # than a failure from inside the index construction.
    if not text_chunks:
        raise ValueError("text_chunks is empty; there is no text to index")

    key = os.getenv('OPENAI_KEY')
    embeddings = OpenAIEmbeddings(openai_api_key=key)
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
 
 
 
29
 
30
def create_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over a vector store.

    Args:
        vectorstore: Object whose ``as_retriever()`` supplies the retriever.

    Returns:
        The chain returned by ``ConversationalRetrievalChain.from_llm``,
        using a ``ChatOpenAI`` model and a ``ConversationBufferMemory``
        stored under the ``chat_history`` key.
    """
    chat_model = ChatOpenAI()
    history_buffer = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    document_retriever = vectorstore.as_retriever()
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=document_retriever,
        memory=history_buffer,
    )
 
 
 
 
 
 
 
 
 
34
 
35
def process_user_input(user_question):
    """Send a question to the conversation chain and render the chat history.

    The chain's ``chat_history`` is stored in ``st.session_state`` and each
    message is rendered through the user template (even indices) or the bot
    template (odd indices).

    Args:
        user_question: The question text entered by the user.
    """
    import html  # local import so this block does not depend on file-level edits

    # ``conversation`` is only set after "Process" has run; look it up
    # defensively instead of assuming the attribute exists.
    conversation = st.session_state.get("conversation")
    if conversation is None:
        st.warning("Please upload and process your documents before asking a question.")
        return

    response = conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        # The template is rendered as raw HTML, so escape the message text to
        # keep user or model output from injecting markup.
        safe_content = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_content), unsafe_allow_html=True)
 
 
 
 
 
42
 
43
def main():
    """Run the "Chat with multiple PDFs" Streamlit page.

    Loads environment variables, renders the question box and the sidebar
    uploader, and on "Process" builds the conversation chain from the
    uploaded PDFs and stores it in ``st.session_state.conversation``.
    """
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Give the session keys a default so they can be read before any
    # documents have been processed.
    for state_key in ("conversation", "chat_history"):
        if state_key not in st.session_state:
            st.session_state[state_key] = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        if st.session_state.conversation is None:
            # Nothing has been processed yet; there is no chain to call.
            st.warning("Please upload and process your documents before asking a question.")
        else:
            process_user_input(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            if not pdf_docs:
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing"):
                    raw_text = extract_text_from_pdfs(pdf_docs)
                    text_chunks = split_text_into_chunks(raw_text)
                    if not text_chunks:
                        # No chunks means there is nothing to embed or index.
                        st.error("No extractable text was found in the uploaded PDFs.")
                    else:
                        vectorstore = create_vector_store_from_text_chunks(text_chunks)
                        st.session_state.conversation = create_conversation_chain(vectorstore)
 
 
 
 
 
 
 
 
 
62
 
63
# Entry point: run the app only when this file is the executed script.
if __name__ == "__main__":
    main()