DebabrataHalder committed on
Commit
6090e99
·
verified ·
1 Parent(s): 7b38ee1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -203
app.py CHANGED
@@ -1,169 +1,16 @@
1
-
2
- # import os
3
- # import logging
4
- # from dotenv import load_dotenv
5
- # import streamlit as st
6
- # from PyPDF2 import PdfReader
7
- # from langchain.text_splitter import CharacterTextSplitter
8
- # # from langchain.embeddings import HuggingFaceInstructEmbeddings
9
- # from langchain_cohere import CohereEmbeddings
10
- # from langchain.vectorstores import FAISS
11
- # from langchain.memory import ConversationBufferMemory
12
- # from langchain.chains import ConversationalRetrievalChain
13
- # # from langchain.llms import Ollama
14
- # from langchain_groq import ChatGroq
15
-
16
- # # Load environment variables
17
- # load_dotenv()
18
-
19
- # # Set up logging
20
- # logging.basicConfig(
21
- # level=logging.INFO,
22
- # format='%(asctime)s - %(levelname)s - %(message)s'
23
- # )
24
-
25
- # # Function to extract text from PDF files
26
- # def get_pdf_text(pdf_docs):
27
- # text = ""
28
- # for pdf in pdf_docs:
29
- # pdf_reader = PdfReader(pdf)
30
- # for page in pdf_reader.pages:
31
- # text += page.extract_text()
32
- # return text
33
-
34
- # # Function to split the extracted text into chunks
35
- # def get_text_chunks(text):
36
- # text_splitter = CharacterTextSplitter(
37
- # separator="\n",
38
- # chunk_size=1000,
39
- # chunk_overlap=200,
40
- # length_function=len
41
- # )
42
- # chunks = text_splitter.split_text(text)
43
- # return chunks
44
-
45
- # # Function to create a FAISS vectorstore
46
- # # def get_vectorstore(text_chunks):
47
- # # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
48
- # # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
49
- # # return vectorstore
50
-
51
- # def get_vectorstore(text_chunks):
52
- # cohere_api_key = os.getenv("COHERE_API_KEY")
53
- # embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
54
- # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
55
- # return vectorstore
56
-
57
- # # Function to set up the conversational retrieval chain
58
- # def get_conversation_chain(vectorstore):
59
- # try:
60
- # # llm = Ollama(model="llama3.2:1b")
61
- # llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.5)
62
- # memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
63
-
64
- # conversation_chain = ConversationalRetrievalChain.from_llm(
65
- # llm=llm,
66
- # retriever=vectorstore.as_retriever(),
67
- # memory=memory
68
- # )
69
-
70
- # logging.info("Conversation chain created successfully.")
71
- # return conversation_chain
72
- # except Exception as e:
73
- # logging.error(f"Error creating conversation chain: {e}")
74
- # st.error("An error occurred while setting up the conversation chain.")
75
-
76
- # # Handle user input
77
- # def handle_userinput(user_question):
78
- # if st.session_state.conversation is not None:
79
- # response = st.session_state.conversation({'question': user_question})
80
- # st.session_state.chat_history = response['chat_history']
81
-
82
- # for i, message in enumerate(st.session_state.chat_history):
83
- # if i % 2 == 0:
84
- # st.write(f"*User:* {message.content}")
85
- # else:
86
- # st.write(f"*Bot:* {message.content}")
87
- # else:
88
- # st.warning("Please process the documents first.")
89
-
90
- # # Main function to run the Streamlit app
91
- # def main():
92
- # load_dotenv()
93
- # st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
94
-
95
- # if "conversation" not in st.session_state:
96
- # st.session_state.conversation = None
97
- # if "chat_history" not in st.session_state:
98
- # st.session_state.chat_history = None
99
-
100
- # st.header("Chat with multiple PDFs :books:")
101
- # user_question = st.text_input("Ask a question about your documents:")
102
- # if user_question:
103
- # handle_userinput(user_question)
104
-
105
- # with st.sidebar:
106
- # st.subheader("Your documents")
107
- # pdf_docs = st.file_uploader(
108
- # "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
109
- # )
110
- # if st.button("Process"):
111
- # with st.spinner("Processing..."):
112
- # raw_text = get_pdf_text(pdf_docs)
113
- # text_chunks = get_text_chunks(raw_text)
114
- # vectorstore = get_vectorstore(text_chunks)
115
- # st.session_state.conversation = get_conversation_chain(vectorstore)
116
-
117
- # if __name__ == '__main__':
118
- # main()
119
-
120
-
121
-
122
-
123
-
124
-
125
-
126
-
127
-
128
-
129
-
130
-
131
-
132
-
133
-
134
-
135
-
136
-
137
-
138
-
139
-
140
-
141
-
142
-
143
-
144
-
145
-
146
-
147
-
148
-
149
-
150
-
151
-
152
-
153
-
154
 
155
  import os
156
  import logging
157
  from dotenv import load_dotenv
158
  import streamlit as st
159
  from PyPDF2 import PdfReader
160
- from docx import Document # Import for handling Word files
161
- import io # Import for handling byte streams
162
  from langchain.text_splitter import CharacterTextSplitter
 
163
  from langchain_cohere import CohereEmbeddings
164
  from langchain.vectorstores import FAISS
165
  from langchain.memory import ConversationBufferMemory
166
  from langchain.chains import ConversationalRetrievalChain
 
167
  from langchain_groq import ChatGroq
168
 
169
  # Load environment variables
@@ -184,22 +31,6 @@ def get_pdf_text(pdf_docs):
184
  text += page.extract_text()
185
  return text
186
 
187
- # Function to extract text from Word files
188
- def get_word_text(word_docs):
189
- text = ""
190
- for word in word_docs:
191
- doc = Document(io.BytesIO(word.read())) # Read the Word document from bytes
192
- for para in doc.paragraphs:
193
- text += para.text + "\n" # Append each paragraph followed by a newline
194
- return text
195
-
196
- # Function to extract text from TXT files
197
- def get_txt_text(txt_docs):
198
- text = ""
199
- for txt in txt_docs:
200
- text += txt.read().decode("utf-8") + "\n" # Read and decode the text file content
201
- return text
202
-
203
# Split the extracted raw text into overlapping chunks for embedding.
def get_text_chunks(text):
    """Return a list of newline-separated chunks (1000 chars, 200 overlap)."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
213
 
 
 
 
 
 
 
214
def get_vectorstore(text_chunks):
    """Embed text chunks with Cohere and index them in an in-memory FAISS store.

    Args:
        text_chunks: list of strings produced by get_text_chunks().

    Returns:
        A FAISS vectorstore over the embedded chunks.

    Raises:
        ValueError: if the COHERE_API_KEY environment variable is not set.
    """
    cohere_api_key = os.getenv("COHERE_API_KEY")
    if not cohere_api_key:
        # Fail fast with a clear message instead of a cryptic downstream
        # authentication error from the Cohere client.
        raise ValueError("COHERE_API_KEY environment variable is not set.")
    embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore
@@ -220,6 +57,7 @@ def get_vectorstore(text_chunks):
220
# Function to set up the conversational retrieval chain
def get_conversation_chain(vectorstore):
    """Build a ConversationalRetrievalChain over *vectorstore*.

    Uses a Groq-hosted LLM plus a buffer memory keyed as 'chat_history'
    (the key ConversationalRetrievalChain expects by default).

    Returns:
        The configured chain, or None if construction failed (the error is
        logged and surfaced to the UI via st.error).
    """
    try:
        llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.5)
        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

        conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            memory=memory
        )

        logging.info("Conversation chain created successfully.")
        return conversation_chain
    except Exception as e:
        logging.error(f"Error creating conversation chain: {e}")
        st.error("An error occurred while setting up the conversation chain.")
        # Explicit None so callers can test st.session_state.conversation.
        return None
@@ -252,54 +90,56 @@ def handle_userinput(user_question):
252
# Main function to run the Streamlit app
def main():
    """Wire up session state, the chat UI, and the PDF ingestion pipeline."""
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")

    # Initialize session state on first run so later reads never KeyError.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
        )
        if st.button("Process"):
            if not pdf_docs:
                # Guard: running the pipeline on zero documents would build
                # an empty vectorstore and crash downstream.
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    vectorstore = get_vectorstore(text_chunks)
                    st.session_state.conversation = get_conversation_chain(vectorstore)


if __name__ == '__main__':
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
  import os
3
  import logging
4
  from dotenv import load_dotenv
5
  import streamlit as st
6
  from PyPDF2 import PdfReader
 
 
7
  from langchain.text_splitter import CharacterTextSplitter
8
+ # from langchain.embeddings import HuggingFaceInstructEmbeddings
9
  from langchain_cohere import CohereEmbeddings
10
  from langchain.vectorstores import FAISS
11
  from langchain.memory import ConversationBufferMemory
12
  from langchain.chains import ConversationalRetrievalChain
13
+ # from langchain.llms import Ollama
14
  from langchain_groq import ChatGroq
15
 
16
  # Load environment variables
 
31
  text += page.extract_text()
32
  return text
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # Function to split the extracted text into chunks
35
  def get_text_chunks(text):
36
  text_splitter = CharacterTextSplitter(
 
42
  chunks = text_splitter.split_text(text)
43
  return chunks
44
 
45
+ # Function to create a FAISS vectorstore
46
+ # def get_vectorstore(text_chunks):
47
+ # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
48
+ # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
49
+ # return vectorstore
50
+
51
  def get_vectorstore(text_chunks):
52
  cohere_api_key = os.getenv("COHERE_API_KEY")
53
  embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
 
57
  # Function to set up the conversational retrieval chain
58
  def get_conversation_chain(vectorstore):
59
  try:
60
+ # llm = Ollama(model="llama3.2:1b")
61
  llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.5)
62
  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
63
 
 
90
  # Main function to run the Streamlit app
91
  def main():
92
  load_dotenv()
93
+ st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
94
 
95
  if "conversation" not in st.session_state:
96
  st.session_state.conversation = None
97
  if "chat_history" not in st.session_state:
98
  st.session_state.chat_history = None
99
 
100
+ st.header("Chat with multiple PDFs :books:")
 
101
  user_question = st.text_input("Ask a question about your documents:")
 
102
  if user_question:
103
  handle_userinput(user_question)
104
 
105
  with st.sidebar:
106
  st.subheader("Your documents")
 
107
  pdf_docs = st.file_uploader(
108
+ "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
 
 
 
 
 
 
 
 
109
  )
 
110
  if st.button("Process"):
111
  with st.spinner("Processing..."):
112
+ raw_text = get_pdf_text(pdf_docs)
113
+ text_chunks = get_text_chunks(raw_text)
114
+ vectorstore = get_vectorstore(text_chunks)
115
+ st.session_state.conversation = get_conversation_chain(vectorstore)
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  if __name__ == '__main__':
118
  main()
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+