DebabrataHalder committed on
Commit
7b38ee1
·
verified ·
1 Parent(s): 0020cf6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +204 -17
app.py CHANGED
@@ -1,16 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
  import os
3
  import logging
4
  from dotenv import load_dotenv
5
  import streamlit as st
6
  from PyPDF2 import PdfReader
 
 
7
  from langchain.text_splitter import CharacterTextSplitter
8
- # from langchain.embeddings import HuggingFaceInstructEmbeddings
9
  from langchain_cohere import CohereEmbeddings
10
  from langchain.vectorstores import FAISS
11
  from langchain.memory import ConversationBufferMemory
12
  from langchain.chains import ConversationalRetrievalChain
13
- # from langchain.llms import Ollama
14
  from langchain_groq import ChatGroq
15
 
16
  # Load environment variables
@@ -31,6 +184,22 @@ def get_pdf_text(pdf_docs):
31
  text += page.extract_text()
32
  return text
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # Function to split the extracted text into chunks
35
  def get_text_chunks(text):
36
  text_splitter = CharacterTextSplitter(
@@ -42,12 +211,6 @@ def get_text_chunks(text):
42
  chunks = text_splitter.split_text(text)
43
  return chunks
44
 
45
- # Function to create a FAISS vectorstore
46
- # def get_vectorstore(text_chunks):
47
- # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
48
- # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
49
- # return vectorstore
50
-
51
  def get_vectorstore(text_chunks):
52
  cohere_api_key = os.getenv("COHERE_API_KEY")
53
  embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
@@ -57,7 +220,6 @@ def get_vectorstore(text_chunks):
57
  # Function to set up the conversational retrieval chain
58
  def get_conversation_chain(vectorstore):
59
  try:
60
- # llm = Ollama(model="llama3.2:1b")
61
  llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.5)
62
  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
63
 
@@ -90,29 +252,54 @@ def handle_userinput(user_question):
90
  # Main function to run the Streamlit app
91
  def main():
92
  load_dotenv()
93
- st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
94
 
95
  if "conversation" not in st.session_state:
96
  st.session_state.conversation = None
97
  if "chat_history" not in st.session_state:
98
  st.session_state.chat_history = None
99
 
100
- st.header("Chat with multiple PDFs :books:")
 
101
  user_question = st.text_input("Ask a question about your documents:")
 
102
  if user_question:
103
  handle_userinput(user_question)
104
 
105
  with st.sidebar:
106
  st.subheader("Your documents")
 
107
  pdf_docs = st.file_uploader(
108
- "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
 
 
 
 
 
 
 
 
109
  )
 
110
  if st.button("Process"):
111
  with st.spinner("Processing..."):
112
- raw_text = get_pdf_text(pdf_docs)
113
- text_chunks = get_text_chunks(raw_text)
114
- vectorstore = get_vectorstore(text_chunks)
115
- st.session_state.conversation = get_conversation_chain(vectorstore)
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  if __name__ == '__main__':
118
- main()
 
1
+
2
+ # import os
3
+ # import logging
4
+ # from dotenv import load_dotenv
5
+ # import streamlit as st
6
+ # from PyPDF2 import PdfReader
7
+ # from langchain.text_splitter import CharacterTextSplitter
8
+ # # from langchain.embeddings import HuggingFaceInstructEmbeddings
9
+ # from langchain_cohere import CohereEmbeddings
10
+ # from langchain.vectorstores import FAISS
11
+ # from langchain.memory import ConversationBufferMemory
12
+ # from langchain.chains import ConversationalRetrievalChain
13
+ # # from langchain.llms import Ollama
14
+ # from langchain_groq import ChatGroq
15
+
16
+ # # Load environment variables
17
+ # load_dotenv()
18
+
19
+ # # Set up logging
20
+ # logging.basicConfig(
21
+ # level=logging.INFO,
22
+ # format='%(asctime)s - %(levelname)s - %(message)s'
23
+ # )
24
+
25
+ # # Function to extract text from PDF files
26
+ # def get_pdf_text(pdf_docs):
27
+ # text = ""
28
+ # for pdf in pdf_docs:
29
+ # pdf_reader = PdfReader(pdf)
30
+ # for page in pdf_reader.pages:
31
+ # text += page.extract_text()
32
+ # return text
33
+
34
+ # # Function to split the extracted text into chunks
35
+ # def get_text_chunks(text):
36
+ # text_splitter = CharacterTextSplitter(
37
+ # separator="\n",
38
+ # chunk_size=1000,
39
+ # chunk_overlap=200,
40
+ # length_function=len
41
+ # )
42
+ # chunks = text_splitter.split_text(text)
43
+ # return chunks
44
+
45
+ # # Function to create a FAISS vectorstore
46
+ # # def get_vectorstore(text_chunks):
47
+ # # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
48
+ # # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
49
+ # # return vectorstore
50
+
51
+ # def get_vectorstore(text_chunks):
52
+ # cohere_api_key = os.getenv("COHERE_API_KEY")
53
+ # embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
54
+ # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
55
+ # return vectorstore
56
+
57
+ # # Function to set up the conversational retrieval chain
58
+ # def get_conversation_chain(vectorstore):
59
+ # try:
60
+ # # llm = Ollama(model="llama3.2:1b")
61
+ # llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.5)
62
+ # memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
63
+
64
+ # conversation_chain = ConversationalRetrievalChain.from_llm(
65
+ # llm=llm,
66
+ # retriever=vectorstore.as_retriever(),
67
+ # memory=memory
68
+ # )
69
+
70
+ # logging.info("Conversation chain created successfully.")
71
+ # return conversation_chain
72
+ # except Exception as e:
73
+ # logging.error(f"Error creating conversation chain: {e}")
74
+ # st.error("An error occurred while setting up the conversation chain.")
75
+
76
+ # # Handle user input
77
+ # def handle_userinput(user_question):
78
+ # if st.session_state.conversation is not None:
79
+ # response = st.session_state.conversation({'question': user_question})
80
+ # st.session_state.chat_history = response['chat_history']
81
+
82
+ # for i, message in enumerate(st.session_state.chat_history):
83
+ # if i % 2 == 0:
84
+ # st.write(f"*User:* {message.content}")
85
+ # else:
86
+ # st.write(f"*Bot:* {message.content}")
87
+ # else:
88
+ # st.warning("Please process the documents first.")
89
+
90
+ # # Main function to run the Streamlit app
91
+ # def main():
92
+ # load_dotenv()
93
+ # st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
94
+
95
+ # if "conversation" not in st.session_state:
96
+ # st.session_state.conversation = None
97
+ # if "chat_history" not in st.session_state:
98
+ # st.session_state.chat_history = None
99
+
100
+ # st.header("Chat with multiple PDFs :books:")
101
+ # user_question = st.text_input("Ask a question about your documents:")
102
+ # if user_question:
103
+ # handle_userinput(user_question)
104
+
105
+ # with st.sidebar:
106
+ # st.subheader("Your documents")
107
+ # pdf_docs = st.file_uploader(
108
+ # "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
109
+ # )
110
+ # if st.button("Process"):
111
+ # with st.spinner("Processing..."):
112
+ # raw_text = get_pdf_text(pdf_docs)
113
+ # text_chunks = get_text_chunks(raw_text)
114
+ # vectorstore = get_vectorstore(text_chunks)
115
+ # st.session_state.conversation = get_conversation_chain(vectorstore)
116
+
117
+ # if __name__ == '__main__':
118
+ # main()
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
 
155
  import os
156
  import logging
157
  from dotenv import load_dotenv
158
  import streamlit as st
159
  from PyPDF2 import PdfReader
160
+ from docx import Document # Import for handling Word files
161
+ import io # Import for handling byte streams
162
  from langchain.text_splitter import CharacterTextSplitter
 
163
  from langchain_cohere import CohereEmbeddings
164
  from langchain.vectorstores import FAISS
165
  from langchain.memory import ConversationBufferMemory
166
  from langchain.chains import ConversationalRetrievalChain
 
167
  from langchain_groq import ChatGroq
168
 
169
  # Load environment variables
 
184
  text += page.extract_text()
185
  return text
186
 
187
# Function to extract text from Word files
def get_word_text(word_docs):
    """Concatenate the paragraph text of every uploaded .docx file.

    Each uploaded file-like object is read fully into memory, parsed with
    python-docx, and its paragraphs are appended each followed by a newline.
    """
    parts = []
    for upload in word_docs:
        # python-docx needs a seekable binary stream, so wrap the raw bytes.
        parsed = Document(io.BytesIO(upload.read()))
        for paragraph in parsed.paragraphs:
            parts.append(paragraph.text + "\n")
    return "".join(parts)
195
+
196
# Function to extract text from TXT files
def get_txt_text(txt_docs):
    """Concatenate the contents of every uploaded plain-text file.

    Args:
        txt_docs: iterable of file-like objects whose ``read()`` returns bytes
            (Streamlit ``UploadedFile`` objects in this app).

    Returns:
        str: all file contents joined, each followed by a newline; empty
        string when no files are given.

    Bytes that are not valid UTF-8 are replaced with U+FFFD instead of
    raising ``UnicodeDecodeError``, which previously crashed the app on
    e.g. Latin-1 encoded uploads.
    """
    text = ""
    for txt in txt_docs:
        # errors="replace" keeps processing going on mixed/unknown encodings.
        text += txt.read().decode("utf-8", errors="replace") + "\n"
    return text
202
+
203
  # Function to split the extracted text into chunks
204
  def get_text_chunks(text):
205
  text_splitter = CharacterTextSplitter(
 
211
  chunks = text_splitter.split_text(text)
212
  return chunks
213
 
 
 
 
 
 
 
214
  def get_vectorstore(text_chunks):
215
  cohere_api_key = os.getenv("COHERE_API_KEY")
216
  embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
 
220
  # Function to set up the conversational retrieval chain
221
  def get_conversation_chain(vectorstore):
222
  try:
 
223
  llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.5)
224
  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
225
 
 
252
# Main function to run the Streamlit app
def main():
    """Streamlit entry point: wires up the page, uploaders, and QA chain.

    Reads/writes ``st.session_state.conversation`` and
    ``st.session_state.chat_history`` (the latter is consumed by
    ``handle_userinput``, defined elsewhere in this file).
    """
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple documents", page_icon=":books:")

    # Session state survives Streamlit reruns; initialise keys on first load only.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple documents :books:")

    user_question = st.text_input("Ask a question about your documents:")

    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")

        # One uploader per supported format; `type=` filters the file picker.
        pdf_docs = st.file_uploader(
            "Upload your PDFs here", accept_multiple_files=True, type=["pdf"]
        )

        word_docs = st.file_uploader(
            "Upload your Word documents here", accept_multiple_files=True, type=["docx"]
        )

        txt_docs = st.file_uploader(
            "Upload your TXT files here", accept_multiple_files=True, type=["txt"]
        )

        if st.button("Process"):
            with st.spinner("Processing..."):
                # Pool the text of every uploaded document into one string
                # before chunking, so one vectorstore covers all formats.
                raw_text = ""

                if pdf_docs:
                    raw_text += get_pdf_text(pdf_docs)

                if word_docs:
                    raw_text += get_word_text(word_docs)

                if txt_docs:
                    raw_text += get_txt_text(txt_docs)

                if raw_text:  # Only process if there is any raw text extracted.
                    text_chunks = get_text_chunks(raw_text)
                    vectorstore = get_vectorstore(text_chunks)
                    st.session_state.conversation = get_conversation_chain(vectorstore)
                else:
                    st.warning("No documents were uploaded or processed.")

if __name__ == '__main__':
    main()