samim2024 committed on
Commit
1676c9d
·
verified ·
1 Parent(s): 65ccc47

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -8
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  # app.py
2
  import streamlit as st
3
  import os
@@ -56,10 +57,9 @@ with st.sidebar:
56
  input_data = st.file_uploader("Upload a PDF, TXT, XLS/XLSX, or DOC/DOCX file", type=["pdf", "txt", "xls", "xlsx", "doc", "docx"])
57
 
58
  if st.button("Process File") and input_data is not None:
59
- with st.spinner("Processing file..."):
60
- vector_store = process_input(input_data)
61
- st.session_state.vectorstore = vector_store
62
- st.success("File processed successfully. You can now ask questions.")
63
 
64
  # Display chat history
65
  st.subheader("Chat History")
@@ -136,9 +136,17 @@ def process_input(input_data):
136
  # Create uploads directory
137
  os.makedirs("uploads", exist_ok=True)
138
 
 
 
 
 
139
  documents = ""
140
  file_name = input_data.name.lower()
141
 
 
 
 
 
142
  if file_name.endswith(".pdf"):
143
  pdf_reader = PdfReader(input_data)
144
  for page in pdf_reader.pages:
@@ -147,24 +155,32 @@ def process_input(input_data):
147
  documents = input_data.read().decode("utf-8")
148
  elif file_name.endswith((".xls", ".xlsx")):
149
  df = pd.read_excel(input_data)
150
- # Convert all cells to strings and join
151
  documents = " ".join(df.astype(str).values.flatten())
152
  elif file_name.endswith((".doc", ".docx")):
153
  doc = Document(input_data)
154
  for para in doc.paragraphs:
155
  documents += para.text + "\n"
156
 
157
- # Split text
 
 
 
158
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
159
  texts = text_splitter.split_text(documents)
160
 
161
- # Create embeddings
 
 
 
162
  hf_embeddings = HuggingFaceEmbeddings(
163
  model_name="sentence-transformers/all-mpnet-base-v2",
164
  model_kwargs={'device': 'cpu'}
165
  )
166
 
167
- # Initialize FAISS
 
 
 
168
  dimension = len(hf_embeddings.embed_query("sample text"))
169
  index = faiss.IndexFlatL2(dimension)
170
  vector_store = FAISS(
@@ -181,6 +197,10 @@ def process_input(input_data):
181
  # Save vector store locally
182
  vector_store.save_local("vectorstore/faiss_index")
183
 
 
 
 
 
184
  return vector_store
185
 
186
  def answer_question(vectorstore, query):
 
1
+
2
  # app.py
3
  import streamlit as st
4
  import os
 
57
  input_data = st.file_uploader("Upload a PDF, TXT, XLS/XLSX, or DOC/DOCX file", type=["pdf", "txt", "xls", "xlsx", "doc", "docx"])
58
 
59
  if st.button("Process File") and input_data is not None:
60
+ vector_store = process_input(input_data)
61
+ st.session_state.vectorstore = vector_store
62
+ st.success("File processed successfully. You can now ask questions.")
 
63
 
64
  # Display chat history
65
  st.subheader("Chat History")
 
136
  # Create uploads directory
137
  os.makedirs("uploads", exist_ok=True)
138
 
139
+ # Initialize progress bar and status
140
+ progress_bar = st.progress(0)
141
+ status = st.status("Processing file...", expanded=True)
142
+
143
  documents = ""
144
  file_name = input_data.name.lower()
145
 
146
+ # Step 1: Read file
147
+ status.update(label="Reading file...")
148
+ progress_bar.progress(0.25)
149
+
150
  if file_name.endswith(".pdf"):
151
  pdf_reader = PdfReader(input_data)
152
  for page in pdf_reader.pages:
 
155
  documents = input_data.read().decode("utf-8")
156
  elif file_name.endswith((".xls", ".xlsx")):
157
  df = pd.read_excel(input_data)
 
158
  documents = " ".join(df.astype(str).values.flatten())
159
  elif file_name.endswith((".doc", ".docx")):
160
  doc = Document(input_data)
161
  for para in doc.paragraphs:
162
  documents += para.text + "\n"
163
 
164
+ # Step 2: Split text
165
+ status.update(label="Splitting text into chunks...")
166
+ progress_bar.progress(0.50)
167
+
168
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
169
  texts = text_splitter.split_text(documents)
170
 
171
+ # Step 3: Create embeddings
172
+ status.update(label="Creating embeddings...")
173
+ progress_bar.progress(0.75)
174
+
175
  hf_embeddings = HuggingFaceEmbeddings(
176
  model_name="sentence-transformers/all-mpnet-base-v2",
177
  model_kwargs={'device': 'cpu'}
178
  )
179
 
180
+ # Step 4: Initialize FAISS vector store
181
+ status.update(label="Building vector store...")
182
+ progress_bar.progress(0.90)
183
+
184
  dimension = len(hf_embeddings.embed_query("sample text"))
185
  index = faiss.IndexFlatL2(dimension)
186
  vector_store = FAISS(
 
197
  # Save vector store locally
198
  vector_store.save_local("vectorstore/faiss_index")
199
 
200
+ # Complete processing
201
+ status.update(label="Processing complete!", state="complete")
202
+ progress_bar.progress(1.0)
203
+
204
  return vector_store
205
 
206
  def answer_question(vectorstore, query):