Update app.py
app.py CHANGED
@@ -1,3 +1,4 @@
+
 # app.py
 import streamlit as st
 import os
@@ -56,10 +57,9 @@ with st.sidebar:
     input_data = st.file_uploader("Upload a PDF, TXT, XLS/XLSX, or DOC/DOCX file", type=["pdf", "txt", "xls", "xlsx", "doc", "docx"])
 
     if st.button("Process File") and input_data is not None:
-
-
-
-        st.success("File processed successfully. You can now ask questions.")
+        vector_store = process_input(input_data)
+        st.session_state.vectorstore = vector_store
+        st.success("File processed successfully. You can now ask questions.")
 
 # Display chat history
 st.subheader("Chat History")
@@ -136,9 +136,17 @@ def process_input(input_data):
     # Create uploads directory
     os.makedirs("uploads", exist_ok=True)
 
+    # Initialize progress bar and status
+    progress_bar = st.progress(0)
+    status = st.status("Processing file...", expanded=True)
+
     documents = ""
     file_name = input_data.name.lower()
 
+    # Step 1: Read file
+    status.update(label="Reading file...")
+    progress_bar.progress(0.25)
+
     if file_name.endswith(".pdf"):
         pdf_reader = PdfReader(input_data)
         for page in pdf_reader.pages:
@@ -147,24 +155,32 @@ def process_input(input_data):
         documents = input_data.read().decode("utf-8")
     elif file_name.endswith((".xls", ".xlsx")):
         df = pd.read_excel(input_data)
-        # Convert all cells to strings and join
         documents = " ".join(df.astype(str).values.flatten())
     elif file_name.endswith((".doc", ".docx")):
         doc = Document(input_data)
         for para in doc.paragraphs:
             documents += para.text + "\n"
 
-    # Split text
+    # Step 2: Split text
+    status.update(label="Splitting text into chunks...")
+    progress_bar.progress(0.50)
+
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     texts = text_splitter.split_text(documents)
 
-    # Create embeddings
+    # Step 3: Create embeddings
+    status.update(label="Creating embeddings...")
+    progress_bar.progress(0.75)
+
     hf_embeddings = HuggingFaceEmbeddings(
         model_name="sentence-transformers/all-mpnet-base-v2",
         model_kwargs={'device': 'cpu'}
     )
 
-    # Initialize FAISS
+    # Step 4: Initialize FAISS vector store
+    status.update(label="Building vector store...")
+    progress_bar.progress(0.90)
+
     dimension = len(hf_embeddings.embed_query("sample text"))
     index = faiss.IndexFlatL2(dimension)
     vector_store = FAISS(
@@ -181,6 +197,10 @@ def process_input(input_data):
     # Save vector store locally
     vector_store.save_local("vectorstore/faiss_index")
 
+    # Complete processing
+    status.update(label="Processing complete!", state="complete")
+    progress_bar.progress(1.0)
+
     return vector_store
 
 def answer_question(vectorstore, query):
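
A note on the new progress reporting: st.progress returns a bar that is updated in place, and st.status returns a container whose label and state can be updated as the work advances. The sketch below replays the same four steps outside of process_input as a minimal, self-contained Streamlit app; the step labels are copied from the diff, while the script name and the time.sleep calls are placeholders for the real work and are not part of app.py.

# progress_sketch.py - standalone illustration of the st.status / st.progress pattern used above
import time
import streamlit as st

progress_bar = st.progress(0)                             # bar starts at 0%
status = st.status("Processing file...", expanded=True)   # collapsible status box

steps = [
    ("Reading file...", 0.25),
    ("Splitting text into chunks...", 0.50),
    ("Creating embeddings...", 0.75),
    ("Building vector store...", 0.90),
]

for label, fraction in steps:
    status.update(label=label)          # change the visible step label
    progress_bar.progress(fraction)     # advance the bar (accepts 0.0-1.0 or 0-100)
    time.sleep(0.5)                     # placeholder for the real work

status.update(label="Processing complete!", state="complete")  # mark the status box as done
progress_bar.progress(1.0)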
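
Storing the result in st.session_state.vectorstore is what lets the index survive Streamlit's rerun-on-interaction model, so follow-up questions do not reprocess the uploaded file. The query path is not part of this diff, so the fragment below is only a plausible shape for it inside app.py; the st.chat_input prompt and the warning text are assumptions, not code from the commit.

# Hypothetical query path (not shown in this diff): reuse the cached vector store across reruns
query = st.chat_input("Ask a question about the uploaded file")
if query:
    if "vectorstore" not in st.session_state:
        # No file has been processed in this session yet
        st.warning("Please upload and process a file first.")
    else:
        answer = answer_question(st.session_state.vectorstore, query)
        st.write(answer)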
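
For context on the vector_store = FAISS( call that the fourth hunk cuts off: the arguments actually passed in app.py fall outside the hunk, so the sketch below only shows the standard LangChain pattern for wrapping a raw faiss.IndexFlatL2 index by hand. The import paths, the InMemoryDocstore and index_to_docstore_id arguments, and the add_texts call are assumptions here, not taken from the file.

# Assumed shape of the FAISS construction (illustrative only)
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.embeddings import HuggingFaceEmbeddings   # import path assumed
from langchain_community.vectorstores import FAISS

texts = ["example chunk one", "example chunk two"]   # in app.py these come from the text splitter

hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': 'cpu'},
)

# Probe the embedding size once, then build an L2 index of that dimension
dimension = len(hf_embeddings.embed_query("sample text"))
index = faiss.IndexFlatL2(dimension)

vector_store = FAISS(
    embedding_function=hf_embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
vector_store.add_texts(texts)                        # embed and insert the chunks
vector_store.save_local("vectorstore/faiss_index")   # same save path as in the diff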