Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +36 -8
src/streamlit_app.py
CHANGED
@@ -4,7 +4,9 @@ import openai
|
|
4 |
import os
|
5 |
import time
|
6 |
#from roles import *
|
7 |
-
|
|
|
|
|
8 |
import tempfile
|
9 |
from RAG import load_graph,text_splitter
|
10 |
import torch
|
@@ -180,18 +182,44 @@ uploaded_file = st.sidebar.file_uploader(
|
|
180 |
label_visibility="collapsed"
|
181 |
)
|
182 |
upload_button=st.sidebar.button("Upload Document")
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
st.session_state.file_text = file_text
|
190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
st.session_state.embeddings = embeddings
|
192 |
st.session_state.chunks = chunks
|
193 |
st.session_state.doc_flag = True
|
194 |
|
|
|
|
|
195 |
st.sidebar.write("Before making the your faviorate charecter sound, authenicate your code")
|
196 |
Authenication=st.sidebar.button('Authenicate')
|
197 |
if Authenication:
|
|
|
4 |
import os
|
5 |
import time
|
6 |
#from roles import *
|
7 |
+
import io
|
8 |
+
from pypdf import PdfReader
|
9 |
+
#from langchain_community.document_loaders import PyPDFLoader
|
10 |
import tempfile
|
11 |
from RAG import load_graph,text_splitter
|
12 |
import torch
|
|
|
182 |
label_visibility="collapsed"
|
183 |
)
|
184 |
upload_button=st.sidebar.button("Upload Document")
|
185 |
+
# Sidebar widget that accepts a single PDF upload, registered under the
# widget key "pdf_uploader".
# NOTE(review): an earlier `uploaded_file = st.sidebar.file_uploader(...)` call
# appears just above in this file; this assignment rebinds the same name, so
# only this uploader's value is used below — confirm the earlier one is still
# wanted.
uploaded_file = st.sidebar.file_uploader("Upload your PDF", type=["pdf"], key="pdf_uploader")
|
190 |
+
|
191 |
+
def extract_pdf_text_from_bytes(file_bytes: bytes) -> str:
    """Extract the text of every page of an in-memory PDF.

    Args:
        file_bytes: Raw contents of a PDF file.

    Returns:
        The text of all pages joined with newlines. A page for which
        ``extract_text()`` yields nothing contributes an empty string, and
        empty input yields ``""``.
    """
    # A zero-byte upload has no pages to parse; treat it like a PDF with no
    # extractable text instead of handing an empty stream to the parser.
    if not file_bytes:
        return ""

    reader = PdfReader(io.BytesIO(file_bytes))
    # `or ""` guards against pages whose extraction returns None/empty.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
|
198 |
+
|
199 |
+
if uploaded_file is not None:
    import hashlib  # local import: only needed to fingerprint the upload

    # getvalue() returns the whole buffer regardless of the current read
    # position, so it is safe even if the file object was read before.
    file_bytes = uploaded_file.getvalue()
    upload_digest = hashlib.sha256(file_bytes).hexdigest()

    # Streamlit re-executes this script on every widget interaction. Without
    # this guard the PDF would be re-parsed and re-embedded on each rerun for
    # as long as a file stays attached to the uploader. Reprocess only when
    # the content changed or the document state was cleared elsewhere.
    needs_processing = (
        st.session_state.get("pdf_digest") != upload_digest
        or not st.session_state.get("doc_flag")
    )

    if needs_processing:
        with st.spinner("Reading & embedding your PDF..."):
            # Extract text purely in-memory (no /tmp files, no PyPDFLoader)
            file_text = extract_pdf_text_from_bytes(file_bytes)

            # Build embeddings (uses the existing text_splitter + encoder)
            chunks = text_splitter.split_text(file_text)
            embeddings = st.session_state.encoder.encode(
                chunks, convert_to_tensor=True, show_progress_bar=True
            ).cpu().numpy()

            # Persist to session state; the digest is stored last so a failure
            # above leaves the upload marked as unprocessed.
            st.session_state.file_text = file_text
            st.session_state.embeddings = embeddings
            st.session_state.chunks = chunks
            st.session_state.doc_flag = True
            st.session_state.pdf_digest = upload_digest

    st.success(f"Loaded: {uploaded_file.name} — {len(st.session_state.chunks)} chunks")
|
222 |
+
|
223 |
st.sidebar.write("Before making the your faviorate charecter sound, authenicate your code")
|
224 |
Authenication=st.sidebar.button('Authenicate')
|
225 |
if Authenication:
|