Spaces:
Running
Running
adibak
committed on
Commit
·
9c242be
1
Parent(s):
1e2c128
chat uploads, make slider work
Browse files- app.py +32 -10
- helpers/file_manager.py +15 -6
app.py
CHANGED
@@ -13,6 +13,7 @@ import httpx
|
|
13 |
import huggingface_hub
|
14 |
import json5
|
15 |
import ollama
|
|
|
16 |
import requests
|
17 |
import streamlit as st
|
18 |
from dotenv import load_dotenv
|
@@ -260,6 +261,9 @@ def set_up_chat_ui():
|
|
260 |
Prepare the chat interface and related functionality.
|
261 |
"""
|
262 |
print(f"slider={st.session_state["page_range_slider"][0], st.session_state["page_range_slider"][1]}")
|
|
|
|
|
|
|
263 |
with st.expander('Usage Instructions'):
|
264 |
st.markdown(GlobalConfig.CHAT_USAGE_INSTRUCTIONS)
|
265 |
|
@@ -287,19 +291,37 @@ def set_up_chat_ui():
|
|
287 |
prompt_text = prompt.text or ''
|
288 |
if prompt['files']:
|
289 |
uploaded_pdf = prompt['files'][0]
|
290 |
-
|
291 |
-
# valid_pdf_length = min(50, pdf_length)
|
292 |
-
|
293 |
-
# st.session_state["page_range_slider"] = list(st.session_state["page_range_slider"])
|
294 |
-
# st.session_state["page_range_slider"][1] = valid_pdf_length
|
295 |
-
# print(f"length={pdf_length}, validated={valid_pdf_length}={st.session_state["page_range_slider"][-1]}")
|
296 |
-
|
297 |
-
# print(f"fname={uploaded_pdf.name}")
|
298 |
# Apparently, Streamlit stores uploaded files in memory and clears on browser close
|
299 |
# https://docs.streamlit.io/knowledge-base/using-streamlit/where-file-uploader-store-when-deleted
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
300 |
st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(uploaded_pdf,
|
301 |
-
st.session_state["
|
302 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
303 |
|
304 |
provider, llm_name = llm_helper.get_provider_model(
|
305 |
llm_provider_to_use,
|
|
|
13 |
import huggingface_hub
|
14 |
import json5
|
15 |
import ollama
|
16 |
+
from pypdf import PdfReader
|
17 |
import requests
|
18 |
import streamlit as st
|
19 |
from dotenv import load_dotenv
|
|
|
261 |
Prepare the chat interface and related functionality.
|
262 |
"""
|
263 |
print(f"slider={st.session_state["page_range_slider"][0], st.session_state["page_range_slider"][1]}")
|
264 |
+
st.session_state["start_page"] = st.session_state["page_range_slider"][0]
|
265 |
+
st.session_state["end_page"] = st.session_state["page_range_slider"][1]
|
266 |
+
|
267 |
with st.expander('Usage Instructions'):
|
268 |
st.markdown(GlobalConfig.CHAT_USAGE_INSTRUCTIONS)
|
269 |
|
|
|
291 |
prompt_text = prompt.text or ''
|
292 |
if prompt['files']:
|
293 |
uploaded_pdf = prompt['files'][0]
|
294 |
+
st.session_state["pdf_file"] = uploaded_pdf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
# Apparently, Streamlit stores uploaded files in memory and clears on browser close
|
296 |
# https://docs.streamlit.io/knowledge-base/using-streamlit/where-file-uploader-store-when-deleted
|
297 |
+
|
298 |
+
# get validated page range
|
299 |
+
st.session_state["start_page"], st.session_state["end_page"] = filem.validate_page_range(uploaded_pdf,
|
300 |
+
st.session_state["start_page"],
|
301 |
+
st.session_state["end_page"])
|
302 |
+
# update sidebar text
|
303 |
+
with st.sidebar:
|
304 |
+
st.text(f"Extracting pages {st.session_state["start_page"]} to {st.session_state["end_page"]} in {uploaded_pdf.name}")
|
305 |
+
|
306 |
+
# get pdf contents
|
307 |
st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(uploaded_pdf,
|
308 |
+
(st.session_state["start_page"],
|
309 |
+
st.session_state["end_page"]))
|
310 |
+
else:
|
311 |
+
# if we're using the same file (nothing new uploaded)
|
312 |
+
if "start_page" in st.session_state and "end_page" in st.session_state and "pdf_file" in st.session_state:
|
313 |
+
# validate the page range
|
314 |
+
st.session_state["start_page"], st.session_state["end_page"] = filem.validate_page_range(st.session_state["pdf_file"],
|
315 |
+
st.session_state["start_page"],
|
316 |
+
st.session_state["end_page"])
|
317 |
+
# update sidebar text
|
318 |
+
with st.sidebar:
|
319 |
+
st.text(f"Extracting pages {st.session_state["start_page"]} to {st.session_state["end_page"]} in {st.session_state["pdf_file"].name}")
|
320 |
+
|
321 |
+
# get contents
|
322 |
+
st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(st.session_state["pdf_file"],
|
323 |
+
(st.session_state["start_page"], st.session_state["end_page"]))
|
324 |
+
|
325 |
|
326 |
provider, llm_name = llm_helper.get_provider_model(
|
327 |
llm_provider_to_use,
|
helpers/file_manager.py
CHANGED
@@ -32,13 +32,9 @@ def get_pdf_contents(
|
|
32 |
"""
|
33 |
|
34 |
reader = PdfReader(pdf_file)
|
35 |
-
n_pages = len(reader.pages)
|
36 |
|
37 |
start, end = page_range # set start and end per the range (user-specified values)
|
38 |
-
|
39 |
-
end = min(n_pages, end)
|
40 |
-
if start >= end:
|
41 |
-
start = 1
|
42 |
print(f"starting at {start}, ending {end}")
|
43 |
|
44 |
text = ''
|
@@ -46,4 +42,17 @@ def get_pdf_contents(
|
|
46 |
page = reader.pages[page_num]
|
47 |
text += page.extract_text()
|
48 |
|
49 |
-
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
"""
|
33 |
|
34 |
reader = PdfReader(pdf_file)
|
|
|
35 |
|
36 |
start, end = page_range # set start and end per the range (user-specified values)
|
37 |
+
|
|
|
|
|
|
|
38 |
print(f"starting at {start}, ending {end}")
|
39 |
|
40 |
text = ''
|
|
|
42 |
page = reader.pages[page_num]
|
43 |
text += page.extract_text()
|
44 |
|
45 |
+
return text
|
46 |
+
|
47 |
+
def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
                        start: int, end: int) -> tuple[int, int]:
    """
    Clamp a requested page range to the pages actually present in a PDF.

    The PDF is opened with ``PdfReader`` only to count its pages. The start
    is raised to at least 1 and the end is lowered to at most the page count.
    If the resulting start lies beyond the end, the start falls back to 1.

    NOTE(review): page numbers are presumably 1-based and inclusive (the UI
    reports "Extracting pages <start> to <end>") -- confirm against the
    extraction loop in ``get_pdf_contents``.

    :param pdf_file: The uploaded PDF file.
    :param start: The requested first page.
    :param end: The requested last page.
    :return: The validated ``(start, end)`` pair.
    """

    n_pages = len(PdfReader(pdf_file).pages)

    # Never start before the first page, never end past the last one.
    start = max(1, start)
    end = min(n_pages, end)

    # Only an inverted range is invalid. `start == end` is a legitimate
    # single-page selection and must not be widened to `1..end`.
    if start > end:
        start = 1

    return (start, end)
|