adibak committed on
Commit
9c242be
·
1 Parent(s): 1e2c128

chat uploads, make slider work

Browse files
Files changed (2) hide show
  1. app.py +32 -10
  2. helpers/file_manager.py +15 -6
app.py CHANGED
@@ -13,6 +13,7 @@ import httpx
13
  import huggingface_hub
14
  import json5
15
  import ollama
 
16
  import requests
17
  import streamlit as st
18
  from dotenv import load_dotenv
@@ -260,6 +261,9 @@ def set_up_chat_ui():
260
  Prepare the chat interface and related functionality.
261
  """
262
  print(f"slider={st.session_state["page_range_slider"][0], st.session_state["page_range_slider"][1]}")
 
 
 
263
  with st.expander('Usage Instructions'):
264
  st.markdown(GlobalConfig.CHAT_USAGE_INSTRUCTIONS)
265
 
@@ -287,19 +291,37 @@ def set_up_chat_ui():
287
  prompt_text = prompt.text or ''
288
  if prompt['files']:
289
  uploaded_pdf = prompt['files'][0]
290
- # pdf_length = filem.get_pdf_length(uploaded_pdf)
291
- # valid_pdf_length = min(50, pdf_length)
292
-
293
- # st.session_state["page_range_slider"] = list(st.session_state["page_range_slider"])
294
- # st.session_state["page_range_slider"][1] = valid_pdf_length
295
- # print(f"length={pdf_length}, validated={valid_pdf_length}={st.session_state["page_range_slider"][-1]}")
296
-
297
- # print(f"fname={uploaded_pdf.name}")
298
  # Apparently, Streamlit stores uploaded files in memory and clears on browser close
299
  # https://docs.streamlit.io/knowledge-base/using-streamlit/where-file-uploader-store-when-deleted
 
 
 
 
 
 
 
 
 
 
300
  st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(uploaded_pdf,
301
- st.session_state["page_range_slider"])
302
- print(f"extracting={st.session_state["page_range_slider"]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
  provider, llm_name = llm_helper.get_provider_model(
305
  llm_provider_to_use,
 
13
  import huggingface_hub
14
  import json5
15
  import ollama
16
+ from pypdf import PdfReader
17
  import requests
18
  import streamlit as st
19
  from dotenv import load_dotenv
 
261
  Prepare the chat interface and related functionality.
262
  """
263
  print(f"slider={st.session_state["page_range_slider"][0], st.session_state["page_range_slider"][1]}")
264
+ st.session_state["start_page"] = st.session_state["page_range_slider"][0]
265
+ st.session_state["end_page"] = st.session_state["page_range_slider"][1]
266
+
267
  with st.expander('Usage Instructions'):
268
  st.markdown(GlobalConfig.CHAT_USAGE_INSTRUCTIONS)
269
 
 
291
  prompt_text = prompt.text or ''
292
  if prompt['files']:
293
  uploaded_pdf = prompt['files'][0]
294
+ st.session_state["pdf_file"] = uploaded_pdf
 
 
 
 
 
 
 
295
  # Apparently, Streamlit stores uploaded files in memory and clears on browser close
296
  # https://docs.streamlit.io/knowledge-base/using-streamlit/where-file-uploader-store-when-deleted
297
+
298
+ # get validated page range
299
+ st.session_state["start_page"], st.session_state["end_page"] = filem.validate_page_range(uploaded_pdf,
300
+ st.session_state["start_page"],
301
+ st.session_state["end_page"])
302
+ # update sidebar text
303
+ with st.sidebar:
304
+ st.text(f"Extracting pages {st.session_state["start_page"]} to {st.session_state["end_page"]} in {uploaded_pdf.name}")
305
+
306
+ # get pdf contents
307
  st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(uploaded_pdf,
308
+ (st.session_state["start_page"],
309
+ st.session_state["end_page"]))
310
+ else:
311
+ # if we're using the same file (nothing new uploaded)
312
+ if "start_page" in st.session_state and "end_page" in st.session_state and "pdf_file" in st.session_state:
313
+ # validate the page range
314
+ st.session_state["start_page"], st.session_state["end_page"] = filem.validate_page_range(st.session_state["pdf_file"],
315
+ st.session_state["start_page"],
316
+ st.session_state["end_page"])
317
+ # update sidebar text
318
+ with st.sidebar:
319
+ st.text(f"Extracting pages {st.session_state["start_page"]} to {st.session_state["end_page"]} in {st.session_state["pdf_file"].name}")
320
+
321
+ # get contents
322
+ st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(st.session_state["pdf_file"],
323
+ (st.session_state["start_page"], st.session_state["end_page"]))
324
+
325
 
326
  provider, llm_name = llm_helper.get_provider_model(
327
  llm_provider_to_use,
helpers/file_manager.py CHANGED
@@ -32,13 +32,9 @@ def get_pdf_contents(
32
  """
33
 
34
  reader = PdfReader(pdf_file)
35
- n_pages = len(reader.pages)
36
 
37
  start, end = page_range # set start and end per the range (user-specified values)
38
- start = max(1, start)
39
- end = min(n_pages, end)
40
- if start >= end:
41
- start = 1
42
  print(f"starting at {start}, ending {end}")
43
 
44
  text = ''
@@ -46,4 +42,17 @@ def get_pdf_contents(
46
  page = reader.pages[page_num]
47
  text += page.extract_text()
48
 
49
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  """
33
 
34
  reader = PdfReader(pdf_file)
 
35
 
36
  start, end = page_range # set start and end per the range (user-specified values)
37
+
 
 
 
38
  print(f"starting at {start}, ending {end}")
39
 
40
  text = ''
 
42
  page = reader.pages[page_num]
43
  text += page.extract_text()
44
 
45
+ return text
46
+
47
def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
                        start:int, end:int) -> tuple[int, int]:
    """
    Clamp a requested page range to the pages actually present in a PDF file.

    :param pdf_file: The uploaded PDF file whose page count bounds the range.
    :param start: The requested first page; values below 1 are raised to 1.
    :param end: The requested last page; values above the page count are lowered to it.
    :return: The adjusted `(start, end)` pair. When the adjusted start is not strictly
     less than the adjusted end, the start falls back to 1.
    """

    page_count = len(PdfReader(pdf_file).pages)

    # Clamp each bound independently before comparing them
    first_page = start if start > 1 else 1
    last_page = end if end < page_count else page_count

    # A start at or beyond the end is treated as invalid; restart from the first page
    if first_page >= last_page:
        first_page = 1

    return first_page, last_page