adibak committed on
Commit
90c828a
·
1 Parent(s): acafc9b

make changes per PR review

Browse files
Files changed (3) hide show
  1. app.py +23 -35
  2. global_config.py +1 -0
  3. helpers/file_manager.py +4 -8
app.py CHANGED
@@ -13,7 +13,6 @@ import httpx
13
  import huggingface_hub
14
  import json5
15
  import ollama
16
- from pypdf import PdfReader
17
  import requests
18
  import streamlit as st
19
  from dotenv import load_dotenv
@@ -224,9 +223,9 @@ with st.sidebar:
224
  )
225
 
226
  # make slider with initial values
227
- page_range_slider = st.slider("7: Specify a page range:",
228
- 1, 50, [1, 50])
229
- st.session_state["page_range_slider"] = page_range_slider
230
 
231
 
232
  def build_ui():
@@ -262,8 +261,8 @@ def set_up_chat_ui():
262
  Prepare the chat interface and related functionality.
263
  """
264
  # set start and end page
265
- st.session_state["start_page"] = st.session_state["page_range_slider"][0]
266
- st.session_state["end_page"] = st.session_state["page_range_slider"][1]
267
 
268
  with st.expander('Usage Instructions'):
269
  st.markdown(GlobalConfig.CHAT_USAGE_INSTRUCTIONS)
@@ -293,38 +292,27 @@ def set_up_chat_ui():
293
  if prompt['files']:
294
  # store uploaded pdf in session state
295
  uploaded_pdf = prompt['files'][0]
296
- st.session_state["pdf_file"] = uploaded_pdf
297
  # Apparently, Streamlit stores uploaded files in memory and clears on browser close
298
  # https://docs.streamlit.io/knowledge-base/using-streamlit/where-file-uploader-store-when-deleted
299
 
300
- # get validated page range
301
- st.session_state["start_page"], st.session_state["end_page"] = filem.validate_page_range(uploaded_pdf,
302
- st.session_state["start_page"],
303
- st.session_state["end_page"])
304
- # show sidebar text for page selection and file name
305
- with st.sidebar:
306
- st.text(f"Extracting pages {st.session_state["start_page"]} to {st.session_state["end_page"]} in {uploaded_pdf.name}")
307
-
308
- # get pdf contents
309
- st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(uploaded_pdf,
310
- (st.session_state["start_page"],
311
- st.session_state["end_page"]))
312
- else:
313
- # if we're using the same file (nothing new uploaded)
314
- if "start_page" in st.session_state and "end_page" in st.session_state and "pdf_file" in st.session_state:
315
- # validate the page range
316
- st.session_state["start_page"], st.session_state["end_page"] = filem.validate_page_range(st.session_state["pdf_file"],
317
- st.session_state["start_page"],
318
- st.session_state["end_page"])
319
- # update sidebar text for name and page selection
320
- with st.sidebar:
321
- st.text(f"Extracting pages {st.session_state["start_page"]} to {st.session_state["end_page"]} in {st.session_state["pdf_file"].name}")
322
-
323
- # get contents
324
- st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(st.session_state["pdf_file"],
325
- (st.session_state["start_page"], st.session_state["end_page"]))
326
-
327
-
328
  provider, llm_name = llm_helper.get_provider_model(
329
  llm_provider_to_use,
330
  use_ollama=RUN_IN_OFFLINE_MODE
 
13
  import huggingface_hub
14
  import json5
15
  import ollama
 
16
  import requests
17
  import streamlit as st
18
  from dotenv import load_dotenv
 
223
  )
224
 
225
  # make slider with initial values
226
+ page_range_slider = st.slider('7: Specify a page range for the PDF file:',
227
+ 1, GlobalConfig.MAX_ALLOWED_PAGES, [1, GlobalConfig.MAX_ALLOWED_PAGES])
228
+ st.session_state['page_range_slider'] = page_range_slider
229
 
230
 
231
  def build_ui():
 
261
  Prepare the chat interface and related functionality.
262
  """
263
  # set start and end page
264
+ st.session_state['start_page'] = st.session_state['page_range_slider'][0]
265
+ st.session_state['end_page'] = st.session_state['page_range_slider'][1]
266
 
267
  with st.expander('Usage Instructions'):
268
  st.markdown(GlobalConfig.CHAT_USAGE_INSTRUCTIONS)
 
292
  if prompt['files']:
293
  # store uploaded pdf in session state
294
  uploaded_pdf = prompt['files'][0]
295
+ st.session_state['pdf_file'] = uploaded_pdf
296
  # Apparently, Streamlit stores uploaded files in memory and clears on browser close
297
  # https://docs.streamlit.io/knowledge-base/using-streamlit/where-file-uploader-store-when-deleted
298
 
299
+ # get validated page range
300
+ st.session_state['start_page'], st.session_state['end_page'] = filem.validate_page_range(
301
+ st.session_state['pdf_file'],
302
+ st.session_state['start_page'],
303
+ st.session_state['end_page']
304
+ )
305
+ # show sidebar text for page selection and file name
306
+ with st.sidebar:
307
+ st.text(f'Extracting pages {st.session_state['start_page']} to {st.session_state['end_page']} in {st.session_state['pdf_file'].name}')
308
+
309
+ # get pdf contents
310
+ st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(
311
+ st.session_state['pdf_file'],
312
+ (st.session_state['start_page'],
313
+ st.session_state['end_page'])
314
+ )
315
+
 
 
 
 
 
 
 
 
 
 
 
316
  provider, llm_name = llm_helper.get_provider_model(
317
  llm_provider_to_use,
318
  use_ollama=RUN_IN_OFFLINE_MODE
global_config.py CHANGED
@@ -108,6 +108,7 @@ class GlobalConfig:
108
  DEFAULT_MODEL_INDEX = int(os.environ.get('DEFAULT_MODEL_INDEX', '4'))
109
  LLM_MODEL_TEMPERATURE = 0.2
110
  MAX_PAGE_COUNT = 50
 
111
  LLM_MODEL_MAX_INPUT_LENGTH = 1000 # characters
112
 
113
  LOG_LEVEL = 'DEBUG'
 
108
  DEFAULT_MODEL_INDEX = int(os.environ.get('DEFAULT_MODEL_INDEX', '4'))
109
  LLM_MODEL_TEMPERATURE = 0.2
110
  MAX_PAGE_COUNT = 50
111
+ MAX_ALLOWED_PAGES = 150
112
  LLM_MODEL_MAX_INPUT_LENGTH = 1000 # characters
113
 
114
  LOG_LEVEL = 'DEBUG'
helpers/file_manager.py CHANGED
@@ -19,23 +19,19 @@ logger = logging.getLogger(__name__)
19
 
20
  def get_pdf_contents(
21
  pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
22
- page_range: tuple[int, int],
23
- max_pages: int = GlobalConfig.MAX_PAGE_COUNT
24
- ) -> str:
25
  """
26
  Extract the text contents from a PDF file.
27
 
28
  :param pdf_file: The uploaded PDF file.
29
  :param page_range: The range of pages to extract contents from.
30
- :param max_pages: The max no. of pages to extract contents from.
31
  :return: The contents.
32
  """
33
 
34
  reader = PdfReader(pdf_file)
35
 
36
- start, end = page_range # set start and end per the range (user-specified values)
37
 
38
- print(f"starting at {start}, ending {end}")
39
 
40
  text = ''
41
  for page_num in range(start - 1, end):
@@ -51,7 +47,7 @@ def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
51
 
52
  :param pdf_file: The uploaded PDF file.
53
  :param start: The start page
54
- :param max_pages: The end page
55
  :return: The validated page range tuple
56
  """
57
  n_pages = len(PdfReader(pdf_file).pages)
@@ -65,4 +61,4 @@ def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
65
  if start > end: # if the start is higher than the end, make it 1
66
  start = 1
67
 
68
- return (start, end)
 
19
 
20
  def get_pdf_contents(
21
  pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
22
+ page_range: tuple[int, int]) -> str:
 
 
23
  """
24
  Extract the text contents from a PDF file.
25
 
26
  :param pdf_file: The uploaded PDF file.
27
  :param page_range: The range of pages to extract contents from.
 
28
  :return: The contents.
29
  """
30
 
31
  reader = PdfReader(pdf_file)
32
 
33
+ start, end = page_range # set start and end per the range (user-specified values)
34
 
 
35
 
36
  text = ''
37
  for page_num in range(start - 1, end):
 
47
 
48
  :param pdf_file: The uploaded PDF file.
49
  :param start: The start page
50
+ :param end: The end page
51
  :return: The validated page range tuple
52
  """
53
  n_pages = len(PdfReader(pdf_file).pages)
 
61
  if start > end: # if the start is higher than the end, make it 1
62
  start = 1
63
 
64
+ return start, end