adibak committed on
Commit
65c99df
·
1 Parent(s): d1e63a2

Refactor PDF extraction logic for single page, improve comments for clarity, and revert placeholder text

Browse files
Files changed (2) hide show
  1. app.py +10 -4
  2. helpers/file_manager.py +17 -8
app.py CHANGED
@@ -299,15 +299,22 @@ def set_up_chat_ui():
299
  # Check if pdf file is uploaded
300
  # (we can use the same file if the user doesn't upload a new one)
301
  if 'pdf_file' in st.session_state:
302
- # get validated page range
303
  st.session_state['start_page'], st.session_state['end_page'] = filem.validate_page_range(
304
  st.session_state['pdf_file'],
305
  st.session_state['start_page'],
306
  st.session_state['end_page']
307
  )
308
- #Show sidebar text for page selection and file name
309
  with st.sidebar:
310
- st.text(f'Extracting pages {st.session_state["start_page"]} to {st.session_state["end_page"]} in {st.session_state["pdf_file"].name}')
 
 
 
 
 
 
 
311
 
312
  # Get pdf contents
313
  st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(
@@ -315,7 +322,6 @@ def set_up_chat_ui():
315
  (st.session_state['start_page'],
316
  st.session_state['end_page'])
317
  )
318
-
319
  provider, llm_name = llm_helper.get_provider_model(
320
  llm_provider_to_use,
321
  use_ollama=RUN_IN_OFFLINE_MODE
 
299
  # Check if pdf file is uploaded
300
  # (we can use the same file if the user doesn't upload a new one)
301
  if 'pdf_file' in st.session_state:
302
+ # Get validated page range
303
  st.session_state['start_page'], st.session_state['end_page'] = filem.validate_page_range(
304
  st.session_state['pdf_file'],
305
  st.session_state['start_page'],
306
  st.session_state['end_page']
307
  )
308
+ # Show sidebar text for page selection and file name
309
  with st.sidebar:
310
+ if st.session_state['end_page'] is None: # If the PDF has only one page
311
+ st.text('Extracting page %d in %s' % (
312
+ st.session_state['start_page'], st.session_state['pdf_file'].name
313
+ ))
314
+ else:
315
+ st.text('Extracting pages %d to %d in %s' % (
316
+ st.session_state['start_page'], st.session_state['end_page'], st.session_state['pdf_file'].name
317
+ ))
318
 
319
  # Get pdf contents
320
  st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(
 
322
  (st.session_state['start_page'],
323
  st.session_state['end_page'])
324
  )
 
325
  provider, llm_name = llm_helper.get_provider_model(
326
  llm_provider_to_use,
327
  use_ollama=RUN_IN_OFFLINE_MODE
helpers/file_manager.py CHANGED
@@ -30,14 +30,19 @@ def get_pdf_contents(
30
 
31
  reader = PdfReader(pdf_file)
32
 
33
- start, end = page_range # set start and end per the range (user-specified values)
34
-
35
- print(f"Name: {pdf_file.name} Page range: {start} to {end}")
36
  text = ''
 
 
 
 
 
 
37
  for page_num in range(start - 1, end):
38
- page = reader.pages[page_num]
39
- text += page.extract_text()
40
 
 
41
  return text
42
 
43
  def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
@@ -52,13 +57,17 @@ def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
52
  """
53
  n_pages = len(PdfReader(pdf_file).pages)
54
 
55
- # set start to max of 1 or specified start (whichever's higher)
56
  start = max(1, start)
57
 
58
- # set end to min of pdf length or specified end (whichever's lower)
59
  end = min(n_pages, end)
60
 
61
- if start > end: # if the start is higher than the end, make it 1
62
  start = 1
63
 
 
 
 
 
64
  return start, end
 
30
 
31
  reader = PdfReader(pdf_file)
32
 
33
+ start, end = page_range # Set start and end per the range (user-specified values)
34
+
 
35
  text = ''
36
+
37
+ if end is None:
38
+ # If end is None (where PDF has only 1 page or start = end), extract start
39
+ end = start
40
+
41
+ # Get the text from the specified page range
42
  for page_num in range(start - 1, end):
43
+ text += reader.pages[page_num].extract_text()
 
44
 
45
+
46
  return text
47
 
48
  def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
 
57
  """
58
  n_pages = len(PdfReader(pdf_file).pages)
59
 
60
+ # Set start to max of 1 or specified start (whichever's higher)
61
  start = max(1, start)
62
 
63
+ # Set end to min of pdf length or specified end (whichever's lower)
64
  end = min(n_pages, end)
65
 
66
+ if start > end: # If the start is higher than the end, make it 1
67
  start = 1
68
 
69
+ if start == end:
70
+ # If start = end (including when PDF is 1 page long), set end to None
71
+ return start, None
72
+
73
  return start, end