Spaces:
Running
Running
adibak
committed on
Commit
·
65c99df
1
Parent(s):
d1e63a2
Refactor PDF extraction logic for single page, improve comments for clarity, and revert placeholder text
Browse files
- app.py +10 -4
- helpers/file_manager.py +17 -8
app.py
CHANGED
@@ -299,15 +299,22 @@ def set_up_chat_ui():
|
|
299 |
# Check if pdf file is uploaded
|
300 |
# (we can use the same file if the user doesn't upload a new one)
|
301 |
if 'pdf_file' in st.session_state:
|
302 |
-
#
|
303 |
st.session_state['start_page'], st.session_state['end_page'] = filem.validate_page_range(
|
304 |
st.session_state['pdf_file'],
|
305 |
st.session_state['start_page'],
|
306 |
st.session_state['end_page']
|
307 |
)
|
308 |
-
#Show sidebar text for page selection and file name
|
309 |
with st.sidebar:
|
310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
|
312 |
# Get pdf contents
|
313 |
st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(
|
@@ -315,7 +322,6 @@ def set_up_chat_ui():
|
|
315 |
(st.session_state['start_page'],
|
316 |
st.session_state['end_page'])
|
317 |
)
|
318 |
-
|
319 |
provider, llm_name = llm_helper.get_provider_model(
|
320 |
llm_provider_to_use,
|
321 |
use_ollama=RUN_IN_OFFLINE_MODE
|
|
|
299 |
# Check if pdf file is uploaded
|
300 |
# (we can use the same file if the user doesn't upload a new one)
|
301 |
if 'pdf_file' in st.session_state:
|
302 |
+
# Get validated page range
|
303 |
st.session_state['start_page'], st.session_state['end_page'] = filem.validate_page_range(
|
304 |
st.session_state['pdf_file'],
|
305 |
st.session_state['start_page'],
|
306 |
st.session_state['end_page']
|
307 |
)
|
308 |
+
# Show sidebar text for page selection and file name
|
309 |
with st.sidebar:
|
310 |
+
if st.session_state['end_page'] is None: # If the PDF has only one page
|
311 |
+
st.text('Extracting page %d in %s' % (
|
312 |
+
st.session_state['start_page'], st.session_state['pdf_file'].name
|
313 |
+
))
|
314 |
+
else:
|
315 |
+
st.text('Extracting pages %d to %d in %s' % (
|
316 |
+
st.session_state['start_page'], st.session_state['end_page'], st.session_state['pdf_file'].name
|
317 |
+
))
|
318 |
|
319 |
# Get pdf contents
|
320 |
st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(
|
|
|
322 |
(st.session_state['start_page'],
|
323 |
st.session_state['end_page'])
|
324 |
)
|
|
|
325 |
provider, llm_name = llm_helper.get_provider_model(
|
326 |
llm_provider_to_use,
|
327 |
use_ollama=RUN_IN_OFFLINE_MODE
|
helpers/file_manager.py
CHANGED
@@ -30,14 +30,19 @@ def get_pdf_contents(
|
|
30 |
|
31 |
reader = PdfReader(pdf_file)
|
32 |
|
33 |
-
start, end = page_range #
|
34 |
-
|
35 |
-
print(f"Name: {pdf_file.name} Page range: {start} to {end}")
|
36 |
text = ''
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
for page_num in range(start - 1, end):
|
38 |
-
|
39 |
-
text += page.extract_text()
|
40 |
|
|
|
41 |
return text
|
42 |
|
43 |
def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
|
@@ -52,13 +57,17 @@ def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
|
|
52 |
"""
|
53 |
n_pages = len(PdfReader(pdf_file).pages)
|
54 |
|
55 |
-
#
|
56 |
start = max(1, start)
|
57 |
|
58 |
-
#
|
59 |
end = min(n_pages, end)
|
60 |
|
61 |
-
if start > end: #
|
62 |
start = 1
|
63 |
|
|
|
|
|
|
|
|
|
64 |
return start, end
|
|
|
30 |
|
31 |
reader = PdfReader(pdf_file)
|
32 |
|
33 |
+
start, end = page_range # Set start and end per the range (user-specified values)
|
34 |
+
|
|
|
35 |
text = ''
|
36 |
+
|
37 |
+
if end is None:
|
38 |
+
# If end is None (where PDF has only 1 page or start = end), extract start
|
39 |
+
end = start
|
40 |
+
|
41 |
+
# Get the text from the specified page range
|
42 |
for page_num in range(start - 1, end):
|
43 |
+
text += reader.pages[page_num].extract_text()
|
|
|
44 |
|
45 |
+
|
46 |
return text
|
47 |
|
48 |
def validate_page_range(pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
|
|
|
57 |
"""
|
58 |
n_pages = len(PdfReader(pdf_file).pages)
|
59 |
|
60 |
+
# Set start to max of 1 or specified start (whichever's higher)
|
61 |
start = max(1, start)
|
62 |
|
63 |
+
# Set end to min of pdf length or specified end (whichever's lower)
|
64 |
end = min(n_pages, end)
|
65 |
|
66 |
+
if start > end: # If the start is higher than the end, make it 1
|
67 |
start = 1
|
68 |
|
69 |
+
if start == end:
|
70 |
+
# If start = end (including when PDF is 1 page long), set end to None
|
71 |
+
return start, None
|
72 |
+
|
73 |
return start, end
|