Cachoups committed on
Commit
96d2dc7
·
verified ·
1 Parent(s): 9f63e0a

Update lib/read_pdf.py

Browse files
Files changed (1) hide show
  1. lib/read_pdf.py +4 -3
lib/read_pdf.py CHANGED
@@ -104,10 +104,11 @@ def extract_and_format_paragraphs(pdf_path):
104
  width = page.width
105
  height = page.height
106
 
107
- # Define bounding boxes for left and right columns
108
- left_bbox = (0, 0, width / 2, height) # Left column
109
- right_bbox = (width / 2, 0, width, height) # Right column
110
 
 
 
111
  # Extract text from the left column
112
  left_column_text = page.within_bbox(left_bbox).extract_text() or ""
113
  # Clean the left column text
 
104
  width = page.width
105
  height = page.height
106
 
107
+ header_height = height * 0.1 # Adjust this value based on your PDF
108
+ #footer_height = height * 0.1 # Adjust this value based on your PDF
 
109
 
110
+ left_bbox = (0, header_height, width / 2, height - footer_height) # Left column
111
+ right_bbox = (width / 2, header_height, width, height)
112
  # Extract text from the left column
113
  left_column_text = page.within_bbox(left_bbox).extract_text() or ""
114
  # Clean the left column text