Update app.py via AI Editor
Browse files
app.py
CHANGED
@@ -95,22 +95,17 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
|
|
95 |
reader = PdfReader(input_path)
|
96 |
n_pages = len(reader.pages)
|
97 |
splits = []
|
98 |
-
current_writer = PdfWriter()
|
99 |
-
last_split_at = 0
|
100 |
-
i = 0
|
101 |
last_header = None
|
102 |
-
|
103 |
while i < n_pages:
|
104 |
-
current_writer = PdfWriter()
|
105 |
part_start = i
|
106 |
-
|
107 |
-
|
|
|
108 |
if size > max_mb:
|
109 |
-
# Single page exceeds max_mb, must split after this page
|
110 |
splits.append((i, i+1))
|
111 |
i += 1
|
112 |
continue
|
113 |
-
# Add consecutive pages until approaching min_split_mb
|
114 |
j = i + 1
|
115 |
while j < n_pages:
|
116 |
tmp_writer = PdfWriter()
|
@@ -118,24 +113,24 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
|
|
118 |
tmp_writer.add_page(reader.pages[k])
|
119 |
size = estimate_writer_size(tmp_writer) / (1024 * 1024)
|
120 |
if size > max_mb:
|
121 |
-
# Last added page makes it too big; break, split at previous
|
122 |
break
|
123 |
-
# If above min_split_mb, check for natural break
|
124 |
header = extract_text_headers(reader, j)
|
125 |
blank = is_blank_page(reader, j)
|
126 |
chapter = is_chapter_header(header)
|
127 |
if size >= min_split_mb and (blank or chapter or (header and header != last_header)):
|
128 |
-
# Good split point found
|
129 |
j += 1
|
130 |
break
|
131 |
last_header = header
|
132 |
j += 1
|
133 |
splits.append((part_start, j))
|
134 |
i = j
|
|
|
135 |
split_files = []
|
|
|
136 |
for idx, (start, end) in enumerate(splits):
|
137 |
writer = PdfWriter()
|
138 |
for p in range(start, end):
|
|
|
139 |
writer.add_page(reader.pages[p])
|
140 |
out_path = os.path.join(session_dir, f'split_part_{idx+1}.pdf')
|
141 |
with open(out_path, 'wb') as f:
|
@@ -143,7 +138,8 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
|
|
143 |
size = os.path.getsize(out_path) / (1024 * 1024)
|
144 |
split_files.append({'filename': os.path.basename(out_path), 'size': size, 'path': out_path})
|
145 |
logging.info(f"Saved split file {out_path} ({size:.2f} MB) for pages {start}-{end-1}")
|
146 |
-
|
|
|
147 |
return split_files
|
148 |
|
149 |
def make_zip_of_splits(split_files, session_dir):
|
|
|
95 |
reader = PdfReader(input_path)
|
96 |
n_pages = len(reader.pages)
|
97 |
splits = []
|
|
|
|
|
|
|
98 |
last_header = None
|
99 |
+
i = 0
|
100 |
while i < n_pages:
|
|
|
101 |
part_start = i
|
102 |
+
writer = PdfWriter()
|
103 |
+
writer.add_page(reader.pages[i])
|
104 |
+
size = estimate_writer_size(writer) / (1024 * 1024)
|
105 |
if size > max_mb:
|
|
|
106 |
splits.append((i, i+1))
|
107 |
i += 1
|
108 |
continue
|
|
|
109 |
j = i + 1
|
110 |
while j < n_pages:
|
111 |
tmp_writer = PdfWriter()
|
|
|
113 |
tmp_writer.add_page(reader.pages[k])
|
114 |
size = estimate_writer_size(tmp_writer) / (1024 * 1024)
|
115 |
if size > max_mb:
|
|
|
116 |
break
|
|
|
117 |
header = extract_text_headers(reader, j)
|
118 |
blank = is_blank_page(reader, j)
|
119 |
chapter = is_chapter_header(header)
|
120 |
if size >= min_split_mb and (blank or chapter or (header and header != last_header)):
|
|
|
121 |
j += 1
|
122 |
break
|
123 |
last_header = header
|
124 |
j += 1
|
125 |
splits.append((part_start, j))
|
126 |
i = j
|
127 |
+
|
128 |
split_files = []
|
129 |
+
input_size = os.path.getsize(input_path) / (1024 * 1024)
|
130 |
for idx, (start, end) in enumerate(splits):
|
131 |
writer = PdfWriter()
|
132 |
for p in range(start, end):
|
133 |
+
# Add only the required page references; this avoids resource bloat
|
134 |
writer.add_page(reader.pages[p])
|
135 |
out_path = os.path.join(session_dir, f'split_part_{idx+1}.pdf')
|
136 |
with open(out_path, 'wb') as f:
|
|
|
138 |
size = os.path.getsize(out_path) / (1024 * 1024)
|
139 |
split_files.append({'filename': os.path.basename(out_path), 'size': size, 'path': out_path})
|
140 |
logging.info(f"Saved split file {out_path} ({size:.2f} MB) for pages {start}-{end-1}")
|
141 |
+
total_output_size = sum([f['size'] for f in split_files])
|
142 |
+
logging.info(f"Original input size: {input_size:.2f} MB, Total split output size: {total_output_size:.2f} MB, {len(split_files)} files created.")
|
143 |
return split_files
|
144 |
|
145 |
def make_zip_of_splits(split_files, session_dir):
|