bluenevus committed on
Commit
f3088d0
·
1 Parent(s): 6914bb7

Update app.py via AI Editor

Browse files
Files changed (1) hide show
  1. app.py +36 -39
app.py CHANGED
@@ -96,41 +96,47 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
96
  n_pages = len(reader.pages)
97
  splits = []
98
  current_writer = PdfWriter()
99
- split_points = []
100
- last_header = None
101
  last_split_at = 0
102
- for i in range(n_pages):
103
- page = reader.pages[i]
104
- current_writer.add_page(page)
 
 
 
 
105
  size = estimate_writer_size(current_writer) / (1024 * 1024)
106
- header = extract_text_headers(reader, i)
107
- blank = is_blank_page(reader, i)
108
- chapter = is_chapter_header(header)
109
- split_here = False
110
-
111
- logging.debug(f"Page {i}: size={size:.2f}MB, header='{header}', blank={blank}, chapter={chapter}")
112
-
113
- if size >= max_mb:
114
- split_here = True
115
- elif size >= min_split_mb:
116
- if blank or chapter or (header and header != last_header):
117
- split_here = True
118
-
119
- if split_here:
120
- splits.append((last_split_at, i+1))
121
- logging.info(f"Splitting at pages {last_split_at}-{i+1} (size ~{size:.2f}MB)")
122
- last_split_at = i+1
123
- current_writer = PdfWriter()
124
- last_header = header
125
-
126
- if last_split_at < n_pages:
127
- splits.append((last_split_at, n_pages))
128
-
 
 
 
 
129
  split_files = []
130
  for idx, (start, end) in enumerate(splits):
131
  writer = PdfWriter()
132
- for i in range(start, end):
133
- writer.add_page(reader.pages[i])
134
  out_path = os.path.join(session_dir, f'split_part_{idx+1}.pdf')
135
  with open(out_path, 'wb') as f:
136
  writer.write(f)
@@ -303,7 +309,6 @@ def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, de
303
  if session_data is None:
304
  session_data = {}
305
 
306
- # Handle Clear Session
307
  if trigger == 'clear-session':
308
  logging.info(f"handle_upload: Clear session button pressed for {session_id}")
309
  clean_session(session_id)
@@ -311,7 +316,6 @@ def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, de
311
  logging.info(f"Session cleared for {session_id}")
312
  return "", True, get_split_results_placeholder(), resp_data
313
 
314
- # Handle Delete Upload (detect ANY delete button press)
315
  delete_pressed = False
316
  if isinstance(trigger, dict) and trigger.get('type') == 'delete-upload-btn':
317
  delete_pressed = True
@@ -332,7 +336,6 @@ def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, de
332
  logging.info(f"Session files deleted for {session_id}")
333
  return "", True, get_split_results_placeholder(), {}
334
 
335
- # Handle Delete Split File
336
  delete_split_pressed = False
337
  delete_split_idx = None
338
  if isinstance(trigger, dict) and trigger.get('type') == 'delete-split-btn':
@@ -354,7 +357,6 @@ def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, de
354
  logging.info(f"Deleted split file: {file_path} for session {session_id}")
355
  split_files = [f for i, f in enumerate(split_files) if i != delete_split_idx]
356
  session_data['split_files'] = split_files
357
- # Recreate ZIP if any splits remain, else remove ZIP
358
  zip_path = os.path.join(session_dir, "split_files.zip")
359
  if split_files:
360
  make_zip_of_splits(split_files, session_dir)
@@ -365,7 +367,6 @@ def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, de
365
  os.remove(zip_path)
366
  logging.info(f"Deleted ZIP file as no splits remain for session {session_id}")
367
  session_data['zip_ready'] = False
368
- # Build UI
369
  orig_filename = session_data.get('orig_filename', '')
370
  file_info = dbc.Row([
371
  dbc.Col(html.Div(f"Uploaded: {orig_filename}"), width=9, style={'display': 'flex', 'alignItems': 'center'}),
@@ -389,7 +390,6 @@ def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, de
389
  else:
390
  logging.warning(f"Split file delete index {delete_split_idx} invalid for session {session_id}")
391
 
392
- # Handle Upload
393
  if trigger == 'upload-pdf':
394
  logging.info(f"handle_upload: Upload triggered for filename={filename}, session_id={session_id}")
395
  if not contents:
@@ -426,7 +426,6 @@ def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, de
426
  logging.error(f"Error processing PDF: {e}")
427
  return html.Div(f"Error: {e}", style={'color': 'red'}), True, get_split_results_placeholder(), {}
428
 
429
- # Handle Split
430
  if trigger == 'split-btn':
431
  orig_filename = session_data.get('orig_filename')
432
  logging.info(f"handle_upload: Split button clicked for {session_id}, orig_filename={orig_filename}")
@@ -470,7 +469,6 @@ def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, de
470
  logging.error(f"Error splitting PDF for session {session_id}: {e}")
471
  return html.Div(f"Error: {e}", style={'color': 'red'}), False, get_split_results_placeholder(), session_data
472
 
473
- # Restore after split
474
  if session_data.get('split_files'):
475
  split_files = session_data['split_files']
476
  orig_filename = session_data.get('orig_filename', '')
@@ -494,7 +492,6 @@ def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, de
494
  logging.info(f"handle_upload: Restoring split results for session {session_id}, {len(split_files)} files.")
495
  return file_info, False, results, session_data
496
 
497
- # Restore after upload (before split)
498
  if session_data.get('orig_filename') and not session_data.get('split_files'):
499
  file_info = dbc.Row([
500
  dbc.Col(html.Div(f"Uploaded: {session_data['orig_filename']}"), width=9, style={'display': 'flex', 'alignItems': 'center'}),
 
96
  n_pages = len(reader.pages)
97
  splits = []
98
  current_writer = PdfWriter()
 
 
99
  last_split_at = 0
100
+ i = 0
101
+ last_header = None
102
+ force_split = False
103
+ while i < n_pages:
104
+ current_writer = PdfWriter()
105
+ part_start = i
106
+ current_writer.add_page(reader.pages[i])
107
  size = estimate_writer_size(current_writer) / (1024 * 1024)
108
+ if size > max_mb:
109
+ # Single page exceeds max_mb, must split after this page
110
+ splits.append((i, i+1))
111
+ i += 1
112
+ continue
113
+ # Add consecutive pages until approaching min_split_mb
114
+ j = i + 1
115
+ while j < n_pages:
116
+ tmp_writer = PdfWriter()
117
+ for k in range(part_start, j+1):
118
+ tmp_writer.add_page(reader.pages[k])
119
+ size = estimate_writer_size(tmp_writer) / (1024 * 1024)
120
+ if size > max_mb:
121
+ # Last added page makes it too big; break, split at previous
122
+ break
123
+ # If above min_split_mb, check for natural break
124
+ header = extract_text_headers(reader, j)
125
+ blank = is_blank_page(reader, j)
126
+ chapter = is_chapter_header(header)
127
+ if size >= min_split_mb and (blank or chapter or (header and header != last_header)):
128
+ # Good split point found
129
+ j += 1
130
+ break
131
+ last_header = header
132
+ j += 1
133
+ splits.append((part_start, j))
134
+ i = j
135
  split_files = []
136
  for idx, (start, end) in enumerate(splits):
137
  writer = PdfWriter()
138
+ for p in range(start, end):
139
+ writer.add_page(reader.pages[p])
140
  out_path = os.path.join(session_dir, f'split_part_{idx+1}.pdf')
141
  with open(out_path, 'wb') as f:
142
  writer.write(f)
 
309
  if session_data is None:
310
  session_data = {}
311
 
 
312
  if trigger == 'clear-session':
313
  logging.info(f"handle_upload: Clear session button pressed for {session_id}")
314
  clean_session(session_id)
 
316
  logging.info(f"Session cleared for {session_id}")
317
  return "", True, get_split_results_placeholder(), resp_data
318
 
 
319
  delete_pressed = False
320
  if isinstance(trigger, dict) and trigger.get('type') == 'delete-upload-btn':
321
  delete_pressed = True
 
336
  logging.info(f"Session files deleted for {session_id}")
337
  return "", True, get_split_results_placeholder(), {}
338
 
 
339
  delete_split_pressed = False
340
  delete_split_idx = None
341
  if isinstance(trigger, dict) and trigger.get('type') == 'delete-split-btn':
 
357
  logging.info(f"Deleted split file: {file_path} for session {session_id}")
358
  split_files = [f for i, f in enumerate(split_files) if i != delete_split_idx]
359
  session_data['split_files'] = split_files
 
360
  zip_path = os.path.join(session_dir, "split_files.zip")
361
  if split_files:
362
  make_zip_of_splits(split_files, session_dir)
 
367
  os.remove(zip_path)
368
  logging.info(f"Deleted ZIP file as no splits remain for session {session_id}")
369
  session_data['zip_ready'] = False
 
370
  orig_filename = session_data.get('orig_filename', '')
371
  file_info = dbc.Row([
372
  dbc.Col(html.Div(f"Uploaded: {orig_filename}"), width=9, style={'display': 'flex', 'alignItems': 'center'}),
 
390
  else:
391
  logging.warning(f"Split file delete index {delete_split_idx} invalid for session {session_id}")
392
 
 
393
  if trigger == 'upload-pdf':
394
  logging.info(f"handle_upload: Upload triggered for filename={filename}, session_id={session_id}")
395
  if not contents:
 
426
  logging.error(f"Error processing PDF: {e}")
427
  return html.Div(f"Error: {e}", style={'color': 'red'}), True, get_split_results_placeholder(), {}
428
 
 
429
  if trigger == 'split-btn':
430
  orig_filename = session_data.get('orig_filename')
431
  logging.info(f"handle_upload: Split button clicked for {session_id}, orig_filename={orig_filename}")
 
469
  logging.error(f"Error splitting PDF for session {session_id}: {e}")
470
  return html.Div(f"Error: {e}", style={'color': 'red'}), False, get_split_results_placeholder(), session_data
471
 
 
472
  if session_data.get('split_files'):
473
  split_files = session_data['split_files']
474
  orig_filename = session_data.get('orig_filename', '')
 
492
  logging.info(f"handle_upload: Restoring split results for session {session_id}, {len(split_files)} files.")
493
  return file_info, False, results, session_data
494
 
 
495
  if session_data.get('orig_filename') and not session_data.get('split_files'):
496
  file_info = dbc.Row([
497
  dbc.Col(html.Div(f"Uploaded: {session_data['orig_filename']}"), width=9, style={'display': 'flex', 'alignItems': 'center'}),