bluenevus committed on
Commit
4fb932e
·
1 Parent(s): f3088d0

Update app.py via AI Editor

Browse files
Files changed (1) hide show
  1. app.py +9 -13
app.py CHANGED
@@ -95,22 +95,17 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
95
  reader = PdfReader(input_path)
96
  n_pages = len(reader.pages)
97
  splits = []
98
- current_writer = PdfWriter()
99
- last_split_at = 0
100
- i = 0
101
  last_header = None
102
- force_split = False
103
  while i < n_pages:
104
- current_writer = PdfWriter()
105
  part_start = i
106
- current_writer.add_page(reader.pages[i])
107
- size = estimate_writer_size(current_writer) / (1024 * 1024)
 
108
  if size > max_mb:
109
- # Single page exceeds max_mb, must split after this page
110
  splits.append((i, i+1))
111
  i += 1
112
  continue
113
- # Add consecutive pages until approaching min_split_mb
114
  j = i + 1
115
  while j < n_pages:
116
  tmp_writer = PdfWriter()
@@ -118,24 +113,24 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
118
  tmp_writer.add_page(reader.pages[k])
119
  size = estimate_writer_size(tmp_writer) / (1024 * 1024)
120
  if size > max_mb:
121
- # Last added page makes it too big; break, split at previous
122
  break
123
- # If above min_split_mb, check for natural break
124
  header = extract_text_headers(reader, j)
125
  blank = is_blank_page(reader, j)
126
  chapter = is_chapter_header(header)
127
  if size >= min_split_mb and (blank or chapter or (header and header != last_header)):
128
- # Good split point found
129
  j += 1
130
  break
131
  last_header = header
132
  j += 1
133
  splits.append((part_start, j))
134
  i = j
 
135
  split_files = []
 
136
  for idx, (start, end) in enumerate(splits):
137
  writer = PdfWriter()
138
  for p in range(start, end):
 
139
  writer.add_page(reader.pages[p])
140
  out_path = os.path.join(session_dir, f'split_part_{idx+1}.pdf')
141
  with open(out_path, 'wb') as f:
@@ -143,7 +138,8 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
143
  size = os.path.getsize(out_path) / (1024 * 1024)
144
  split_files.append({'filename': os.path.basename(out_path), 'size': size, 'path': out_path})
145
  logging.info(f"Saved split file {out_path} ({size:.2f} MB) for pages {start}-{end-1}")
146
- logging.info(f"intelligent_pdf_split: Finished. Total {len(split_files)} files created.")
 
147
  return split_files
148
 
149
  def make_zip_of_splits(split_files, session_dir):
 
95
  reader = PdfReader(input_path)
96
  n_pages = len(reader.pages)
97
  splits = []
 
 
 
98
  last_header = None
99
+ i = 0
100
  while i < n_pages:
 
101
  part_start = i
102
+ writer = PdfWriter()
103
+ writer.add_page(reader.pages[i])
104
+ size = estimate_writer_size(writer) / (1024 * 1024)
105
  if size > max_mb:
 
106
  splits.append((i, i+1))
107
  i += 1
108
  continue
 
109
  j = i + 1
110
  while j < n_pages:
111
  tmp_writer = PdfWriter()
 
113
  tmp_writer.add_page(reader.pages[k])
114
  size = estimate_writer_size(tmp_writer) / (1024 * 1024)
115
  if size > max_mb:
 
116
  break
 
117
  header = extract_text_headers(reader, j)
118
  blank = is_blank_page(reader, j)
119
  chapter = is_chapter_header(header)
120
  if size >= min_split_mb and (blank or chapter or (header and header != last_header)):
 
121
  j += 1
122
  break
123
  last_header = header
124
  j += 1
125
  splits.append((part_start, j))
126
  i = j
127
+
128
  split_files = []
129
+ input_size = os.path.getsize(input_path) / (1024 * 1024)
130
  for idx, (start, end) in enumerate(splits):
131
  writer = PdfWriter()
132
  for p in range(start, end):
133
+ # Add only the required page references; this avoids resource bloat
134
  writer.add_page(reader.pages[p])
135
  out_path = os.path.join(session_dir, f'split_part_{idx+1}.pdf')
136
  with open(out_path, 'wb') as f:
 
138
  size = os.path.getsize(out_path) / (1024 * 1024)
139
  split_files.append({'filename': os.path.basename(out_path), 'size': size, 'path': out_path})
140
  logging.info(f"Saved split file {out_path} ({size:.2f} MB) for pages {start}-{end-1}")
141
+ total_output_size = sum([f['size'] for f in split_files])
142
+ logging.info(f"Original input size: {input_size:.2f} MB, Total split output size: {total_output_size:.2f} MB, {len(split_files)} files created.")
143
  return split_files
144
 
145
  def make_zip_of_splits(split_files, session_dir):