Spaces:

MicroHealth
/

maiko-file-splitter

Paused

Update chunk names to include the original file name

by zegner - opened May 5

←

Files changed (1) hide show

app.py CHANGED Viewed

@@ -90,7 +90,9 @@ def estimate_writer_size(writer):
     writer.write(f)
     return f.tell()
-def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
     logging.info(f"intelligent_pdf_split: Starting split for {input_path} in {session_dir}")
     reader = PdfReader(input_path)
     n_pages = len(reader.pages)
@@ -127,11 +129,13 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
     split_files = []
     input_size = os.path.getsize(input_path) / (1024 * 1024)
     for idx, (start, end) in enumerate(splits):
         writer = PdfWriter()
         for p in range(start, end):
             writer.add_page(reader.pages[p])
-        out_path = os.path.join(session_dir, f'split_part_{idx+1}.pdf')
         with open(out_path, 'wb') as f:
             writer.write(f)
         size = os.path.getsize(out_path) / (1024 * 1024)
@@ -448,7 +452,7 @@ def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, de
         try:
             logging.info(f"Splitting PDF for session {session_id}. File: {pdf_path}")
             with lock:
-                split_files = intelligent_pdf_split(pdf_path, session_dir)
                 for fi in split_files:
                     logging.info(f"Split file saved: {fi['path']} ({fi['size']:.2f} MB)")
                 zip_path = make_zip_of_splits(split_files, session_dir)

     writer.write(f)
     return f.tell()
+def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4, orig_filename=None):
+    import io
+    from pathlib import Path
     logging.info(f"intelligent_pdf_split: Starting split for {input_path} in {session_dir}")
     reader = PdfReader(input_path)
     n_pages = len(reader.pages)
     split_files = []
     input_size = os.path.getsize(input_path) / (1024 * 1024)
+    stem = Path(orig_filename).stem if orig_filename else "split_part"
     for idx, (start, end) in enumerate(splits):
         writer = PdfWriter()
         for p in range(start, end):
             writer.add_page(reader.pages[p])
+        part_name = f"{stem}_part_{idx+1}.pdf"
+        out_path = os.path.join(session_dir, part_name)
         with open(out_path, 'wb') as f:
             writer.write(f)
         size = os.path.getsize(out_path) / (1024 * 1024)
         try:
             logging.info(f"Splitting PDF for session {session_id}. File: {pdf_path}")
             with lock:
+                split_files = intelligent_pdf_split(pdf_path, session_dir, orig_filename=orig_filename)
                 for fi in split_files:
                     logging.info(f"Split file saved: {fi['path']} ({fi['size']:.2f} MB)")
                 zip_path = make_zip_of_splits(split_files, session_dir)