Spaces:

MicroHealth
/

maiko-file-splitter

Paused

zegner committed on May 5

Commit

3385c9f

verified ·

1 Parent(s): ff01f81

updates to include original file name in chunk names

This has not been tested yet, but it should allow the chunk files to be named after the original file.

Files changed (1) hide show

app.py CHANGED Viewed

@@ -90,7 +90,9 @@ def estimate_writer_size(writer):
     writer.write(f)
     return f.tell()
-def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
     logging.info(f"intelligent_pdf_split: Starting split for {input_path} in {session_dir}")
     reader = PdfReader(input_path)
     n_pages = len(reader.pages)
@@ -127,11 +129,13 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
     split_files = []
     input_size = os.path.getsize(input_path) / (1024 * 1024)
     for idx, (start, end) in enumerate(splits):
         writer = PdfWriter()
         for p in range(start, end):
             writer.add_page(reader.pages[p])
-        out_path = os.path.join(session_dir, f'split_part_{idx+1}.pdf')
         with open(out_path, 'wb') as f:
             writer.write(f)
         size = os.path.getsize(out_path) / (1024 * 1024)
@@ -448,7 +452,7 @@ def handle_upload(contents, filename, clear_n, delete_upload_n_list, split_n, de
         try:
             logging.info(f"Splitting PDF for session {session_id}. File: {pdf_path}")
             with lock:
-                split_files = intelligent_pdf_split(pdf_path, session_dir)
                 for fi in split_files:
                     logging.info(f"Split file saved: {fi['path']} ({fi['size']:.2f} MB)")
                 zip_path = make_zip_of_splits(split_files, session_dir)

     writer.write(f)
     return f.tell()
+def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4, orig_filename=None):
+    import io
+    from pathlib import Path
     logging.info(f"intelligent_pdf_split: Starting split for {input_path} in {session_dir}")
     reader = PdfReader(input_path)
     n_pages = len(reader.pages)
     split_files = []
     input_size = os.path.getsize(input_path) / (1024 * 1024)
+    stem = Path(orig_filename).stem if orig_filename else "split_part"
     for idx, (start, end) in enumerate(splits):
         writer = PdfWriter()
         for p in range(start, end):
             writer.add_page(reader.pages[p])
+        part_name = f"{stem}_part_{idx+1}.pdf"
+        out_path = os.path.join(session_dir, part_name)
         with open(out_path, 'wb') as f:
             writer.write(f)
         size = os.path.getsize(out_path) / (1024 * 1024)
         try:
             logging.info(f"Splitting PDF for session {session_id}. File: {pdf_path}")
             with lock:
+                split_files = intelligent_pdf_split(pdf_path, session_dir, orig_filename=orig_filename)
                 for fi in split_files:
                     logging.info(f"Split file saved: {fi['path']} ({fi['size']:.2f} MB)")
                 zip_path = make_zip_of_splits(split_files, session_dir)