extract-photos-from-pdf

Sleeping

App Files Files Community

Dejansimic committed on Apr 15

Commit

e8cd67b

verified ·

1 Parent(s): 93bf38c

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -43

app.py CHANGED Viewed

@@ -5,6 +5,8 @@ import zipfile
 import shutil
 import tempfile
 from pathlib import Path
 def zip_folder(folder_path, output_path):
     """Create a zip archive from a folder with improved error handling"""
@@ -14,10 +16,11 @@ def zip_folder(folder_path, output_path):
                 for file in files:
                     file_path = os.path.join(root, file)
                     zipf.write(file_path, os.path.relpath(file_path, folder_path))
-        return True
     except Exception as e:
-        print(f"Error creating zip file: {e}")
-        return False
 # Use more robust directory handling with pathlib
 BASE_DIR = Path(tempfile.gettempdir()) / "pdf_extractor"
@@ -47,7 +50,7 @@ def clear_directory(directory):
     """Safely clear a directory with error handling"""
     directory = Path(directory)
     if not directory.exists():
-        return
     try:
         for item in directory.iterdir():
@@ -55,25 +58,71 @@ def clear_directory(directory):
                 item.unlink()
             elif item.is_dir():
                 shutil.rmtree(item)
     except Exception as e:
-        print(f"Failed to clear directory {directory}. Reason: {e}")
 def extract_photos_from_pdf(file_pdf):
-    """Extract all pages from a PDF as images"""
-    # Clear directories for new extraction
-    clear_directory(DIRECTORY)
-    clear_directory(DIRECTORY_OUTPUT)
     if file_pdf is None:
-        return (
-            gr.Gallery.update(value=[], label="No file uploaded", visible=True),
-            gr.File.update(visible=False)
         )
     try:
         pdf_path = file_pdf.name
-        info = pdfinfo_from_path(pdf_path)
-        total_pages = info["Pages"]
         # Progress tracking variables
         batch_size = 10  # Smaller batch size for better progress visibility
@@ -81,50 +130,96 @@ def extract_photos_from_pdf(file_pdf):
         # Process PDF in batches
         for start_page in range(1, total_pages + 1, batch_size):
             end_page = min(start_page + batch_size - 1, total_pages)
-            images = convert_from_path(
-                pdf_path,
-                first_page=start_page,
-                last_page=end_page,
-                dpi=150  # Adjustable DPI for quality vs size
             )
-            for idx, image in enumerate(images, start=start_page):
-                image_path = DIRECTORY / f"{idx}.png"
-                image.save(str(image_path), 'PNG')
         # Get list of extracted images and sort them numerically
         images_pdf_list = get_image_files(DIRECTORY)
         if not images_pdf_list:
-            return (
-                gr.Gallery.update(value=[], label="No images extracted", visible=True),
-                gr.File.update(visible=False)
             )
         image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
-        sorted_names = sorted(image_names, key=lambda x: int(Path(x[1]).stem))
         # Create zip file of all images
         zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
-        if zip_folder(DIRECTORY, zip_path):
-            return (
                 gr.Gallery.update(
                     value=sorted_names,
                     label=f"Extracted {len(images_pdf_list)} page{'s' if len(images_pdf_list) != 1 else ''}",
                     visible=True
                 ),
-                gr.File.update(value=str(zip_path), visible=True)
             )
         else:
-            return (
-                gr.Gallery.update(value=sorted_names, label="Extracted images (zip creation failed)", visible=True),
-                gr.File.update(visible=False)
             )
     except Exception as e:
-        print(f"Error extracting PDF: {e}")
-        return (
-            gr.Gallery.update(value=[], label=f"Error: {str(e)}", visible=True),
-            gr.File.update(visible=False)
         )
 # Create Gradio interface with improved layout and error handling
@@ -149,9 +244,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                     clear_btn = gr.Button("Clear")
         with gr.Column():
-            status = gr.Textbox(label="Status", visible=True)
-            # Opraveno: V novějších verzích Gradio, Gallery nemá metodu style
-            # Místo toho nastavujeme parametry přímo při vytváření
             gallery = gr.Gallery(
                 label="Extracted Pages",
                 show_label=True,
@@ -172,7 +269,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             examples=[[example_path]],
             fn=extract_photos_from_pdf,
             inputs=[file_pdf],
-            outputs=[gallery, download_btn],
             cache_examples=False
         )
@@ -180,17 +277,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     btn.click(
         fn=extract_photos_from_pdf,
         inputs=[file_pdf],
-        outputs=[gallery, download_btn],
         api_name="extract"
     )
     clear_btn.click(
         fn=lambda: (
             gr.Gallery.update(value=[], label="Extracted Pages", visible=True),
-            gr.File.update(visible=False)
         ),
         inputs=[],
-        outputs=[gallery, download_btn]
     )
 if __name__ == "__main__":

 import shutil
 import tempfile
 from pathlib import Path
+import traceback
+import sys
 def zip_folder(folder_path, output_path):
     """Create a zip archive from a folder with improved error handling"""
                 for file in files:
                     file_path = os.path.join(root, file)
                     zipf.write(file_path, os.path.relpath(file_path, folder_path))
+        return True, ""
     except Exception as e:
+        error_msg = f"Error creating zip file: {str(e)}"
+        print(error_msg)
+        return False, error_msg
 # Use more robust directory handling with pathlib
 BASE_DIR = Path(tempfile.gettempdir()) / "pdf_extractor"
     """Safely clear a directory with error handling"""
     directory = Path(directory)
     if not directory.exists():
+        return True, ""
     try:
         for item in directory.iterdir():
                 item.unlink()
             elif item.is_dir():
                 shutil.rmtree(item)
+        return True, ""
     except Exception as e:
+        error_msg = f"Failed to clear directory {directory}. Reason: {str(e)}"
+        print(error_msg)
+        return False, error_msg
 def extract_photos_from_pdf(file_pdf):
+    """Extract all pages from a PDF as images with comprehensive error handling"""
+    # Update status at the beginning
+    yield (
+        gr.Gallery.update(value=[], visible=True),
+        gr.File.update(visible=False),
+        gr.Textbox.update(value="Starting extraction process...", visible=True)
+    )
+    # Check if file is provided
     if file_pdf is None:
+        yield (
+            gr.Gallery.update(value=[], visible=True),
+            gr.File.update(visible=False),
+            gr.Textbox.update(value="Error: No file uploaded", visible=True)
         )
+        return
+    # Clear directories for new extraction
+    clear_success, clear_error = clear_directory(DIRECTORY)
+    if not clear_success:
+        yield (
+            gr.Gallery.update(value=[], visible=True),
+            gr.File.update(visible=False),
+            gr.Textbox.update(value=f"Error clearing directories: {clear_error}", visible=True)
+        )
+        return
+    clear_success, clear_error = clear_directory(DIRECTORY_OUTPUT)
+    if not clear_success:
+        yield (
+            gr.Gallery.update(value=[], visible=True),
+            gr.File.update(visible=False),
+            gr.Textbox.update(value=f"Error clearing output directory: {clear_error}", visible=True)
+        )
+        return
     try:
+        # Get PDF path and info
         pdf_path = file_pdf.name
+        # Update status
+        yield (
+            gr.Gallery.update(value=[], visible=True),
+            gr.File.update(visible=False),
+            gr.Textbox.update(value="Reading PDF information...", visible=True)
+        )
+        try:
+            info = pdfinfo_from_path(pdf_path)
+            total_pages = info["Pages"]
+        except Exception as e:
+            error_details = traceback.format_exc()
+            yield (
+                gr.Gallery.update(value=[], visible=True),
+                gr.File.update(visible=False),
+                gr.Textbox.update(value=f"Error reading PDF: {str(e)}\n\nDetails: {error_details}", visible=True)
+            )
+            return
         # Progress tracking variables
         batch_size = 10  # Smaller batch size for better progress visibility
         # Process PDF in batches
         for start_page in range(1, total_pages + 1, batch_size):
             end_page = min(start_page + batch_size - 1, total_pages)
+            # Update status
+            yield (
+                gr.Gallery.update(value=[], visible=True),
+                gr.File.update(visible=False),
+                gr.Textbox.update(value=f"Processing pages {start_page} to {end_page} of {total_pages}...", visible=True)
             )
+            try:
+                images = convert_from_path(
+                    pdf_path,
+                    first_page=start_page,
+                    last_page=end_page,
+                    dpi=150  # Adjustable DPI for quality vs size
+                )
+                for idx, image in enumerate(images, start=start_page):
+                    image_path = DIRECTORY / f"{idx}.png"
+                    image.save(str(image_path), 'PNG')
+            except Exception as e:
+                error_details = traceback.format_exc()
+                yield (
+                    gr.Gallery.update(value=[], visible=True),
+                    gr.File.update(visible=False),
+                    gr.Textbox.update(value=f"Error converting PDF pages {start_page}-{end_page}: {str(e)}\n\nDetails: {error_details}", visible=True)
+                )
+                return
+        # Get list of extracted images
+        yield (
+            gr.Gallery.update(value=[], visible=True),
+            gr.File.update(visible=False),
+            gr.Textbox.update(value="Preparing gallery view...", visible=True)
+        )
         # Get list of extracted images and sort them numerically
         images_pdf_list = get_image_files(DIRECTORY)
         if not images_pdf_list:
+            yield (
+                gr.Gallery.update(value=[], visible=True),
+                gr.File.update(visible=False),
+                gr.Textbox.update(value="No images could be extracted from the PDF.", visible=True)
             )
+            return
         image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
+        try:
+            sorted_names = sorted(image_names, key=lambda x: int(Path(x[1]).stem))
+        except Exception as e:
+            # Fallback to unsorted if sorting fails
+            sorted_names = image_names
+            print(f"Error sorting images: {e}")
         # Create zip file of all images
+        yield (
+            gr.Gallery.update(value=[], visible=True),
+            gr.File.update(visible=False),
+            gr.Textbox.update(value="Creating downloadable zip file...", visible=True)
+        )
         zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
+        zip_success, zip_error = zip_folder(DIRECTORY, zip_path)
+        if zip_success:
+            yield (
                 gr.Gallery.update(
                     value=sorted_names,
                     label=f"Extracted {len(images_pdf_list)} page{'s' if len(images_pdf_list) != 1 else ''}",
                     visible=True
                 ),
+                gr.File.update(value=str(zip_path), visible=True),
+                gr.Textbox.update(value=f"Successfully extracted {len(images_pdf_list)} page{'s' if len(images_pdf_list) != 1 else ''} from PDF.", visible=True)
             )
         else:
+            yield (
+                gr.Gallery.update(
+                    value=sorted_names,
+                    label="Extracted images (zip creation failed)",
+                    visible=True
+                ),
+                gr.File.update(visible=False),
+                gr.Textbox.update(value=f"Images extracted but zip creation failed: {zip_error}", visible=True)
             )
     except Exception as e:
+        error_details = traceback.format_exc()
+        yield (
+            gr.Gallery.update(value=[], visible=True),
+            gr.File.update(visible=False),
+            gr.Textbox.update(value=f"Unexpected error: {str(e)}\n\nDetails: {error_details}", visible=True)
         )
 # Create Gradio interface with improved layout and error handling
                     clear_btn = gr.Button("Clear")
         with gr.Column():
+            status = gr.Textbox(
+                label="Status",
+                value="Upload a PDF and click 'Extract Images'",
+                visible=True
+            )
             gallery = gr.Gallery(
                 label="Extracted Pages",
                 show_label=True,
             examples=[[example_path]],
             fn=extract_photos_from_pdf,
             inputs=[file_pdf],
+            outputs=[gallery, download_btn, status],
             cache_examples=False
         )
     btn.click(
         fn=extract_photos_from_pdf,
         inputs=[file_pdf],
+        outputs=[gallery, download_btn, status],
         api_name="extract"
     )
     clear_btn.click(
         fn=lambda: (
             gr.Gallery.update(value=[], label="Extracted Pages", visible=True),
+            gr.File.update(visible=False),
+            gr.Textbox.update(value="Cleared. Upload a PDF to begin.", visible=True)
         ),
         inputs=[],
+        outputs=[gallery, download_btn, status]
     )
 if __name__ == "__main__":