Dejansimic committed on
Commit
e8cd67b
·
verified ·
1 Parent(s): 93bf38c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -43
app.py CHANGED
@@ -5,6 +5,8 @@ import zipfile
5
  import shutil
6
  import tempfile
7
  from pathlib import Path
 
 
8
 
9
  def zip_folder(folder_path, output_path):
10
  """Create a zip archive from a folder with improved error handling"""
@@ -14,10 +16,11 @@ def zip_folder(folder_path, output_path):
14
  for file in files:
15
  file_path = os.path.join(root, file)
16
  zipf.write(file_path, os.path.relpath(file_path, folder_path))
17
- return True
18
  except Exception as e:
19
- print(f"Error creating zip file: {e}")
20
- return False
 
21
 
22
  # Use more robust directory handling with pathlib
23
  BASE_DIR = Path(tempfile.gettempdir()) / "pdf_extractor"
@@ -47,7 +50,7 @@ def clear_directory(directory):
47
  """Safely clear a directory with error handling"""
48
  directory = Path(directory)
49
  if not directory.exists():
50
- return
51
 
52
  try:
53
  for item in directory.iterdir():
@@ -55,25 +58,71 @@ def clear_directory(directory):
55
  item.unlink()
56
  elif item.is_dir():
57
  shutil.rmtree(item)
 
58
  except Exception as e:
59
- print(f"Failed to clear directory {directory}. Reason: {e}")
 
 
60
 
61
  def extract_photos_from_pdf(file_pdf):
62
- """Extract all pages from a PDF as images"""
63
- # Clear directories for new extraction
64
- clear_directory(DIRECTORY)
65
- clear_directory(DIRECTORY_OUTPUT)
 
 
 
66
 
 
67
  if file_pdf is None:
68
- return (
69
- gr.Gallery.update(value=[], label="No file uploaded", visible=True),
70
- gr.File.update(visible=False)
 
71
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  try:
 
74
  pdf_path = file_pdf.name
75
- info = pdfinfo_from_path(pdf_path)
76
- total_pages = info["Pages"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  # Progress tracking variables
79
  batch_size = 10 # Smaller batch size for better progress visibility
@@ -81,50 +130,96 @@ def extract_photos_from_pdf(file_pdf):
81
  # Process PDF in batches
82
  for start_page in range(1, total_pages + 1, batch_size):
83
  end_page = min(start_page + batch_size - 1, total_pages)
84
- images = convert_from_path(
85
- pdf_path,
86
- first_page=start_page,
87
- last_page=end_page,
88
- dpi=150 # Adjustable DPI for quality vs size
 
89
  )
90
 
91
- for idx, image in enumerate(images, start=start_page):
92
- image_path = DIRECTORY / f"{idx}.png"
93
- image.save(str(image_path), 'PNG')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  # Get list of extracted images and sort them numerically
96
  images_pdf_list = get_image_files(DIRECTORY)
97
  if not images_pdf_list:
98
- return (
99
- gr.Gallery.update(value=[], label="No images extracted", visible=True),
100
- gr.File.update(visible=False)
 
101
  )
 
102
 
103
  image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
104
- sorted_names = sorted(image_names, key=lambda x: int(Path(x[1]).stem))
 
 
 
 
 
105
 
106
  # Create zip file of all images
 
 
 
 
 
 
107
  zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
108
- if zip_folder(DIRECTORY, zip_path):
109
- return (
 
 
110
  gr.Gallery.update(
111
  value=sorted_names,
112
  label=f"Extracted {len(images_pdf_list)} page{'s' if len(images_pdf_list) != 1 else ''}",
113
  visible=True
114
  ),
115
- gr.File.update(value=str(zip_path), visible=True)
 
116
  )
117
  else:
118
- return (
119
- gr.Gallery.update(value=sorted_names, label="Extracted images (zip creation failed)", visible=True),
120
- gr.File.update(visible=False)
 
 
 
 
 
121
  )
122
 
123
  except Exception as e:
124
- print(f"Error extracting PDF: {e}")
125
- return (
126
- gr.Gallery.update(value=[], label=f"Error: {str(e)}", visible=True),
127
- gr.File.update(visible=False)
 
128
  )
129
 
130
  # Create Gradio interface with improved layout and error handling
@@ -149,9 +244,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
149
  clear_btn = gr.Button("Clear")
150
 
151
  with gr.Column():
152
- status = gr.Textbox(label="Status", visible=True)
153
- # Opraveno: V novějších verzích Gradio, Gallery nemá metodu style
154
- # Místo toho nastavujeme parametry přímo při vytváření
 
 
155
  gallery = gr.Gallery(
156
  label="Extracted Pages",
157
  show_label=True,
@@ -172,7 +269,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
172
  examples=[[example_path]],
173
  fn=extract_photos_from_pdf,
174
  inputs=[file_pdf],
175
- outputs=[gallery, download_btn],
176
  cache_examples=False
177
  )
178
 
@@ -180,17 +277,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
180
  btn.click(
181
  fn=extract_photos_from_pdf,
182
  inputs=[file_pdf],
183
- outputs=[gallery, download_btn],
184
  api_name="extract"
185
  )
186
 
187
  clear_btn.click(
188
  fn=lambda: (
189
  gr.Gallery.update(value=[], label="Extracted Pages", visible=True),
190
- gr.File.update(visible=False)
 
191
  ),
192
  inputs=[],
193
- outputs=[gallery, download_btn]
194
  )
195
 
196
  if __name__ == "__main__":
 
5
  import shutil
6
  import tempfile
7
  from pathlib import Path
8
+ import traceback
9
+ import sys
10
 
11
  def zip_folder(folder_path, output_path):
12
  """Create a zip archive from a folder with improved error handling"""
 
16
  for file in files:
17
  file_path = os.path.join(root, file)
18
  zipf.write(file_path, os.path.relpath(file_path, folder_path))
19
+ return True, ""
20
  except Exception as e:
21
+ error_msg = f"Error creating zip file: {str(e)}"
22
+ print(error_msg)
23
+ return False, error_msg
24
 
25
  # Use more robust directory handling with pathlib
26
  BASE_DIR = Path(tempfile.gettempdir()) / "pdf_extractor"
 
50
  """Safely clear a directory with error handling"""
51
  directory = Path(directory)
52
  if not directory.exists():
53
+ return True, ""
54
 
55
  try:
56
  for item in directory.iterdir():
 
58
  item.unlink()
59
  elif item.is_dir():
60
  shutil.rmtree(item)
61
+ return True, ""
62
  except Exception as e:
63
+ error_msg = f"Failed to clear directory {directory}. Reason: {str(e)}"
64
+ print(error_msg)
65
+ return False, error_msg
66
 
67
  def extract_photos_from_pdf(file_pdf):
68
+ """Extract all pages from a PDF as images with comprehensive error handling"""
69
+ # Update status at the beginning
70
+ yield (
71
+ gr.Gallery.update(value=[], visible=True),
72
+ gr.File.update(visible=False),
73
+ gr.Textbox.update(value="Starting extraction process...", visible=True)
74
+ )
75
 
76
+ # Check if file is provided
77
  if file_pdf is None:
78
+ yield (
79
+ gr.Gallery.update(value=[], visible=True),
80
+ gr.File.update(visible=False),
81
+ gr.Textbox.update(value="Error: No file uploaded", visible=True)
82
  )
83
+ return
84
+
85
+ # Clear directories for new extraction
86
+ clear_success, clear_error = clear_directory(DIRECTORY)
87
+ if not clear_success:
88
+ yield (
89
+ gr.Gallery.update(value=[], visible=True),
90
+ gr.File.update(visible=False),
91
+ gr.Textbox.update(value=f"Error clearing directories: {clear_error}", visible=True)
92
+ )
93
+ return
94
+
95
+ clear_success, clear_error = clear_directory(DIRECTORY_OUTPUT)
96
+ if not clear_success:
97
+ yield (
98
+ gr.Gallery.update(value=[], visible=True),
99
+ gr.File.update(visible=False),
100
+ gr.Textbox.update(value=f"Error clearing output directory: {clear_error}", visible=True)
101
+ )
102
+ return
103
 
104
  try:
105
+ # Get PDF path and info
106
  pdf_path = file_pdf.name
107
+
108
+ # Update status
109
+ yield (
110
+ gr.Gallery.update(value=[], visible=True),
111
+ gr.File.update(visible=False),
112
+ gr.Textbox.update(value="Reading PDF information...", visible=True)
113
+ )
114
+
115
+ try:
116
+ info = pdfinfo_from_path(pdf_path)
117
+ total_pages = info["Pages"]
118
+ except Exception as e:
119
+ error_details = traceback.format_exc()
120
+ yield (
121
+ gr.Gallery.update(value=[], visible=True),
122
+ gr.File.update(visible=False),
123
+ gr.Textbox.update(value=f"Error reading PDF: {str(e)}\n\nDetails: {error_details}", visible=True)
124
+ )
125
+ return
126
 
127
  # Progress tracking variables
128
  batch_size = 10 # Smaller batch size for better progress visibility
 
130
  # Process PDF in batches
131
  for start_page in range(1, total_pages + 1, batch_size):
132
  end_page = min(start_page + batch_size - 1, total_pages)
133
+
134
+ # Update status
135
+ yield (
136
+ gr.Gallery.update(value=[], visible=True),
137
+ gr.File.update(visible=False),
138
+ gr.Textbox.update(value=f"Processing pages {start_page} to {end_page} of {total_pages}...", visible=True)
139
  )
140
 
141
+ try:
142
+ images = convert_from_path(
143
+ pdf_path,
144
+ first_page=start_page,
145
+ last_page=end_page,
146
+ dpi=150 # Adjustable DPI for quality vs size
147
+ )
148
+
149
+ for idx, image in enumerate(images, start=start_page):
150
+ image_path = DIRECTORY / f"{idx}.png"
151
+ image.save(str(image_path), 'PNG')
152
+ except Exception as e:
153
+ error_details = traceback.format_exc()
154
+ yield (
155
+ gr.Gallery.update(value=[], visible=True),
156
+ gr.File.update(visible=False),
157
+ gr.Textbox.update(value=f"Error converting PDF pages {start_page}-{end_page}: {str(e)}\n\nDetails: {error_details}", visible=True)
158
+ )
159
+ return
160
+
161
+ # Get list of extracted images
162
+ yield (
163
+ gr.Gallery.update(value=[], visible=True),
164
+ gr.File.update(visible=False),
165
+ gr.Textbox.update(value="Preparing gallery view...", visible=True)
166
+ )
167
 
168
  # Get list of extracted images and sort them numerically
169
  images_pdf_list = get_image_files(DIRECTORY)
170
  if not images_pdf_list:
171
+ yield (
172
+ gr.Gallery.update(value=[], visible=True),
173
+ gr.File.update(visible=False),
174
+ gr.Textbox.update(value="No images could be extracted from the PDF.", visible=True)
175
  )
176
+ return
177
 
178
  image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
179
+ try:
180
+ sorted_names = sorted(image_names, key=lambda x: int(Path(x[1]).stem))
181
+ except Exception as e:
182
+ # Fallback to unsorted if sorting fails
183
+ sorted_names = image_names
184
+ print(f"Error sorting images: {e}")
185
 
186
  # Create zip file of all images
187
+ yield (
188
+ gr.Gallery.update(value=[], visible=True),
189
+ gr.File.update(visible=False),
190
+ gr.Textbox.update(value="Creating downloadable zip file...", visible=True)
191
+ )
192
+
193
  zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
194
+ zip_success, zip_error = zip_folder(DIRECTORY, zip_path)
195
+
196
+ if zip_success:
197
+ yield (
198
  gr.Gallery.update(
199
  value=sorted_names,
200
  label=f"Extracted {len(images_pdf_list)} page{'s' if len(images_pdf_list) != 1 else ''}",
201
  visible=True
202
  ),
203
+ gr.File.update(value=str(zip_path), visible=True),
204
+ gr.Textbox.update(value=f"Successfully extracted {len(images_pdf_list)} page{'s' if len(images_pdf_list) != 1 else ''} from PDF.", visible=True)
205
  )
206
  else:
207
+ yield (
208
+ gr.Gallery.update(
209
+ value=sorted_names,
210
+ label="Extracted images (zip creation failed)",
211
+ visible=True
212
+ ),
213
+ gr.File.update(visible=False),
214
+ gr.Textbox.update(value=f"Images extracted but zip creation failed: {zip_error}", visible=True)
215
  )
216
 
217
  except Exception as e:
218
+ error_details = traceback.format_exc()
219
+ yield (
220
+ gr.Gallery.update(value=[], visible=True),
221
+ gr.File.update(visible=False),
222
+ gr.Textbox.update(value=f"Unexpected error: {str(e)}\n\nDetails: {error_details}", visible=True)
223
  )
224
 
225
  # Create Gradio interface with improved layout and error handling
 
244
  clear_btn = gr.Button("Clear")
245
 
246
  with gr.Column():
247
+ status = gr.Textbox(
248
+ label="Status",
249
+ value="Upload a PDF and click 'Extract Images'",
250
+ visible=True
251
+ )
252
  gallery = gr.Gallery(
253
  label="Extracted Pages",
254
  show_label=True,
 
269
  examples=[[example_path]],
270
  fn=extract_photos_from_pdf,
271
  inputs=[file_pdf],
272
+ outputs=[gallery, download_btn, status],
273
  cache_examples=False
274
  )
275
 
 
277
  btn.click(
278
  fn=extract_photos_from_pdf,
279
  inputs=[file_pdf],
280
+ outputs=[gallery, download_btn, status],
281
  api_name="extract"
282
  )
283
 
284
  clear_btn.click(
285
  fn=lambda: (
286
  gr.Gallery.update(value=[], label="Extracted Pages", visible=True),
287
+ gr.File.update(visible=False),
288
+ gr.Textbox.update(value="Cleared. Upload a PDF to begin.", visible=True)
289
  ),
290
  inputs=[],
291
+ outputs=[gallery, download_btn, status]
292
  )
293
 
294
  if __name__ == "__main__":