Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,8 @@ import zipfile
|
|
5 |
import shutil
|
6 |
import tempfile
|
7 |
from pathlib import Path
|
|
|
|
|
8 |
|
9 |
def zip_folder(folder_path, output_path):
|
10 |
"""Create a zip archive from a folder with improved error handling"""
|
@@ -14,10 +16,11 @@ def zip_folder(folder_path, output_path):
|
|
14 |
for file in files:
|
15 |
file_path = os.path.join(root, file)
|
16 |
zipf.write(file_path, os.path.relpath(file_path, folder_path))
|
17 |
-
return True
|
18 |
except Exception as e:
|
19 |
-
|
20 |
-
|
|
|
21 |
|
22 |
# Use more robust directory handling with pathlib
|
23 |
BASE_DIR = Path(tempfile.gettempdir()) / "pdf_extractor"
|
@@ -47,7 +50,7 @@ def clear_directory(directory):
|
|
47 |
"""Safely clear a directory with error handling"""
|
48 |
directory = Path(directory)
|
49 |
if not directory.exists():
|
50 |
-
return
|
51 |
|
52 |
try:
|
53 |
for item in directory.iterdir():
|
@@ -55,25 +58,71 @@ def clear_directory(directory):
|
|
55 |
item.unlink()
|
56 |
elif item.is_dir():
|
57 |
shutil.rmtree(item)
|
|
|
58 |
except Exception as e:
|
59 |
-
|
|
|
|
|
60 |
|
61 |
def extract_photos_from_pdf(file_pdf):
|
62 |
-
"""Extract all pages from a PDF as images"""
|
63 |
-
#
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
66 |
|
|
|
67 |
if file_pdf is None:
|
68 |
-
|
69 |
-
gr.Gallery.update(value=[],
|
70 |
-
gr.File.update(visible=False)
|
|
|
71 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
73 |
try:
|
|
|
74 |
pdf_path = file_pdf.name
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
# Progress tracking variables
|
79 |
batch_size = 10 # Smaller batch size for better progress visibility
|
@@ -81,50 +130,96 @@ def extract_photos_from_pdf(file_pdf):
|
|
81 |
# Process PDF in batches
|
82 |
for start_page in range(1, total_pages + 1, batch_size):
|
83 |
end_page = min(start_page + batch_size - 1, total_pages)
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
|
|
89 |
)
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
# Get list of extracted images and sort them numerically
|
96 |
images_pdf_list = get_image_files(DIRECTORY)
|
97 |
if not images_pdf_list:
|
98 |
-
|
99 |
-
gr.Gallery.update(value=[],
|
100 |
-
gr.File.update(visible=False)
|
|
|
101 |
)
|
|
|
102 |
|
103 |
image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
# Create zip file of all images
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
|
108 |
-
|
109 |
-
|
|
|
|
|
110 |
gr.Gallery.update(
|
111 |
value=sorted_names,
|
112 |
label=f"Extracted {len(images_pdf_list)} page{'s' if len(images_pdf_list) != 1 else ''}",
|
113 |
visible=True
|
114 |
),
|
115 |
-
gr.File.update(value=str(zip_path), visible=True)
|
|
|
116 |
)
|
117 |
else:
|
118 |
-
|
119 |
-
gr.Gallery.update(
|
120 |
-
|
|
|
|
|
|
|
|
|
|
|
121 |
)
|
122 |
|
123 |
except Exception as e:
|
124 |
-
|
125 |
-
|
126 |
-
gr.Gallery.update(value=[],
|
127 |
-
gr.File.update(visible=False)
|
|
|
128 |
)
|
129 |
|
130 |
# Create Gradio interface with improved layout and error handling
|
@@ -149,9 +244,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
149 |
clear_btn = gr.Button("Clear")
|
150 |
|
151 |
with gr.Column():
|
152 |
-
status = gr.Textbox(
|
153 |
-
|
154 |
-
|
|
|
|
|
155 |
gallery = gr.Gallery(
|
156 |
label="Extracted Pages",
|
157 |
show_label=True,
|
@@ -172,7 +269,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
172 |
examples=[[example_path]],
|
173 |
fn=extract_photos_from_pdf,
|
174 |
inputs=[file_pdf],
|
175 |
-
outputs=[gallery, download_btn],
|
176 |
cache_examples=False
|
177 |
)
|
178 |
|
@@ -180,17 +277,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
180 |
btn.click(
|
181 |
fn=extract_photos_from_pdf,
|
182 |
inputs=[file_pdf],
|
183 |
-
outputs=[gallery, download_btn],
|
184 |
api_name="extract"
|
185 |
)
|
186 |
|
187 |
clear_btn.click(
|
188 |
fn=lambda: (
|
189 |
gr.Gallery.update(value=[], label="Extracted Pages", visible=True),
|
190 |
-
gr.File.update(visible=False)
|
|
|
191 |
),
|
192 |
inputs=[],
|
193 |
-
outputs=[gallery, download_btn]
|
194 |
)
|
195 |
|
196 |
if __name__ == "__main__":
|
|
|
5 |
import shutil
|
6 |
import tempfile
|
7 |
from pathlib import Path
|
8 |
+
import traceback
|
9 |
+
import sys
|
10 |
|
11 |
def zip_folder(folder_path, output_path):
|
12 |
"""Create a zip archive from a folder with improved error handling"""
|
|
|
16 |
for file in files:
|
17 |
file_path = os.path.join(root, file)
|
18 |
zipf.write(file_path, os.path.relpath(file_path, folder_path))
|
19 |
+
return True, ""
|
20 |
except Exception as e:
|
21 |
+
error_msg = f"Error creating zip file: {str(e)}"
|
22 |
+
print(error_msg)
|
23 |
+
return False, error_msg
|
24 |
|
25 |
# Use more robust directory handling with pathlib
|
26 |
BASE_DIR = Path(tempfile.gettempdir()) / "pdf_extractor"
|
|
|
50 |
"""Safely clear a directory with error handling"""
|
51 |
directory = Path(directory)
|
52 |
if not directory.exists():
|
53 |
+
return True, ""
|
54 |
|
55 |
try:
|
56 |
for item in directory.iterdir():
|
|
|
58 |
item.unlink()
|
59 |
elif item.is_dir():
|
60 |
shutil.rmtree(item)
|
61 |
+
return True, ""
|
62 |
except Exception as e:
|
63 |
+
error_msg = f"Failed to clear directory {directory}. Reason: {str(e)}"
|
64 |
+
print(error_msg)
|
65 |
+
return False, error_msg
|
66 |
|
67 |
def extract_photos_from_pdf(file_pdf):
|
68 |
+
"""Extract all pages from a PDF as images with comprehensive error handling"""
|
69 |
+
# Update status at the beginning
|
70 |
+
yield (
|
71 |
+
gr.Gallery.update(value=[], visible=True),
|
72 |
+
gr.File.update(visible=False),
|
73 |
+
gr.Textbox.update(value="Starting extraction process...", visible=True)
|
74 |
+
)
|
75 |
|
76 |
+
# Check if file is provided
|
77 |
if file_pdf is None:
|
78 |
+
yield (
|
79 |
+
gr.Gallery.update(value=[], visible=True),
|
80 |
+
gr.File.update(visible=False),
|
81 |
+
gr.Textbox.update(value="Error: No file uploaded", visible=True)
|
82 |
)
|
83 |
+
return
|
84 |
+
|
85 |
+
# Clear directories for new extraction
|
86 |
+
clear_success, clear_error = clear_directory(DIRECTORY)
|
87 |
+
if not clear_success:
|
88 |
+
yield (
|
89 |
+
gr.Gallery.update(value=[], visible=True),
|
90 |
+
gr.File.update(visible=False),
|
91 |
+
gr.Textbox.update(value=f"Error clearing directories: {clear_error}", visible=True)
|
92 |
+
)
|
93 |
+
return
|
94 |
+
|
95 |
+
clear_success, clear_error = clear_directory(DIRECTORY_OUTPUT)
|
96 |
+
if not clear_success:
|
97 |
+
yield (
|
98 |
+
gr.Gallery.update(value=[], visible=True),
|
99 |
+
gr.File.update(visible=False),
|
100 |
+
gr.Textbox.update(value=f"Error clearing output directory: {clear_error}", visible=True)
|
101 |
+
)
|
102 |
+
return
|
103 |
|
104 |
try:
|
105 |
+
# Get PDF path and info
|
106 |
pdf_path = file_pdf.name
|
107 |
+
|
108 |
+
# Update status
|
109 |
+
yield (
|
110 |
+
gr.Gallery.update(value=[], visible=True),
|
111 |
+
gr.File.update(visible=False),
|
112 |
+
gr.Textbox.update(value="Reading PDF information...", visible=True)
|
113 |
+
)
|
114 |
+
|
115 |
+
try:
|
116 |
+
info = pdfinfo_from_path(pdf_path)
|
117 |
+
total_pages = info["Pages"]
|
118 |
+
except Exception as e:
|
119 |
+
error_details = traceback.format_exc()
|
120 |
+
yield (
|
121 |
+
gr.Gallery.update(value=[], visible=True),
|
122 |
+
gr.File.update(visible=False),
|
123 |
+
gr.Textbox.update(value=f"Error reading PDF: {str(e)}\n\nDetails: {error_details}", visible=True)
|
124 |
+
)
|
125 |
+
return
|
126 |
|
127 |
# Progress tracking variables
|
128 |
batch_size = 10 # Smaller batch size for better progress visibility
|
|
|
130 |
# Process PDF in batches
|
131 |
for start_page in range(1, total_pages + 1, batch_size):
|
132 |
end_page = min(start_page + batch_size - 1, total_pages)
|
133 |
+
|
134 |
+
# Update status
|
135 |
+
yield (
|
136 |
+
gr.Gallery.update(value=[], visible=True),
|
137 |
+
gr.File.update(visible=False),
|
138 |
+
gr.Textbox.update(value=f"Processing pages {start_page} to {end_page} of {total_pages}...", visible=True)
|
139 |
)
|
140 |
|
141 |
+
try:
|
142 |
+
images = convert_from_path(
|
143 |
+
pdf_path,
|
144 |
+
first_page=start_page,
|
145 |
+
last_page=end_page,
|
146 |
+
dpi=150 # Adjustable DPI for quality vs size
|
147 |
+
)
|
148 |
+
|
149 |
+
for idx, image in enumerate(images, start=start_page):
|
150 |
+
image_path = DIRECTORY / f"{idx}.png"
|
151 |
+
image.save(str(image_path), 'PNG')
|
152 |
+
except Exception as e:
|
153 |
+
error_details = traceback.format_exc()
|
154 |
+
yield (
|
155 |
+
gr.Gallery.update(value=[], visible=True),
|
156 |
+
gr.File.update(visible=False),
|
157 |
+
gr.Textbox.update(value=f"Error converting PDF pages {start_page}-{end_page}: {str(e)}\n\nDetails: {error_details}", visible=True)
|
158 |
+
)
|
159 |
+
return
|
160 |
+
|
161 |
+
# Update status before building the gallery view
|
162 |
+
yield (
|
163 |
+
gr.Gallery.update(value=[], visible=True),
|
164 |
+
gr.File.update(visible=False),
|
165 |
+
gr.Textbox.update(value="Preparing gallery view...", visible=True)
|
166 |
+
)
|
167 |
|
168 |
# Get list of extracted images and sort them numerically
|
169 |
images_pdf_list = get_image_files(DIRECTORY)
|
170 |
if not images_pdf_list:
|
171 |
+
yield (
|
172 |
+
gr.Gallery.update(value=[], visible=True),
|
173 |
+
gr.File.update(visible=False),
|
174 |
+
gr.Textbox.update(value="No images could be extracted from the PDF.", visible=True)
|
175 |
)
|
176 |
+
return
|
177 |
|
178 |
image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
|
179 |
+
try:
|
180 |
+
sorted_names = sorted(image_names, key=lambda x: int(Path(x[1]).stem))
|
181 |
+
except Exception as e:
|
182 |
+
# Fallback to unsorted if sorting fails
|
183 |
+
sorted_names = image_names
|
184 |
+
print(f"Error sorting images: {e}")
|
185 |
|
186 |
# Create zip file of all images
|
187 |
+
yield (
|
188 |
+
gr.Gallery.update(value=[], visible=True),
|
189 |
+
gr.File.update(visible=False),
|
190 |
+
gr.Textbox.update(value="Creating downloadable zip file...", visible=True)
|
191 |
+
)
|
192 |
+
|
193 |
zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
|
194 |
+
zip_success, zip_error = zip_folder(DIRECTORY, zip_path)
|
195 |
+
|
196 |
+
if zip_success:
|
197 |
+
yield (
|
198 |
gr.Gallery.update(
|
199 |
value=sorted_names,
|
200 |
label=f"Extracted {len(images_pdf_list)} page{'s' if len(images_pdf_list) != 1 else ''}",
|
201 |
visible=True
|
202 |
),
|
203 |
+
gr.File.update(value=str(zip_path), visible=True),
|
204 |
+
gr.Textbox.update(value=f"Successfully extracted {len(images_pdf_list)} page{'s' if len(images_pdf_list) != 1 else ''} from PDF.", visible=True)
|
205 |
)
|
206 |
else:
|
207 |
+
yield (
|
208 |
+
gr.Gallery.update(
|
209 |
+
value=sorted_names,
|
210 |
+
label="Extracted images (zip creation failed)",
|
211 |
+
visible=True
|
212 |
+
),
|
213 |
+
gr.File.update(visible=False),
|
214 |
+
gr.Textbox.update(value=f"Images extracted but zip creation failed: {zip_error}", visible=True)
|
215 |
)
|
216 |
|
217 |
except Exception as e:
|
218 |
+
error_details = traceback.format_exc()
|
219 |
+
yield (
|
220 |
+
gr.Gallery.update(value=[], visible=True),
|
221 |
+
gr.File.update(visible=False),
|
222 |
+
gr.Textbox.update(value=f"Unexpected error: {str(e)}\n\nDetails: {error_details}", visible=True)
|
223 |
)
|
224 |
|
225 |
# Create Gradio interface with improved layout and error handling
|
|
|
244 |
clear_btn = gr.Button("Clear")
|
245 |
|
246 |
with gr.Column():
|
247 |
+
status = gr.Textbox(
|
248 |
+
label="Status",
|
249 |
+
value="Upload a PDF and click 'Extract Images'",
|
250 |
+
visible=True
|
251 |
+
)
|
252 |
gallery = gr.Gallery(
|
253 |
label="Extracted Pages",
|
254 |
show_label=True,
|
|
|
269 |
examples=[[example_path]],
|
270 |
fn=extract_photos_from_pdf,
|
271 |
inputs=[file_pdf],
|
272 |
+
outputs=[gallery, download_btn, status],
|
273 |
cache_examples=False
|
274 |
)
|
275 |
|
|
|
277 |
btn.click(
|
278 |
fn=extract_photos_from_pdf,
|
279 |
inputs=[file_pdf],
|
280 |
+
outputs=[gallery, download_btn, status],
|
281 |
api_name="extract"
|
282 |
)
|
283 |
|
284 |
clear_btn.click(
|
285 |
fn=lambda: (
|
286 |
gr.Gallery.update(value=[], label="Extracted Pages", visible=True),
|
287 |
+
gr.File.update(visible=False),
|
288 |
+
gr.Textbox.update(value="Cleared. Upload a PDF to begin.", visible=True)
|
289 |
),
|
290 |
inputs=[],
|
291 |
+
outputs=[gallery, download_btn, status]
|
292 |
)
|
293 |
|
294 |
if __name__ == "__main__":
|