Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -20,11 +20,13 @@ import requests # For requests.exceptions.HTTPError
|
|
20 |
from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
|
21 |
|
22 |
import pdfplumber
|
23 |
-
|
|
|
|
|
|
|
24 |
import pytesseract
|
25 |
from PIL import Image
|
26 |
from huggingface_hub import HfApi, create_repo
|
27 |
-
# from huggingface_hub.utils import HfHubHTTPError # This was the incorrect one
|
28 |
|
29 |
# --- Flask App Initialization ---
|
30 |
app = Flask(__name__)
|
@@ -69,34 +71,28 @@ def ensure_hf_dataset():
|
|
69 |
logger.warning(msg)
|
70 |
return "Error: " + msg
|
71 |
try:
|
72 |
-
# create_repo can raise huggingface_hub.utils.RepositoryNotFoundError,
|
73 |
-
# huggingface_hub.utils.HfHubHTTPError (which inherits from requests.HTTPError for some cases),
|
74 |
-
# or other requests.exceptions
|
75 |
repo_id_obj = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
|
76 |
logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
|
77 |
return repo_id_obj.repo_id
|
78 |
-
except RequestsHTTPError as e:
|
79 |
-
if e.response is not None and e.response.status_code == 409:
|
80 |
logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists (HTTP 409).")
|
81 |
-
# Attempt to construct the full repo_id (namespace/repo_name)
|
82 |
try:
|
83 |
-
user_info = hf_api.whoami(token=HF_TOKEN)
|
84 |
namespace = user_info.get('name') if user_info else None
|
85 |
if namespace:
|
86 |
return f"{namespace}/{HF_DATASET_REPO_NAME}"
|
87 |
-
else:
|
88 |
logger.warning(f"Could not determine namespace for existing repo '{HF_DATASET_REPO_NAME}'. Using generic ID.")
|
89 |
-
return HF_DATASET_REPO_NAME
|
90 |
except Exception as whoami_e:
|
91 |
logger.error(f"Could not determine namespace for existing repo via whoami due to: {whoami_e}. Using generic ID.")
|
92 |
-
return HF_DATASET_REPO_NAME
|
93 |
-
else:
|
94 |
status_code = e.response.status_code if e.response is not None else "Unknown"
|
95 |
logger.error(f"Hugging Face dataset HTTP error (Status: {status_code}): {str(e)}")
|
96 |
return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}' due to HTTP error: {str(e)}"
|
97 |
-
except Exception as e:
|
98 |
-
# This could be an Hf RepoExistsError if exist_ok=False, or other utility errors.
|
99 |
-
# For exist_ok=True, a 409 is the more likely signal for existing repo.
|
100 |
logger.error(f"Hugging Face dataset general error: {str(e)}", exc_info=True)
|
101 |
return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
|
102 |
|
@@ -112,6 +108,8 @@ def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
|
|
112 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
113 |
repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
|
114 |
|
|
|
|
|
115 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
|
116 |
temp_image_path = tmp_file.name
|
117 |
image_pil.save(temp_image_path, format="PNG")
|
@@ -123,7 +121,7 @@ def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
|
|
123 |
)
|
124 |
logger.info(f"Successfully uploaded image: {file_url}")
|
125 |
return file_url
|
126 |
-
except Exception as e:
|
127 |
logger.error(f"Image upload error for {filename_base}{page_num_for_log}: {str(e)}", exc_info=True)
|
128 |
return f"Error uploading image {filename_base}{page_num_for_log}: {str(e)}"
|
129 |
finally:
|
@@ -177,17 +175,17 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
|
|
177 |
|
178 |
if source_is_url:
|
179 |
try:
|
180 |
-
response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
|
181 |
response.raise_for_status()
|
182 |
pdf_bytes_for_images = response.content
|
183 |
pdf_handle_for_text = io.BytesIO(pdf_bytes_for_images)
|
184 |
yield yield_message("status", {"message": f"PDF downloaded from URL ({len(pdf_bytes_for_images)/1024:.2f} KB)."})
|
185 |
time.sleep(0.01)
|
186 |
-
except RequestsHTTPError as e:
|
187 |
logger.error(f"URL fetch HTTP error for PDF processing: {str(e)} (Status: {e.response.status_code if e.response else 'N/A'})", exc_info=True)
|
188 |
yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {e.response.status_code if e.response else 'N/A'}): {e.response.reason if e.response else str(e)}"})
|
189 |
return
|
190 |
-
except requests.RequestException as e:
|
191 |
logger.error(f"URL fetch network error for PDF processing: {str(e)}", exc_info=True)
|
192 |
yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
|
193 |
return
|
@@ -211,7 +209,7 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
|
|
211 |
tables = page.extract_tables()
|
212 |
if tables:
|
213 |
for table_idx, table_data in enumerate(tables):
|
214 |
-
if table_data and len(table_data) > 0 and len(table_data[0]) > 0 :
|
215 |
yield yield_message("status", {"message": f" Processing table {table_idx+1} on page {i+1}..."})
|
216 |
header_cells = table_data[0]
|
217 |
header = [" | ".join(str(cell) if cell is not None else "" for cell in header_cells)]
|
@@ -229,9 +227,7 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
|
|
229 |
except Exception as e:
|
230 |
logger.error(f"Error during PDF text/table extraction: {str(e)}", exc_info=True)
|
231 |
yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
|
232 |
-
# Decide whether to return here or continue to image extraction. Let's try to continue.
|
233 |
|
234 |
-
# 2. Image Extraction and OCR
|
235 |
if not check_poppler():
|
236 |
yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
|
237 |
else:
|
@@ -241,97 +237,107 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
|
|
241 |
yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})
|
242 |
|
243 |
time.sleep(0.01)
|
244 |
-
|
245 |
try:
|
246 |
image_source_for_convert = None
|
247 |
if source_is_url and pdf_bytes_for_images:
|
248 |
image_source_for_convert = pdf_bytes_for_images
|
249 |
logger.info("Using downloaded bytes for image conversion.")
|
250 |
elif not source_is_url:
|
251 |
-
image_source_for_convert = pdf_input_source_path_or_url
|
252 |
logger.info("Using local file path for image conversion.")
|
253 |
|
254 |
if image_source_for_convert:
|
255 |
-
# Attempt to get page count for more granular image processing if pdf2image is the bottleneck
|
256 |
try:
|
257 |
pdf_info = None
|
258 |
if isinstance(image_source_for_convert, bytes):
|
259 |
pdf_info = pdf2image.pdfinfo_from_bytes(image_source_for_convert, userpw=None, poppler_path=None)
|
260 |
-
else:
|
261 |
pdf_info = pdf2image.pdfinfo_from_path(image_source_for_convert, userpw=None, poppler_path=None)
|
262 |
|
263 |
num_image_pages = pdf_info.get("Pages", 0)
|
264 |
yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for potential image extraction."})
|
265 |
|
266 |
-
|
267 |
-
batch_size = 1 # Process one page at a time for images
|
268 |
for page_idx_start in range(1, num_image_pages + 1, batch_size):
|
269 |
page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
|
270 |
-
yield yield_message("status", {"message": f"Extracting images from page(s) {page_idx_start}-{page_idx_end}..."})
|
271 |
time.sleep(0.01)
|
272 |
|
273 |
page_images_pil = []
|
274 |
if isinstance(image_source_for_convert, bytes):
|
275 |
page_images_pil = convert_from_bytes(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
|
276 |
-
else:
|
277 |
page_images_pil = convert_from_path(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
|
278 |
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
yield yield_message("status", {"message": f"Processing image {current_image_index} (from PDF page {page_num_for_log}) (OCR & Upload)..."})
|
286 |
time.sleep(0.01)
|
287 |
|
288 |
ocr_text = ""
|
289 |
try:
|
290 |
ocr_text = pytesseract.image_to_string(img_pil).strip()
|
291 |
-
if ocr_text: yield yield_message("status", {"message": f" OCR successful for image {
|
292 |
except Exception as ocr_e:
|
293 |
-
logger.error(f"OCR error for image {
|
294 |
ocr_text = f"OCR failed: {str(ocr_e)}"
|
295 |
|
296 |
image_md_chunk = ""
|
297 |
if HF_TOKEN:
|
298 |
image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image", page_num_for_log)
|
299 |
if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
|
300 |
-
image_md_chunk += f"![Image {
|
301 |
-
yield yield_message("status", {"message": f" Image {
|
302 |
else:
|
303 |
-
image_md_chunk += f"**Image {
|
304 |
-
yield yield_message("error", {"message": f"Failed to upload image {
|
305 |
else:
|
306 |
-
image_md_chunk += f"**Image {
|
307 |
|
308 |
if ocr_text:
|
309 |
-
image_md_chunk += f"**Image {
|
310 |
|
311 |
yield yield_message("image_md", {"content": image_md_chunk})
|
312 |
time.sleep(0.01)
|
313 |
except Exception as e_img_info:
|
314 |
logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
|
315 |
yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk."})
|
316 |
-
# Fallback to bulk conversion
|
|
|
317 |
if isinstance(image_source_for_convert, bytes):
|
318 |
-
|
319 |
-
else:
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
yield yield_message("image_md", {"content": image_md_chunk})
|
329 |
time.sleep(0.01)
|
330 |
|
331 |
-
else:
|
332 |
-
yield yield_message("status", {"message": "No valid source
|
333 |
|
334 |
-
except Exception as e:
|
335 |
logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
|
336 |
yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})
|
337 |
|
@@ -353,15 +359,10 @@ def process_pdf_stream():
|
|
353 |
pdf_file = request.files.get('pdf_file')
|
354 |
pdf_url = request.form.get('pdf_url', '').strip()
|
355 |
|
356 |
-
|
357 |
-
# and accessed in finally. Or pass it around.
|
358 |
-
# For simplicity, we'll rely on the generator's finally block if it's created within.
|
359 |
-
# Here, temp_pdf_path is primarily for the *uploaded* file before passing its path.
|
360 |
-
|
361 |
-
outer_temp_pdf_path = None # For uploaded file cleanup
|
362 |
|
363 |
def stream_processor():
|
364 |
-
nonlocal outer_temp_pdf_path
|
365 |
pdf_input_source_for_generator = None
|
366 |
|
367 |
try:
|
@@ -375,7 +376,7 @@ def process_pdf_stream():
|
|
375 |
fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
|
376 |
os.close(fd)
|
377 |
pdf_file.save(temp_path)
|
378 |
-
outer_temp_pdf_path = temp_path
|
379 |
logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
|
380 |
pdf_input_source_for_generator = outer_temp_pdf_path
|
381 |
yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
|
@@ -400,17 +401,11 @@ def process_pdf_stream():
|
|
400 |
except Exception as e:
|
401 |
logger.error(f"Error setting up stream or in initial validation: {str(e)}", exc_info=True)
|
402 |
yield yield_message("error", {"message": f"Setup error: {str(e)}"})
|
403 |
-
# The 'finally' block for cleaning outer_temp_pdf_path will be outside this generator,
|
404 |
-
# in the main route function after the Response is fully generated.
|
405 |
-
# However, with stream_with_context, the 'finally' here is better.
|
406 |
finally:
|
407 |
if outer_temp_pdf_path and os.path.exists(outer_temp_pdf_path):
|
408 |
try:
|
409 |
os.remove(outer_temp_pdf_path)
|
410 |
logger.info(f"Cleaned up temporary PDF: {outer_temp_pdf_path}")
|
411 |
-
# Yielding from finally inside a generator that's part of a streamed response can be tricky.
|
412 |
-
# It's better if status messages about cleanup are logged or handled differently.
|
413 |
-
# For this case, logging is sufficient.
|
414 |
except OSError as ose:
|
415 |
logger.error(f"Error removing temporary PDF {outer_temp_pdf_path}: {ose}")
|
416 |
|
|
|
20 |
from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
|
21 |
|
22 |
import pdfplumber
|
23 |
+
import pdf2image # <<<<<<<<<<<<<<<< CORRECTED: Added this import
|
24 |
+
from pdf2image import convert_from_path, convert_from_bytes # Keep these for direct use too
|
25 |
+
# from pdf2image.exceptions import ... # If you need to catch specific pdf2image errors
|
26 |
+
|
27 |
import pytesseract
|
28 |
from PIL import Image
|
29 |
from huggingface_hub import HfApi, create_repo
|
|
|
30 |
|
31 |
# --- Flask App Initialization ---
|
32 |
app = Flask(__name__)
|
|
|
71 |
logger.warning(msg)
|
72 |
return "Error: " + msg
|
73 |
try:
|
|
|
|
|
|
|
74 |
repo_id_obj = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
|
75 |
logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
|
76 |
return repo_id_obj.repo_id
|
77 |
+
except RequestsHTTPError as e:
|
78 |
+
if e.response is not None and e.response.status_code == 409:
|
79 |
logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists (HTTP 409).")
|
|
|
80 |
try:
|
81 |
+
user_info = hf_api.whoami(token=HF_TOKEN)
|
82 |
namespace = user_info.get('name') if user_info else None
|
83 |
if namespace:
|
84 |
return f"{namespace}/{HF_DATASET_REPO_NAME}"
|
85 |
+
else:
|
86 |
logger.warning(f"Could not determine namespace for existing repo '{HF_DATASET_REPO_NAME}'. Using generic ID.")
|
87 |
+
return HF_DATASET_REPO_NAME
|
88 |
except Exception as whoami_e:
|
89 |
logger.error(f"Could not determine namespace for existing repo via whoami due to: {whoami_e}. Using generic ID.")
|
90 |
+
return HF_DATASET_REPO_NAME
|
91 |
+
else:
|
92 |
status_code = e.response.status_code if e.response is not None else "Unknown"
|
93 |
logger.error(f"Hugging Face dataset HTTP error (Status: {status_code}): {str(e)}")
|
94 |
return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}' due to HTTP error: {str(e)}"
|
95 |
+
except Exception as e:
|
|
|
|
|
96 |
logger.error(f"Hugging Face dataset general error: {str(e)}", exc_info=True)
|
97 |
return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
|
98 |
|
|
|
108 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
109 |
repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
|
110 |
|
111 |
+
# Ensure UPLOAD_FOLDER exists before writing temp file
|
112 |
+
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
113 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
|
114 |
temp_image_path = tmp_file.name
|
115 |
image_pil.save(temp_image_path, format="PNG")
|
|
|
121 |
)
|
122 |
logger.info(f"Successfully uploaded image: {file_url}")
|
123 |
return file_url
|
124 |
+
except Exception as e:
|
125 |
logger.error(f"Image upload error for {filename_base}{page_num_for_log}: {str(e)}", exc_info=True)
|
126 |
return f"Error uploading image {filename_base}{page_num_for_log}: {str(e)}"
|
127 |
finally:
|
|
|
175 |
|
176 |
if source_is_url:
|
177 |
try:
|
178 |
+
response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
|
179 |
response.raise_for_status()
|
180 |
pdf_bytes_for_images = response.content
|
181 |
pdf_handle_for_text = io.BytesIO(pdf_bytes_for_images)
|
182 |
yield yield_message("status", {"message": f"PDF downloaded from URL ({len(pdf_bytes_for_images)/1024:.2f} KB)."})
|
183 |
time.sleep(0.01)
|
184 |
+
except RequestsHTTPError as e:
|
185 |
logger.error(f"URL fetch HTTP error for PDF processing: {str(e)} (Status: {e.response.status_code if e.response else 'N/A'})", exc_info=True)
|
186 |
yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {e.response.status_code if e.response else 'N/A'}): {e.response.reason if e.response else str(e)}"})
|
187 |
return
|
188 |
+
except requests.RequestException as e:
|
189 |
logger.error(f"URL fetch network error for PDF processing: {str(e)}", exc_info=True)
|
190 |
yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
|
191 |
return
|
|
|
209 |
tables = page.extract_tables()
|
210 |
if tables:
|
211 |
for table_idx, table_data in enumerate(tables):
|
212 |
+
if table_data and len(table_data) > 0 and table_data[0] is not None and len(table_data[0]) > 0 :
|
213 |
yield yield_message("status", {"message": f" Processing table {table_idx+1} on page {i+1}..."})
|
214 |
header_cells = table_data[0]
|
215 |
header = [" | ".join(str(cell) if cell is not None else "" for cell in header_cells)]
|
|
|
227 |
except Exception as e:
|
228 |
logger.error(f"Error during PDF text/table extraction: {str(e)}", exc_info=True)
|
229 |
yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
|
|
|
230 |
|
|
|
231 |
if not check_poppler():
|
232 |
yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
|
233 |
else:
|
|
|
237 |
yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})
|
238 |
|
239 |
time.sleep(0.01)
|
240 |
+
extracted_pil_images_overall_count = 0 # Keep track of total images processed for numbering
|
241 |
try:
|
242 |
image_source_for_convert = None
|
243 |
if source_is_url and pdf_bytes_for_images:
|
244 |
image_source_for_convert = pdf_bytes_for_images
|
245 |
logger.info("Using downloaded bytes for image conversion.")
|
246 |
elif not source_is_url:
|
247 |
+
image_source_for_convert = pdf_input_source_path_or_url
|
248 |
logger.info("Using local file path for image conversion.")
|
249 |
|
250 |
if image_source_for_convert:
|
|
|
251 |
try:
|
252 |
pdf_info = None
|
253 |
if isinstance(image_source_for_convert, bytes):
|
254 |
pdf_info = pdf2image.pdfinfo_from_bytes(image_source_for_convert, userpw=None, poppler_path=None)
|
255 |
+
else:
|
256 |
pdf_info = pdf2image.pdfinfo_from_path(image_source_for_convert, userpw=None, poppler_path=None)
|
257 |
|
258 |
num_image_pages = pdf_info.get("Pages", 0)
|
259 |
yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for potential image extraction."})
|
260 |
|
261 |
+
batch_size = 1
|
|
|
262 |
for page_idx_start in range(1, num_image_pages + 1, batch_size):
|
263 |
page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
|
264 |
+
yield yield_message("status", {"message": f"Extracting images from PDF page(s) {page_idx_start}-{page_idx_end}..."})
|
265 |
time.sleep(0.01)
|
266 |
|
267 |
page_images_pil = []
|
268 |
if isinstance(image_source_for_convert, bytes):
|
269 |
page_images_pil = convert_from_bytes(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
|
270 |
+
else:
|
271 |
page_images_pil = convert_from_path(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
|
272 |
|
273 |
+
for img_idx_in_batch, img_pil in enumerate(page_images_pil):
|
274 |
+
extracted_pil_images_overall_count += 1
|
275 |
+
current_pdf_page_num = page_idx_start + img_idx_in_batch # Actual PDF page number
|
276 |
+
page_num_for_log = f"pdfpage_{current_pdf_page_num}"
|
277 |
+
|
278 |
+
yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (from PDF page {current_pdf_page_num}) (OCR & Upload)..."})
|
|
|
279 |
time.sleep(0.01)
|
280 |
|
281 |
ocr_text = ""
|
282 |
try:
|
283 |
ocr_text = pytesseract.image_to_string(img_pil).strip()
|
284 |
+
if ocr_text: yield yield_message("status", {"message": f" OCR successful for image {extracted_pil_images_overall_count}."})
|
285 |
except Exception as ocr_e:
|
286 |
+
logger.error(f"OCR error for image {extracted_pil_images_overall_count}: {str(ocr_e)}")
|
287 |
ocr_text = f"OCR failed: {str(ocr_e)}"
|
288 |
|
289 |
image_md_chunk = ""
|
290 |
if HF_TOKEN:
|
291 |
image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image", page_num_for_log)
|
292 |
if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
|
293 |
+
image_md_chunk += f"\n"
|
294 |
+
yield yield_message("status", {"message": f" Image {extracted_pil_images_overall_count} uploaded."})
|
295 |
else:
|
296 |
+
image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Upload Error):** {str(image_url_or_error)}\n\n"
|
297 |
+
yield yield_message("error", {"message": f"Failed to upload image {extracted_pil_images_overall_count}: {str(image_url_or_error)}"})
|
298 |
else:
|
299 |
+
image_md_chunk += f"**Image {extracted_pil_images_overall_count} (not uploaded due to missing HF_TOKEN)**\n"
|
300 |
|
301 |
if ocr_text:
|
302 |
+
image_md_chunk += f"**Image {extracted_pil_images_overall_count} OCR Text:**\n```\n{ocr_text}\n```\n\n"
|
303 |
|
304 |
yield yield_message("image_md", {"content": image_md_chunk})
|
305 |
time.sleep(0.01)
|
306 |
except Exception as e_img_info:
|
307 |
logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
|
308 |
yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk."})
|
309 |
+
# Fallback to bulk conversion
|
310 |
+
bulk_images_pil = []
|
311 |
if isinstance(image_source_for_convert, bytes):
|
312 |
+
bulk_images_pil = convert_from_bytes(image_source_for_convert, dpi=150)
|
313 |
+
else:
|
314 |
+
bulk_images_pil = convert_from_path(image_source_for_convert, dpi=150)
|
315 |
+
|
316 |
+
yield yield_message("status", {"message": f"Fallback: Extracted {len(bulk_images_pil)} images in bulk."})
|
317 |
+
for i, img_pil in enumerate(bulk_images_pil):
|
318 |
+
extracted_pil_images_overall_count +=1
|
319 |
+
page_num_for_log = f"bulk_image_{i+1}" # Less precise page info in fallback
|
320 |
+
yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (bulk) (OCR & Upload)..."})
|
321 |
+
ocr_text = ""
|
322 |
+
try: ocr_text = pytesseract.image_to_string(img_pil).strip()
|
323 |
+
except Exception as e: ocr_text = f"OCR Error: {e}"
|
324 |
+
|
325 |
+
image_md_chunk = f"![Image {extracted_pil_images_overall_count} (Fallback)]\n"
|
326 |
+
if HF_TOKEN:
|
327 |
+
image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image_fallback", page_num_for_log)
|
328 |
+
if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
|
329 |
+
image_md_chunk = f"\n"
|
330 |
+
else:
|
331 |
+
image_md_chunk += f"**Upload Error:** {str(image_url_or_error)}\n"
|
332 |
+
if ocr_text: image_md_chunk += f"**OCR Text:**\n```\n{ocr_text}\n```\n\n"
|
333 |
+
else: image_md_chunk += "\n"
|
334 |
yield yield_message("image_md", {"content": image_md_chunk})
|
335 |
time.sleep(0.01)
|
336 |
|
337 |
+
else:
|
338 |
+
yield yield_message("status", {"message": "No valid source for image extraction."})
|
339 |
|
340 |
+
except Exception as e:
|
341 |
logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
|
342 |
yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})
|
343 |
|
|
|
359 |
pdf_file = request.files.get('pdf_file')
|
360 |
pdf_url = request.form.get('pdf_url', '').strip()
|
361 |
|
362 |
+
outer_temp_pdf_path = None
|
|
|
|
|
|
|
|
|
|
|
363 |
|
364 |
def stream_processor():
|
365 |
+
nonlocal outer_temp_pdf_path
|
366 |
pdf_input_source_for_generator = None
|
367 |
|
368 |
try:
|
|
|
376 |
fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
|
377 |
os.close(fd)
|
378 |
pdf_file.save(temp_path)
|
379 |
+
outer_temp_pdf_path = temp_path
|
380 |
logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
|
381 |
pdf_input_source_for_generator = outer_temp_pdf_path
|
382 |
yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
|
|
|
401 |
except Exception as e:
|
402 |
logger.error(f"Error setting up stream or in initial validation: {str(e)}", exc_info=True)
|
403 |
yield yield_message("error", {"message": f"Setup error: {str(e)}"})
|
|
|
|
|
|
|
404 |
finally:
|
405 |
if outer_temp_pdf_path and os.path.exists(outer_temp_pdf_path):
|
406 |
try:
|
407 |
os.remove(outer_temp_pdf_path)
|
408 |
logger.info(f"Cleaned up temporary PDF: {outer_temp_pdf_path}")
|
|
|
|
|
|
|
409 |
except OSError as ose:
|
410 |
logger.error(f"Error removing temporary PDF {outer_temp_pdf_path}: {ose}")
|
411 |
|