broadfield-dev committed on
Commit
47dc1da
·
verified ·
1 Parent(s): ad8348b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -74
app.py CHANGED
@@ -20,11 +20,13 @@ import requests # For requests.exceptions.HTTPError
20
  from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
21
 
22
  import pdfplumber
23
- from pdf2image import convert_from_path, convert_from_bytes
 
 
 
24
  import pytesseract
25
  from PIL import Image
26
  from huggingface_hub import HfApi, create_repo
27
- # from huggingface_hub.utils import HfHubHTTPError # This was the incorrect one
28
 
29
  # --- Flask App Initialization ---
30
  app = Flask(__name__)
@@ -69,34 +71,28 @@ def ensure_hf_dataset():
69
  logger.warning(msg)
70
  return "Error: " + msg
71
  try:
72
- # create_repo can raise huggingface_hub.utils.RepositoryNotFoundError,
73
- # huggingface_hub.utils.HfHubHTTPError (which inherits from requests.HTTPError for some cases),
74
- # or other requests.exceptions
75
  repo_id_obj = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
76
  logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
77
  return repo_id_obj.repo_id
78
- except RequestsHTTPError as e: # Catch HTTP errors from requests library directly
79
- if e.response is not None and e.response.status_code == 409: # Conflict, repo already exists
80
  logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists (HTTP 409).")
81
- # Attempt to construct the full repo_id (namespace/repo_name)
82
  try:
83
- user_info = hf_api.whoami(token=HF_TOKEN) # This call could also fail
84
  namespace = user_info.get('name') if user_info else None
85
  if namespace:
86
  return f"{namespace}/{HF_DATASET_REPO_NAME}"
87
- else: # Fallback if namespace cannot be determined
88
  logger.warning(f"Could not determine namespace for existing repo '{HF_DATASET_REPO_NAME}'. Using generic ID.")
89
- return HF_DATASET_REPO_NAME # Or f"{YOUR_DEFAULT_USERNAME_IF_KNOWN}/{HF_DATASET_REPO_NAME}"
90
  except Exception as whoami_e:
91
  logger.error(f"Could not determine namespace for existing repo via whoami due to: {whoami_e}. Using generic ID.")
92
- return HF_DATASET_REPO_NAME # Fallback
93
- else: # Other HTTP errors
94
  status_code = e.response.status_code if e.response is not None else "Unknown"
95
  logger.error(f"Hugging Face dataset HTTP error (Status: {status_code}): {str(e)}")
96
  return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}' due to HTTP error: {str(e)}"
97
- except Exception as e: # Catch other non-HTTP exceptions from huggingface_hub or general errors
98
- # This could be an Hf RepoExistsError if exist_ok=False, or other utility errors.
99
- # For exist_ok=True, a 409 is the more likely signal for existing repo.
100
  logger.error(f"Hugging Face dataset general error: {str(e)}", exc_info=True)
101
  return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
102
 
@@ -112,6 +108,8 @@ def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
112
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
113
  repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
114
 
 
 
115
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
116
  temp_image_path = tmp_file.name
117
  image_pil.save(temp_image_path, format="PNG")
@@ -123,7 +121,7 @@ def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
123
  )
124
  logger.info(f"Successfully uploaded image: {file_url}")
125
  return file_url
126
- except Exception as e: # Catch broadly here; specific HF errors could be caught if needed
127
  logger.error(f"Image upload error for {filename_base}{page_num_for_log}: {str(e)}", exc_info=True)
128
  return f"Error uploading image {filename_base}{page_num_for_log}: {str(e)}"
129
  finally:
@@ -177,17 +175,17 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
177
 
178
  if source_is_url:
179
  try:
180
- response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60) # stream=False to get content
181
  response.raise_for_status()
182
  pdf_bytes_for_images = response.content
183
  pdf_handle_for_text = io.BytesIO(pdf_bytes_for_images)
184
  yield yield_message("status", {"message": f"PDF downloaded from URL ({len(pdf_bytes_for_images)/1024:.2f} KB)."})
185
  time.sleep(0.01)
186
- except RequestsHTTPError as e: # Catch HTTP errors specifically
187
  logger.error(f"URL fetch HTTP error for PDF processing: {str(e)} (Status: {e.response.status_code if e.response else 'N/A'})", exc_info=True)
188
  yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {e.response.status_code if e.response else 'N/A'}): {e.response.reason if e.response else str(e)}"})
189
  return
190
- except requests.RequestException as e: # Catch other network errors
191
  logger.error(f"URL fetch network error for PDF processing: {str(e)}", exc_info=True)
192
  yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
193
  return
@@ -211,7 +209,7 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
211
  tables = page.extract_tables()
212
  if tables:
213
  for table_idx, table_data in enumerate(tables):
214
- if table_data and len(table_data) > 0 and len(table_data[0]) > 0 : # Check table has rows and columns
215
  yield yield_message("status", {"message": f" Processing table {table_idx+1} on page {i+1}..."})
216
  header_cells = table_data[0]
217
  header = [" | ".join(str(cell) if cell is not None else "" for cell in header_cells)]
@@ -229,9 +227,7 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
229
  except Exception as e:
230
  logger.error(f"Error during PDF text/table extraction: {str(e)}", exc_info=True)
231
  yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
232
- # Decide if to return or continue to image extraction. Let's try to continue.
233
 
234
- # 2. Image Extraction and OCR
235
  if not check_poppler():
236
  yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
237
  else:
@@ -241,97 +237,107 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
241
  yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})
242
 
243
  time.sleep(0.01)
244
- extracted_pil_images = []
245
  try:
246
  image_source_for_convert = None
247
  if source_is_url and pdf_bytes_for_images:
248
  image_source_for_convert = pdf_bytes_for_images
249
  logger.info("Using downloaded bytes for image conversion.")
250
  elif not source_is_url:
251
- image_source_for_convert = pdf_input_source_path_or_url # Local file path
252
  logger.info("Using local file path for image conversion.")
253
 
254
  if image_source_for_convert:
255
- # Attempt to get page count for more granular image processing if pdf2image is the bottleneck
256
  try:
257
  pdf_info = None
258
  if isinstance(image_source_for_convert, bytes):
259
  pdf_info = pdf2image.pdfinfo_from_bytes(image_source_for_convert, userpw=None, poppler_path=None)
260
- else: # path
261
  pdf_info = pdf2image.pdfinfo_from_path(image_source_for_convert, userpw=None, poppler_path=None)
262
 
263
  num_image_pages = pdf_info.get("Pages", 0)
264
  yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for potential image extraction."})
265
 
266
- # Process images page by page (or small batches) to save memory
267
- batch_size = 1 # Process one page at a time for images
268
  for page_idx_start in range(1, num_image_pages + 1, batch_size):
269
  page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
270
- yield yield_message("status", {"message": f"Extracting images from page(s) {page_idx_start}-{page_idx_end}..."})
271
  time.sleep(0.01)
272
 
273
  page_images_pil = []
274
  if isinstance(image_source_for_convert, bytes):
275
  page_images_pil = convert_from_bytes(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
276
- else: # path
277
  page_images_pil = convert_from_path(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
278
 
279
- extracted_pil_images.extend(page_images_pil) # Add to overall list for sequential numbering later
280
-
281
- # Process this batch of images immediately
282
- for img_pil in page_images_pil:
283
- current_image_index = len(extracted_pil_images) # Current overall index
284
- page_num_for_log = f"page_{page_idx_start + page_images_pil.index(img_pil)}"
285
- yield yield_message("status", {"message": f"Processing image {current_image_index} (from PDF page {page_num_for_log}) (OCR & Upload)..."})
286
  time.sleep(0.01)
287
 
288
  ocr_text = ""
289
  try:
290
  ocr_text = pytesseract.image_to_string(img_pil).strip()
291
- if ocr_text: yield yield_message("status", {"message": f" OCR successful for image {current_image_index}."})
292
  except Exception as ocr_e:
293
- logger.error(f"OCR error for image {current_image_index}: {str(ocr_e)}")
294
  ocr_text = f"OCR failed: {str(ocr_e)}"
295
 
296
  image_md_chunk = ""
297
  if HF_TOKEN:
298
  image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image", page_num_for_log)
299
  if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
300
- image_md_chunk += f"![Image {current_image_index}]({image_url_or_error})\n"
301
- yield yield_message("status", {"message": f" Image {current_image_index} uploaded."})
302
  else:
303
- image_md_chunk += f"**Image {current_image_index} (Upload Error):** {str(image_url_or_error)}\n\n"
304
- yield yield_message("error", {"message": f"Failed to upload image {current_image_index}: {str(image_url_or_error)}"})
305
  else:
306
- image_md_chunk += f"**Image {current_image_index} (not uploaded due to missing HF_TOKEN)**\n"
307
 
308
  if ocr_text:
309
- image_md_chunk += f"**Image {current_image_index} OCR Text:**\n```\n{ocr_text}\n```\n\n"
310
 
311
  yield yield_message("image_md", {"content": image_md_chunk})
312
  time.sleep(0.01)
313
  except Exception as e_img_info:
314
  logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
315
  yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk."})
316
- # Fallback to bulk conversion if pdfinfo or batching fails (original behavior)
 
317
  if isinstance(image_source_for_convert, bytes):
318
- extracted_pil_images = convert_from_bytes(image_source_for_convert, dpi=150)
319
- else: # path
320
- extracted_pil_images = convert_from_path(image_source_for_convert, dpi=150)
321
- # Process these bulk images (copy-paste the loop from above, adjust indexing)
322
- for i, img_pil in enumerate(extracted_pil_images):
323
- page_num_for_log = f"bulk_image_{i+1}"
324
- yield yield_message("status", {"message": f"Processing image {i+1}/{len(extracted_pil_images)} (OCR & Upload)..."}) # ... (rest of loop) ...
325
- # (omitted rest of duplicated loop for brevity, but it would be the same as the inner loop above)
326
- ocr_text = pytesseract.image_to_string(img_pil).strip() # Simplified for brevity
327
- image_md_chunk = f"![Image {i+1} Fallback]\n**OCR:** {ocr_text}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
328
  yield yield_message("image_md", {"content": image_md_chunk})
329
  time.sleep(0.01)
330
 
331
- else: # No valid source for image conversion
332
- yield yield_message("status", {"message": "No valid source (URL download failed or no file path) for image extraction."})
333
 
334
- except Exception as e: # Catch errors from the image extraction block
335
  logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
336
  yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})
337
 
@@ -353,15 +359,10 @@ def process_pdf_stream():
353
  pdf_file = request.files.get('pdf_file')
354
  pdf_url = request.form.get('pdf_url', '').strip()
355
 
356
- # Use a list to hold temp_pdf_path so it can be modified in the inner function
357
- # and accessed in finally. Or pass it around.
358
- # For simplicity, we'll rely on the generator's finally block if it's created within.
359
- # Here, temp_pdf_path is primarily for the *uploaded* file before passing its path.
360
-
361
- outer_temp_pdf_path = None # For uploaded file cleanup
362
 
363
  def stream_processor():
364
- nonlocal outer_temp_pdf_path # Make it accessible in this inner function for cleanup
365
  pdf_input_source_for_generator = None
366
 
367
  try:
@@ -375,7 +376,7 @@ def process_pdf_stream():
375
  fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
376
  os.close(fd)
377
  pdf_file.save(temp_path)
378
- outer_temp_pdf_path = temp_path # Store for cleanup
379
  logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
380
  pdf_input_source_for_generator = outer_temp_pdf_path
381
  yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
@@ -400,17 +401,11 @@ def process_pdf_stream():
400
  except Exception as e:
401
  logger.error(f"Error setting up stream or in initial validation: {str(e)}", exc_info=True)
402
  yield yield_message("error", {"message": f"Setup error: {str(e)}"})
403
- # The 'finally' block for cleaning outer_temp_pdf_path will be outside this generator,
404
- # in the main route function after the Response is fully generated.
405
- # However, with stream_with_context, the 'finally' here is better.
406
  finally:
407
  if outer_temp_pdf_path and os.path.exists(outer_temp_pdf_path):
408
  try:
409
  os.remove(outer_temp_pdf_path)
410
  logger.info(f"Cleaned up temporary PDF: {outer_temp_pdf_path}")
411
- # Yielding from finally inside a generator that's part of a streamed response can be tricky.
412
- # It's better if status messages about cleanup are logged or handled differently.
413
- # For this case, logging is sufficient.
414
  except OSError as ose:
415
  logger.error(f"Error removing temporary PDF {outer_temp_pdf_path}: {ose}")
416
 
 
20
  from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
21
 
22
  import pdfplumber
23
+ import pdf2image # <<<<<<<<<<<<<<<< CORRECTED: Added this import
24
+ from pdf2image import convert_from_path, convert_from_bytes # Keep these for direct use too
25
+ # from pdf2image.exceptions import ... # If you need to catch specific pdf2image errors
26
+
27
  import pytesseract
28
  from PIL import Image
29
  from huggingface_hub import HfApi, create_repo
 
30
 
31
  # --- Flask App Initialization ---
32
  app = Flask(__name__)
 
71
  logger.warning(msg)
72
  return "Error: " + msg
73
  try:
 
 
 
74
  repo_id_obj = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
75
  logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
76
  return repo_id_obj.repo_id
77
+ except RequestsHTTPError as e:
78
+ if e.response is not None and e.response.status_code == 409:
79
  logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists (HTTP 409).")
 
80
  try:
81
+ user_info = hf_api.whoami(token=HF_TOKEN)
82
  namespace = user_info.get('name') if user_info else None
83
  if namespace:
84
  return f"{namespace}/{HF_DATASET_REPO_NAME}"
85
+ else:
86
  logger.warning(f"Could not determine namespace for existing repo '{HF_DATASET_REPO_NAME}'. Using generic ID.")
87
+ return HF_DATASET_REPO_NAME
88
  except Exception as whoami_e:
89
  logger.error(f"Could not determine namespace for existing repo via whoami due to: {whoami_e}. Using generic ID.")
90
+ return HF_DATASET_REPO_NAME
91
+ else:
92
  status_code = e.response.status_code if e.response is not None else "Unknown"
93
  logger.error(f"Hugging Face dataset HTTP error (Status: {status_code}): {str(e)}")
94
  return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}' due to HTTP error: {str(e)}"
95
+ except Exception as e:
 
 
96
  logger.error(f"Hugging Face dataset general error: {str(e)}", exc_info=True)
97
  return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
98
 
 
108
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
109
  repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
110
 
111
+ # Ensure UPLOAD_FOLDER exists before writing temp file
112
+ os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
113
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
114
  temp_image_path = tmp_file.name
115
  image_pil.save(temp_image_path, format="PNG")
 
121
  )
122
  logger.info(f"Successfully uploaded image: {file_url}")
123
  return file_url
124
+ except Exception as e:
125
  logger.error(f"Image upload error for {filename_base}{page_num_for_log}: {str(e)}", exc_info=True)
126
  return f"Error uploading image {filename_base}{page_num_for_log}: {str(e)}"
127
  finally:
 
175
 
176
  if source_is_url:
177
  try:
178
+ response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
179
  response.raise_for_status()
180
  pdf_bytes_for_images = response.content
181
  pdf_handle_for_text = io.BytesIO(pdf_bytes_for_images)
182
  yield yield_message("status", {"message": f"PDF downloaded from URL ({len(pdf_bytes_for_images)/1024:.2f} KB)."})
183
  time.sleep(0.01)
184
+ except RequestsHTTPError as e:
185
  logger.error(f"URL fetch HTTP error for PDF processing: {str(e)} (Status: {e.response.status_code if e.response else 'N/A'})", exc_info=True)
186
  yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {e.response.status_code if e.response else 'N/A'}): {e.response.reason if e.response else str(e)}"})
187
  return
188
+ except requests.RequestException as e:
189
  logger.error(f"URL fetch network error for PDF processing: {str(e)}", exc_info=True)
190
  yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
191
  return
 
209
  tables = page.extract_tables()
210
  if tables:
211
  for table_idx, table_data in enumerate(tables):
212
+ if table_data and len(table_data) > 0 and table_data[0] is not None and len(table_data[0]) > 0 :
213
  yield yield_message("status", {"message": f" Processing table {table_idx+1} on page {i+1}..."})
214
  header_cells = table_data[0]
215
  header = [" | ".join(str(cell) if cell is not None else "" for cell in header_cells)]
 
227
  except Exception as e:
228
  logger.error(f"Error during PDF text/table extraction: {str(e)}", exc_info=True)
229
  yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
 
230
 
 
231
  if not check_poppler():
232
  yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
233
  else:
 
237
  yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})
238
 
239
  time.sleep(0.01)
240
+ extracted_pil_images_overall_count = 0 # Keep track of total images processed for numbering
241
  try:
242
  image_source_for_convert = None
243
  if source_is_url and pdf_bytes_for_images:
244
  image_source_for_convert = pdf_bytes_for_images
245
  logger.info("Using downloaded bytes for image conversion.")
246
  elif not source_is_url:
247
+ image_source_for_convert = pdf_input_source_path_or_url
248
  logger.info("Using local file path for image conversion.")
249
 
250
  if image_source_for_convert:
 
251
  try:
252
  pdf_info = None
253
  if isinstance(image_source_for_convert, bytes):
254
  pdf_info = pdf2image.pdfinfo_from_bytes(image_source_for_convert, userpw=None, poppler_path=None)
255
+ else:
256
  pdf_info = pdf2image.pdfinfo_from_path(image_source_for_convert, userpw=None, poppler_path=None)
257
 
258
  num_image_pages = pdf_info.get("Pages", 0)
259
  yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for potential image extraction."})
260
 
261
+ batch_size = 1
 
262
  for page_idx_start in range(1, num_image_pages + 1, batch_size):
263
  page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
264
+ yield yield_message("status", {"message": f"Extracting images from PDF page(s) {page_idx_start}-{page_idx_end}..."})
265
  time.sleep(0.01)
266
 
267
  page_images_pil = []
268
  if isinstance(image_source_for_convert, bytes):
269
  page_images_pil = convert_from_bytes(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
270
+ else:
271
  page_images_pil = convert_from_path(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
272
 
273
+ for img_idx_in_batch, img_pil in enumerate(page_images_pil):
274
+ extracted_pil_images_overall_count += 1
275
+ current_pdf_page_num = page_idx_start + img_idx_in_batch # Actual PDF page number
276
+ page_num_for_log = f"pdfpage_{current_pdf_page_num}"
277
+
278
+ yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (from PDF page {current_pdf_page_num}) (OCR & Upload)..."})
 
279
  time.sleep(0.01)
280
 
281
  ocr_text = ""
282
  try:
283
  ocr_text = pytesseract.image_to_string(img_pil).strip()
284
+ if ocr_text: yield yield_message("status", {"message": f" OCR successful for image {extracted_pil_images_overall_count}."})
285
  except Exception as ocr_e:
286
+ logger.error(f"OCR error for image {extracted_pil_images_overall_count}: {str(ocr_e)}")
287
  ocr_text = f"OCR failed: {str(ocr_e)}"
288
 
289
  image_md_chunk = ""
290
  if HF_TOKEN:
291
  image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image", page_num_for_log)
292
  if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
293
+ image_md_chunk += f"![Image {extracted_pil_images_overall_count}]({image_url_or_error})\n"
294
+ yield yield_message("status", {"message": f" Image {extracted_pil_images_overall_count} uploaded."})
295
  else:
296
+ image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Upload Error):** {str(image_url_or_error)}\n\n"
297
+ yield yield_message("error", {"message": f"Failed to upload image {extracted_pil_images_overall_count}: {str(image_url_or_error)}"})
298
  else:
299
+ image_md_chunk += f"**Image {extracted_pil_images_overall_count} (not uploaded due to missing HF_TOKEN)**\n"
300
 
301
  if ocr_text:
302
+ image_md_chunk += f"**Image {extracted_pil_images_overall_count} OCR Text:**\n```\n{ocr_text}\n```\n\n"
303
 
304
  yield yield_message("image_md", {"content": image_md_chunk})
305
  time.sleep(0.01)
306
  except Exception as e_img_info:
307
  logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
308
  yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk."})
309
+ # Fallback to bulk conversion
310
+ bulk_images_pil = []
311
  if isinstance(image_source_for_convert, bytes):
312
+ bulk_images_pil = convert_from_bytes(image_source_for_convert, dpi=150)
313
+ else:
314
+ bulk_images_pil = convert_from_path(image_source_for_convert, dpi=150)
315
+
316
+ yield yield_message("status", {"message": f"Fallback: Extracted {len(bulk_images_pil)} images in bulk."})
317
+ for i, img_pil in enumerate(bulk_images_pil):
318
+ extracted_pil_images_overall_count +=1
319
+ page_num_for_log = f"bulk_image_{i+1}" # Less precise page info in fallback
320
+ yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (bulk) (OCR & Upload)..."})
321
+ ocr_text = ""
322
+ try: ocr_text = pytesseract.image_to_string(img_pil).strip()
323
+ except Exception as e: ocr_text = f"OCR Error: {e}"
324
+
325
+ image_md_chunk = f"![Image {extracted_pil_images_overall_count} (Fallback)]\n"
326
+ if HF_TOKEN:
327
+ image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image_fallback", page_num_for_log)
328
+ if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
329
+ image_md_chunk = f"![Image {extracted_pil_images_overall_count} (Fallback)]({image_url_or_error})\n"
330
+ else:
331
+ image_md_chunk += f"**Upload Error:** {str(image_url_or_error)}\n"
332
+ if ocr_text: image_md_chunk += f"**OCR Text:**\n```\n{ocr_text}\n```\n\n"
333
+ else: image_md_chunk += "\n"
334
  yield yield_message("image_md", {"content": image_md_chunk})
335
  time.sleep(0.01)
336
 
337
+ else:
338
+ yield yield_message("status", {"message": "No valid source for image extraction."})
339
 
340
+ except Exception as e:
341
  logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
342
  yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})
343
 
 
359
  pdf_file = request.files.get('pdf_file')
360
  pdf_url = request.form.get('pdf_url', '').strip()
361
 
362
+ outer_temp_pdf_path = None
 
 
 
 
 
363
 
364
  def stream_processor():
365
+ nonlocal outer_temp_pdf_path
366
  pdf_input_source_for_generator = None
367
 
368
  try:
 
376
  fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
377
  os.close(fd)
378
  pdf_file.save(temp_path)
379
+ outer_temp_pdf_path = temp_path
380
  logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
381
  pdf_input_source_for_generator = outer_temp_pdf_path
382
  yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
 
401
  except Exception as e:
402
  logger.error(f"Error setting up stream or in initial validation: {str(e)}", exc_info=True)
403
  yield yield_message("error", {"message": f"Setup error: {str(e)}"})
 
 
 
404
  finally:
405
  if outer_temp_pdf_path and os.path.exists(outer_temp_pdf_path):
406
  try:
407
  os.remove(outer_temp_pdf_path)
408
  logger.info(f"Cleaned up temporary PDF: {outer_temp_pdf_path}")
 
 
 
409
  except OSError as ose:
410
  logger.error(f"Error removing temporary PDF {outer_temp_pdf_path}: {ose}")
411