broadfield-dev committed on
Commit
8323e8f
·
verified ·
1 Parent(s): a87a8f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -120
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import io
3
- import re
4
  import logging
5
  import subprocess
6
  from datetime import datetime
@@ -19,9 +19,9 @@ from werkzeug.utils import secure_filename
19
  import requests # For requests.exceptions.HTTPError
20
  from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
21
 
22
- import pdfplumber
23
- import pdf2image # <<<<<<<<<<<<<<<< CORRECTED: Added this import
24
- from pdf2image import convert_from_path, convert_from_bytes # Keep these for direct use too
25
  # from pdf2image.exceptions import ... # If you need to catch specific pdf2image errors
26
 
27
  import pytesseract
@@ -108,7 +108,6 @@ def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
108
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
109
  repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
110
 
111
- # Ensure UPLOAD_FOLDER exists before writing temp file
112
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
113
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
114
  temp_image_path = tmp_file.name
@@ -129,143 +128,86 @@ def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
129
  try: os.remove(temp_image_path)
130
  except OSError as ose: logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
131
 
132
-
133
- def format_page_text_to_markdown_chunk(page_text_content):
134
- chunk_md = ""
135
- page_text_content = re.sub(r'\n\s*\n+', '\n\n', page_text_content.strip())
136
- lines = page_text_content.split('\n')
137
- is_in_list = False
138
- for line_text in lines:
139
- line_stripped = line_text.strip()
140
- if not line_stripped:
141
- chunk_md += "\n"
142
- is_in_list = False
143
- continue
144
- list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
145
- is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100
146
- if is_heading_candidate and not list_match:
147
- chunk_md += f"## {line_stripped}\n\n"
148
- is_in_list = False
149
- elif list_match:
150
- list_item_text = list_match.group(1)
151
- chunk_md += f"- {list_item_text}\n"
152
- is_in_list = True
153
- else:
154
- if is_in_list: chunk_md += "\n"
155
- chunk_md += f"{line_text}\n\n"
156
- is_in_list = False
157
- return re.sub(r'\n\s*\n+', '\n\n', chunk_md.strip()) + "\n\n"
158
-
159
 
160
  # --- Main PDF Processing Logic (Generator Function for Streaming) ---
161
 
162
  def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
163
  try:
164
- yield yield_message("markdown_replace", {"content": "# Extracted PDF Content\n\n"})
165
  time.sleep(0.01)
166
 
167
- yield yield_message("status", {"message": "Opening PDF for text extraction..."})
168
- time.sleep(0.01)
169
 
170
  source_is_url = isinstance(pdf_input_source_path_or_url, str) and \
171
  pdf_input_source_path_or_url.startswith(('http://', 'https://'))
172
 
173
- pdf_handle_for_text = None
174
- pdf_bytes_for_images = None
175
-
176
  if source_is_url:
 
 
177
  try:
178
  response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
179
  response.raise_for_status()
180
- pdf_bytes_for_images = response.content
181
- pdf_handle_for_text = io.BytesIO(pdf_bytes_for_images)
182
- yield yield_message("status", {"message": f"PDF downloaded from URL ({len(pdf_bytes_for_images)/1024:.2f} KB)."})
183
  time.sleep(0.01)
184
  except RequestsHTTPError as e:
185
- logger.error(f"URL fetch HTTP error for PDF processing: {str(e)} (Status: {e.response.status_code if e.response else 'N/A'})", exc_info=True)
186
  yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {e.response.status_code if e.response else 'N/A'}): {e.response.reason if e.response else str(e)}"})
187
  return
188
  except requests.RequestException as e:
189
- logger.error(f"URL fetch network error for PDF processing: {str(e)}", exc_info=True)
190
  yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
191
  return
192
  else:
193
- pdf_handle_for_text = pdf_input_source_path_or_url
194
-
195
- total_text_pages = 0
196
- try:
197
- with pdfplumber.open(pdf_handle_for_text) as pdf:
198
- total_text_pages = len(pdf.pages)
199
- yield yield_message("status", {"message": f"Found {total_text_pages} page(s) for text extraction."})
200
- time.sleep(0.01)
201
 
202
- for i, page in enumerate(pdf.pages):
203
- yield yield_message("status", {"message": f"Extracting text from page {i+1}/{total_text_pages}..."})
204
- time.sleep(0.01)
205
-
206
- page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
207
-
208
- # Removed table extraction logic here
209
- # page_tables_md = "" # No longer needed
210
- # tables = page.extract_tables() # No longer needed
211
- # if tables: # No longer needed
212
- # ... (table processing code removed) ...
213
-
214
- formatted_page_text_md = format_page_text_to_markdown_chunk(page_text)
215
-
216
- yield yield_message("markdown_chunk", {"content": formatted_page_text_md})
217
- # if page_tables_md: # No longer needed, as page_tables_md is not created
218
- # yield yield_message("markdown_chunk", {"content": page_tables_md})
219
- time.sleep(0.01)
220
- except Exception as e:
221
- logger.error(f"Error during PDF text extraction: {str(e)}", exc_info=True) # Updated log message
222
- yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
223
 
 
224
  if not check_poppler():
225
  yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
226
  else:
227
- yield yield_message("status", {"message": "Starting image extraction..."})
228
- yield yield_message("markdown_chunk", {"content": "## Extracted Images\n\n"})
 
229
  if not HF_TOKEN:
230
  yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})
231
 
232
  time.sleep(0.01)
233
- extracted_pil_images_overall_count = 0 # Keep track of total images processed for numbering
234
  try:
235
- image_source_for_convert = None
236
- if source_is_url and pdf_bytes_for_images:
237
- image_source_for_convert = pdf_bytes_for_images
238
- logger.info("Using downloaded bytes for image conversion.")
239
- elif not source_is_url:
240
- image_source_for_convert = pdf_input_source_path_or_url
241
- logger.info("Using local file path for image conversion.")
242
-
243
- if image_source_for_convert:
244
- try:
245
  pdf_info = None
246
- if isinstance(image_source_for_convert, bytes):
247
- pdf_info = pdf2image.pdfinfo_from_bytes(image_source_for_convert, userpw=None, poppler_path=None)
248
  else:
249
- pdf_info = pdf2image.pdfinfo_from_path(image_source_for_convert, userpw=None, poppler_path=None)
250
 
251
  num_image_pages = pdf_info.get("Pages", 0)
252
- yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for potential image extraction."})
253
 
254
  batch_size = 1
255
  for page_idx_start in range(1, num_image_pages + 1, batch_size):
256
  page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
257
- yield yield_message("status", {"message": f"Extracting images from PDF page(s) {page_idx_start}-{page_idx_end}..."})
258
  time.sleep(0.01)
259
 
260
  page_images_pil = []
261
- if isinstance(image_source_for_convert, bytes):
262
- page_images_pil = convert_from_bytes(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
263
  else:
264
- page_images_pil = convert_from_path(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
265
 
266
  for img_idx_in_batch, img_pil in enumerate(page_images_pil):
267
  extracted_pil_images_overall_count += 1
268
- current_pdf_page_num = page_idx_start + img_idx_in_batch # Actual PDF page number
269
  page_num_for_log = f"pdfpage_{current_pdf_page_num}"
270
 
271
  yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (from PDF page {current_pdf_page_num}) (OCR & Upload)..."})
@@ -275,15 +217,16 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
275
  try:
276
  ocr_text = pytesseract.image_to_string(img_pil).strip()
277
  if ocr_text: yield yield_message("status", {"message": f" OCR successful for image {extracted_pil_images_overall_count}."})
 
278
  except Exception as ocr_e:
279
  logger.error(f"OCR error for image {extracted_pil_images_overall_count}: {str(ocr_e)}")
280
  ocr_text = f"OCR failed: {str(ocr_e)}"
281
 
282
- image_md_chunk = ""
283
  if HF_TOKEN:
284
- image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image", page_num_for_log)
285
  if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
286
- image_md_chunk += f"![Image {extracted_pil_images_overall_count}]({image_url_or_error})\n"
287
  yield yield_message("status", {"message": f" Image {extracted_pil_images_overall_count} uploaded."})
288
  else:
289
  image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Upload Error):** {str(image_url_or_error)}\n\n"
@@ -292,49 +235,54 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
292
  image_md_chunk += f"**Image {extracted_pil_images_overall_count} (not uploaded due to missing HF_TOKEN)**\n"
293
 
294
  if ocr_text:
295
- image_md_chunk += f"**Image {extracted_pil_images_overall_count} OCR Text:**\n```\n{ocr_text}\n```\n\n"
 
 
296
 
297
  yield yield_message("image_md", {"content": image_md_chunk})
298
  time.sleep(0.01)
299
  except Exception as e_img_info:
300
  logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
301
- yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk."})
302
  # Fallback to bulk conversion
303
  bulk_images_pil = []
304
- if isinstance(image_source_for_convert, bytes):
305
- bulk_images_pil = convert_from_bytes(image_source_for_convert, dpi=150)
306
  else:
307
- bulk_images_pil = convert_from_path(image_source_for_convert, dpi=150)
308
 
309
- yield yield_message("status", {"message": f"Fallback: Extracted {len(bulk_images_pil)} images in bulk."})
310
  for i, img_pil in enumerate(bulk_images_pil):
311
  extracted_pil_images_overall_count +=1
312
- page_num_for_log = f"bulk_image_{i+1}" # Less precise page info in fallback
313
- yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (bulk) (OCR & Upload)..."})
314
  ocr_text = ""
315
  try: ocr_text = pytesseract.image_to_string(img_pil).strip()
316
  except Exception as e: ocr_text = f"OCR Error: {e}"
317
 
318
- image_md_chunk = f"![Image {extracted_pil_images_overall_count} (Fallback)]\n"
319
  if HF_TOKEN:
320
- image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image_fallback", page_num_for_log)
321
  if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
322
- image_md_chunk = f"![Image {extracted_pil_images_overall_count} (Fallback)]({image_url_or_error})\n"
323
  else:
324
  image_md_chunk += f"**Upload Error:** {str(image_url_or_error)}\n"
325
- if ocr_text: image_md_chunk += f"**OCR Text:**\n```\n{ocr_text}\n```\n\n"
326
- else: image_md_chunk += "\n"
 
 
 
327
  yield yield_message("image_md", {"content": image_md_chunk})
328
  time.sleep(0.01)
329
 
330
  else:
331
- yield yield_message("status", {"message": "No valid source for image extraction."})
332
 
333
  except Exception as e:
334
- logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
335
- yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})
336
 
337
- yield yield_message("final_status", {"message": "All processing stages complete."})
338
 
339
  except Exception as e:
340
  logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
@@ -366,12 +314,13 @@ def process_pdf_stream():
366
 
367
  filename = secure_filename(pdf_file.filename)
368
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
 
369
  fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
370
- os.close(fd)
371
- pdf_file.save(temp_path)
372
- outer_temp_pdf_path = temp_path
373
  logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
374
- pdf_input_source_for_generator = outer_temp_pdf_path
375
  yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
376
  time.sleep(0.01)
377
 
@@ -381,7 +330,7 @@ def process_pdf_stream():
381
  yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
382
  return
383
 
384
- pdf_input_source_for_generator = unquoted_url
385
  yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
386
  time.sleep(0.01)
387
  else:
@@ -408,6 +357,6 @@ def process_pdf_stream():
408
  # --- Main Execution ---
409
  if __name__ == '__main__':
410
  if not check_poppler():
411
- logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")
412
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
413
  app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=True, threaded=True)
 
1
  import os
2
  import io
3
+ import re # May still be needed for image filename manipulation; no longer used for text formatting
4
  import logging
5
  import subprocess
6
  from datetime import datetime
 
19
  import requests # For requests.exceptions.HTTPError
20
  from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
21
 
22
+ # pdfplumber is no longer needed
23
+ import pdf2image
24
+ from pdf2image import convert_from_path, convert_from_bytes
25
  # from pdf2image.exceptions import ... # If you need to catch specific pdf2image errors
26
 
27
  import pytesseract
 
108
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
109
  repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
110
 
 
111
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
112
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
113
  temp_image_path = tmp_file.name
 
128
  try: os.remove(temp_image_path)
129
  except OSError as ose: logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
130
 
131
+ # format_page_text_to_markdown_chunk function is removed as it's no longer used.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  # --- Main PDF Processing Logic (Generator Function for Streaming) ---
134
 
135
  def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
136
  try:
137
+ yield yield_message("markdown_replace", {"content": "# Extracted Images and OCR Text\n\n"})
138
  time.sleep(0.01)
139
 
140
+ actual_pdf_input_for_images = None
141
+ is_input_bytes = False
142
 
143
  source_is_url = isinstance(pdf_input_source_path_or_url, str) and \
144
  pdf_input_source_path_or_url.startswith(('http://', 'https://'))
145
 
 
 
 
146
  if source_is_url:
147
+ yield yield_message("status", {"message": f"Downloading PDF from URL..."})
148
+ time.sleep(0.01)
149
  try:
150
  response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
151
  response.raise_for_status()
152
+ actual_pdf_input_for_images = response.content
153
+ is_input_bytes = True
154
+ yield yield_message("status", {"message": f"PDF downloaded from URL ({len(actual_pdf_input_for_images)/1024:.2f} KB)."})
155
  time.sleep(0.01)
156
  except RequestsHTTPError as e:
157
+ logger.error(f"URL fetch HTTP error: {str(e)} (Status: {e.response.status_code if e.response else 'N/A'})", exc_info=True)
158
  yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {e.response.status_code if e.response else 'N/A'}): {e.response.reason if e.response else str(e)}"})
159
  return
160
  except requests.RequestException as e:
161
+ logger.error(f"URL fetch network error: {str(e)}", exc_info=True)
162
  yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
163
  return
164
  else:
165
+ actual_pdf_input_for_images = pdf_input_source_path_or_url
166
+ is_input_bytes = False
167
+ yield yield_message("status", {"message": f"Processing local PDF file..."})
168
+ time.sleep(0.01)
 
 
 
 
169
 
170
+ # ----- Direct Text Extraction (using pdfplumber) is REMOVED -----
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
+ # ----- Image Extraction and OCR -----
173
  if not check_poppler():
174
  yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
175
  else:
176
+ yield yield_message("status", {"message": "Starting image extraction and OCR..."})
177
+ # The "## Extracted Images" title is now more specific
178
+ yield yield_message("markdown_chunk", {"content": "## Extracted Images & OCR Text from PDF Pages\n\n"})
179
  if not HF_TOKEN:
180
  yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})
181
 
182
  time.sleep(0.01)
183
+ extracted_pil_images_overall_count = 0
184
  try:
185
+ if actual_pdf_input_for_images:
186
+ try: # Batched conversion attempt
 
 
 
 
 
 
 
 
187
  pdf_info = None
188
+ if is_input_bytes:
189
+ pdf_info = pdf2image.pdfinfo_from_bytes(actual_pdf_input_for_images, userpw=None, poppler_path=None)
190
  else:
191
+ pdf_info = pdf2image.pdfinfo_from_path(actual_pdf_input_for_images, userpw=None, poppler_path=None)
192
 
193
  num_image_pages = pdf_info.get("Pages", 0)
194
+ yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for image conversion and OCR."})
195
 
196
  batch_size = 1
197
  for page_idx_start in range(1, num_image_pages + 1, batch_size):
198
  page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
199
+ yield yield_message("status", {"message": f"Converting PDF page(s) {page_idx_start}-{page_idx_end} to image(s)..."})
200
  time.sleep(0.01)
201
 
202
  page_images_pil = []
203
+ if is_input_bytes:
204
+ page_images_pil = convert_from_bytes(actual_pdf_input_for_images, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
205
  else:
206
+ page_images_pil = convert_from_path(actual_pdf_input_for_images, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
207
 
208
  for img_idx_in_batch, img_pil in enumerate(page_images_pil):
209
  extracted_pil_images_overall_count += 1
210
+ current_pdf_page_num = page_idx_start + img_idx_in_batch
211
  page_num_for_log = f"pdfpage_{current_pdf_page_num}"
212
 
213
  yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (from PDF page {current_pdf_page_num}) (OCR & Upload)..."})
 
217
  try:
218
  ocr_text = pytesseract.image_to_string(img_pil).strip()
219
  if ocr_text: yield yield_message("status", {"message": f" OCR successful for image {extracted_pil_images_overall_count}."})
220
+ else: yield yield_message("status", {"message": f" OCR complete for image {extracted_pil_images_overall_count} (no text found)."})
221
  except Exception as ocr_e:
222
  logger.error(f"OCR error for image {extracted_pil_images_overall_count}: {str(ocr_e)}")
223
  ocr_text = f"OCR failed: {str(ocr_e)}"
224
 
225
+ image_md_chunk = f"### Image from PDF Page {current_pdf_page_num}\n"
226
  if HF_TOKEN:
227
+ image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_page_image", page_num_for_log)
228
  if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
229
+ image_md_chunk += f"![Image from PDF Page {current_pdf_page_num}]({image_url_or_error})\n"
230
  yield yield_message("status", {"message": f" Image {extracted_pil_images_overall_count} uploaded."})
231
  else:
232
  image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Upload Error):** {str(image_url_or_error)}\n\n"
 
235
  image_md_chunk += f"**Image {extracted_pil_images_overall_count} (not uploaded due to missing HF_TOKEN)**\n"
236
 
237
  if ocr_text:
238
+ image_md_chunk += f"**OCR Text (from PDF Page {current_pdf_page_num}):**\n```\n{ocr_text}\n```\n\n"
239
+ else:
240
+ image_md_chunk += f"_(No text detected by OCR for image from PDF page {current_pdf_page_num})_\n\n"
241
 
242
  yield yield_message("image_md", {"content": image_md_chunk})
243
  time.sleep(0.01)
244
  except Exception as e_img_info:
245
  logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
246
+ yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk conversion."})
247
  # Fallback to bulk conversion
248
  bulk_images_pil = []
249
+ if is_input_bytes:
250
+ bulk_images_pil = convert_from_bytes(actual_pdf_input_for_images, dpi=150)
251
  else:
252
+ bulk_images_pil = convert_from_path(actual_pdf_input_for_images, dpi=150)
253
 
254
+ yield yield_message("status", {"message": f"Fallback: Converted {len(bulk_images_pil)} PDF pages to images in bulk."})
255
  for i, img_pil in enumerate(bulk_images_pil):
256
  extracted_pil_images_overall_count +=1
257
+ page_num_for_log = f"bulk_image_{i+1}"
258
+ yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (bulk page {i+1}) (OCR & Upload)..."})
259
  ocr_text = ""
260
  try: ocr_text = pytesseract.image_to_string(img_pil).strip()
261
  except Exception as e: ocr_text = f"OCR Error: {e}"
262
 
263
+ image_md_chunk = f"### Image from PDF Page (Bulk {i+1})\n"
264
  if HF_TOKEN:
265
+ image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_page_image_fallback", page_num_for_log)
266
  if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
267
+ image_md_chunk += f"![Image {extracted_pil_images_overall_count} (Fallback)]({image_url_or_error})\n"
268
  else:
269
  image_md_chunk += f"**Upload Error:** {str(image_url_or_error)}\n"
270
+ else:
271
+ image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Fallback - not uploaded)**\n"
272
+
273
+ if ocr_text: image_md_chunk += f"**OCR Text (Bulk Page {i+1}):**\n```\n{ocr_text}\n```\n\n"
274
+ else: image_md_chunk += f"_(No text detected by OCR for bulk image {i+1})_\n\n"
275
  yield yield_message("image_md", {"content": image_md_chunk})
276
  time.sleep(0.01)
277
 
278
  else:
279
+ yield yield_message("status", {"message": "No valid PDF input source provided for image extraction."})
280
 
281
  except Exception as e:
282
+ logger.error(f"Error during image extraction/OCR processing: {str(e)}", exc_info=True)
283
+ yield yield_message("error", {"message": f"Error during image extraction/OCR: {str(e)}"})
284
 
285
+ yield yield_message("final_status", {"message": "Image extraction and OCR processing complete."})
286
 
287
  except Exception as e:
288
  logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
 
314
 
315
  filename = secure_filename(pdf_file.filename)
316
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
317
+ # Save to a temporary file that generate_pdf_conversion_stream can access by path
318
  fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
319
+ os.close(fd) # Close the file descriptor from mkstemp
320
+ pdf_file.save(temp_path) # Save the uploaded file's content to this path
321
+ outer_temp_pdf_path = temp_path # Store for cleanup
322
  logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
323
+ pdf_input_source_for_generator = outer_temp_pdf_path # Pass the path
324
  yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
325
  time.sleep(0.01)
326
 
 
330
  yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
331
  return
332
 
333
+ pdf_input_source_for_generator = unquoted_url # Pass the URL string
334
  yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
335
  time.sleep(0.01)
336
  else:
 
357
  # --- Main Execution ---
358
  if __name__ == '__main__':
359
  if not check_poppler():
360
+ logger.warning("Poppler utilities might not be installed correctly. Image processing might fail.")
361
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
362
  app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=True, threaded=True)