Spaces:

broadfield-dev
/

pdf2markdown

Sleeping

App Files Files Community

broadfield-dev committed on Jun 2

Commit

8323e8f

verified ·

1 Parent(s): a87a8f6

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -120

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
 import io
-import re
 import logging
 import subprocess
 from datetime import datetime
@@ -19,9 +19,9 @@ from werkzeug.utils import secure_filename
 import requests # For requests.exceptions.HTTPError
 from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
-import pdfplumber
-import pdf2image # <<<<<<<<<<<<<<<< CORRECTED: Added this import
-from pdf2image import convert_from_path, convert_from_bytes # Keep these for direct use too
 # from pdf2image.exceptions import ... # If you need to catch specific pdf2image errors
 import pytesseract
@@ -108,7 +108,6 @@ def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
         repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
-        # Ensure UPLOAD_FOLDER exists before writing temp file
         os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
             temp_image_path = tmp_file.name
@@ -129,143 +128,86 @@ def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
             try: os.remove(temp_image_path)
             except OSError as ose: logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
-def format_page_text_to_markdown_chunk(page_text_content):
-    chunk_md = ""
-    page_text_content = re.sub(r'\n\s*\n+', '\n\n', page_text_content.strip())
-    lines = page_text_content.split('\n')
-    is_in_list = False
-    for line_text in lines:
-        line_stripped = line_text.strip()
-        if not line_stripped:
-            chunk_md += "\n"
-            is_in_list = False
-            continue
-        list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
-        is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100
-        if is_heading_candidate and not list_match:
-            chunk_md += f"## {line_stripped}\n\n"
-            is_in_list = False
-        elif list_match:
-            list_item_text = list_match.group(1)
-            chunk_md += f"- {list_item_text}\n"
-            is_in_list = True
-        else:
-            if is_in_list: chunk_md += "\n"
-            chunk_md += f"{line_text}\n\n"
-            is_in_list = False
-    return re.sub(r'\n\s*\n+', '\n\n', chunk_md.strip()) + "\n\n"
 # --- Main PDF Processing Logic (Generator Function for Streaming) ---
 def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
     try:
-        yield yield_message("markdown_replace", {"content": "# Extracted PDF Content\n\n"})
         time.sleep(0.01)
-        yield yield_message("status", {"message": "Opening PDF for text extraction..."})
-        time.sleep(0.01)
         source_is_url = isinstance(pdf_input_source_path_or_url, str) and \
                         pdf_input_source_path_or_url.startswith(('http://', 'https://'))
-        pdf_handle_for_text = None
-        pdf_bytes_for_images = None
         if source_is_url:
             try:
                 response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
                 response.raise_for_status()
-                pdf_bytes_for_images = response.content
-                pdf_handle_for_text = io.BytesIO(pdf_bytes_for_images)
-                yield yield_message("status", {"message": f"PDF downloaded from URL ({len(pdf_bytes_for_images)/1024:.2f} KB)."})
                 time.sleep(0.01)
             except RequestsHTTPError as e:
-                logger.error(f"URL fetch HTTP error for PDF processing: {str(e)} (Status: {e.response.status_code if e.response else 'N/A'})", exc_info=True)
                 yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {e.response.status_code if e.response else 'N/A'}): {e.response.reason if e.response else str(e)}"})
                 return
             except requests.RequestException as e:
-                logger.error(f"URL fetch network error for PDF processing: {str(e)}", exc_info=True)
                 yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
                 return
         else:
-             pdf_handle_for_text = pdf_input_source_path_or_url
-        total_text_pages = 0
-        try:
-            with pdfplumber.open(pdf_handle_for_text) as pdf:
-                total_text_pages = len(pdf.pages)
-                yield yield_message("status", {"message": f"Found {total_text_pages} page(s) for text extraction."})
-                time.sleep(0.01)
-                for i, page in enumerate(pdf.pages):
-                    yield yield_message("status", {"message": f"Extracting text from page {i+1}/{total_text_pages}..."})
-                    time.sleep(0.01)
-                    page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
-                    # Removed table extraction logic here
-                    # page_tables_md = "" # No longer needed
-                    # tables = page.extract_tables() # No longer needed
-                    # if tables: # No longer needed
-                        # ... (table processing code removed) ...
-                    formatted_page_text_md = format_page_text_to_markdown_chunk(page_text)
-                    yield yield_message("markdown_chunk", {"content": formatted_page_text_md})
-                    # if page_tables_md: # No longer needed, as page_tables_md is not created
-                    #     yield yield_message("markdown_chunk", {"content": page_tables_md})
-                    time.sleep(0.01)
-        except Exception as e:
-            logger.error(f"Error during PDF text extraction: {str(e)}", exc_info=True) # Updated log message
-            yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
         if not check_poppler():
             yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
         else:
-            yield yield_message("status", {"message": "Starting image extraction..."})
-            yield yield_message("markdown_chunk", {"content": "## Extracted Images\n\n"})
             if not HF_TOKEN:
                  yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})
             time.sleep(0.01)
-            extracted_pil_images_overall_count = 0 # Keep track of total images processed for numbering
             try:
-                image_source_for_convert = None
-                if source_is_url and pdf_bytes_for_images:
-                    image_source_for_convert = pdf_bytes_for_images
-                    logger.info("Using downloaded bytes for image conversion.")
-                elif not source_is_url:
-                    image_source_for_convert = pdf_input_source_path_or_url
-                    logger.info("Using local file path for image conversion.")
-                if image_source_for_convert:
-                    try:
                         pdf_info = None
-                        if isinstance(image_source_for_convert, bytes):
-                            pdf_info = pdf2image.pdfinfo_from_bytes(image_source_for_convert, userpw=None, poppler_path=None)
                         else:
-                            pdf_info = pdf2image.pdfinfo_from_path(image_source_for_convert, userpw=None, poppler_path=None)
                         num_image_pages = pdf_info.get("Pages", 0)
-                        yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for potential image extraction."})
                         batch_size = 1
                         for page_idx_start in range(1, num_image_pages + 1, batch_size):
                             page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
-                            yield yield_message("status", {"message": f"Extracting images from PDF page(s) {page_idx_start}-{page_idx_end}..."})
                             time.sleep(0.01)
                             page_images_pil = []
-                            if isinstance(image_source_for_convert, bytes):
-                                page_images_pil = convert_from_bytes(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
                             else:
-                                page_images_pil = convert_from_path(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
                             for img_idx_in_batch, img_pil in enumerate(page_images_pil):
                                 extracted_pil_images_overall_count += 1
-                                current_pdf_page_num = page_idx_start + img_idx_in_batch # Actual PDF page number
                                 page_num_for_log = f"pdfpage_{current_pdf_page_num}"
                                 yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (from PDF page {current_pdf_page_num}) (OCR & Upload)..."})
@@ -275,15 +217,16 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
                                 try:
                                     ocr_text = pytesseract.image_to_string(img_pil).strip()
                                     if ocr_text: yield yield_message("status", {"message": f"  OCR successful for image {extracted_pil_images_overall_count}."})
                                 except Exception as ocr_e:
                                     logger.error(f"OCR error for image {extracted_pil_images_overall_count}: {str(ocr_e)}")
                                     ocr_text = f"OCR failed: {str(ocr_e)}"
-                                image_md_chunk = ""
                                 if HF_TOKEN:
-                                    image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image", page_num_for_log)
                                     if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
-                                        image_md_chunk += f"![Image {extracted_pil_images_overall_count}]({image_url_or_error})\n"
                                         yield yield_message("status", {"message": f"  Image {extracted_pil_images_overall_count} uploaded."})
                                     else:
                                         image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Upload Error):** {str(image_url_or_error)}\n\n"
@@ -292,49 +235,54 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
                                     image_md_chunk += f"**Image {extracted_pil_images_overall_count} (not uploaded due to missing HF_TOKEN)**\n"
                                 if ocr_text:
-                                    image_md_chunk += f"**Image {extracted_pil_images_overall_count} OCR Text:**\n```\n{ocr_text}\n```\n\n"
                                 yield yield_message("image_md", {"content": image_md_chunk})
                                 time.sleep(0.01)
                     except Exception as e_img_info:
                         logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
-                        yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk."})
                         # Fallback to bulk conversion
                         bulk_images_pil = []
-                        if isinstance(image_source_for_convert, bytes):
-                            bulk_images_pil = convert_from_bytes(image_source_for_convert, dpi=150)
                         else:
-                            bulk_images_pil = convert_from_path(image_source_for_convert, dpi=150)
-                        yield yield_message("status", {"message": f"Fallback: Extracted {len(bulk_images_pil)} images in bulk."})
                         for i, img_pil in enumerate(bulk_images_pil):
                             extracted_pil_images_overall_count +=1
-                            page_num_for_log = f"bulk_image_{i+1}" # Less precise page info in fallback
-                            yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (bulk) (OCR & Upload)..."})
                             ocr_text = ""
                             try: ocr_text = pytesseract.image_to_string(img_pil).strip()
                             except Exception as e: ocr_text = f"OCR Error: {e}"
-                            image_md_chunk = f"![Image {extracted_pil_images_overall_count} (Fallback)]\n"
                             if HF_TOKEN:
-                                image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image_fallback", page_num_for_log)
                                 if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
-                                    image_md_chunk = f"![Image {extracted_pil_images_overall_count} (Fallback)]({image_url_or_error})\n"
                                 else:
                                     image_md_chunk += f"**Upload Error:** {str(image_url_or_error)}\n"
-                            if ocr_text: image_md_chunk += f"**OCR Text:**\n```\n{ocr_text}\n```\n\n"
-                            else: image_md_chunk += "\n"
                             yield yield_message("image_md", {"content": image_md_chunk})
                             time.sleep(0.01)
                 else:
-                    yield yield_message("status", {"message": "No valid source for image extraction."})
             except Exception as e:
-                logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
-                yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})
-        yield yield_message("final_status", {"message": "All processing stages complete."})
     except Exception as e:
         logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
@@ -366,12 +314,13 @@ def process_pdf_stream():
                 filename = secure_filename(pdf_file.filename)
                 os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
                 fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
-                os.close(fd)
-                pdf_file.save(temp_path)
-                outer_temp_pdf_path = temp_path
                 logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
-                pdf_input_source_for_generator = outer_temp_pdf_path
                 yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
                 time.sleep(0.01)
@@ -381,7 +330,7 @@ def process_pdf_stream():
                     yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
                     return
-                pdf_input_source_for_generator = unquoted_url
                 yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
                 time.sleep(0.01)
             else:
@@ -408,6 +357,6 @@ def process_pdf_stream():
 # --- Main Execution ---
 if __name__ == '__main__':
     if not check_poppler():
-        logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")
     os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
     app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=True, threaded=True)

 import os
 import io
+import re # No longer used for text formatting; kept only in case image-filename handling still relies on it
 import logging
 import subprocess
 from datetime import datetime
 import requests # For requests.exceptions.HTTPError
 from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
+# pdfplumber is no longer needed
+import pdf2image
+from pdf2image import convert_from_path, convert_from_bytes
 # from pdf2image.exceptions import ... # If you need to catch specific pdf2image errors
 import pytesseract
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
         repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
         os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
             temp_image_path = tmp_file.name
             try: os.remove(temp_image_path)
             except OSError as ose: logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
+# format_page_text_to_markdown_chunk function is removed as it's no longer used.
 # --- Main PDF Processing Logic (Generator Function for Streaming) ---
 def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
     try:
+        yield yield_message("markdown_replace", {"content": "# Extracted Images and OCR Text\n\n"})
         time.sleep(0.01)
+        actual_pdf_input_for_images = None
+        is_input_bytes = False
         source_is_url = isinstance(pdf_input_source_path_or_url, str) and \
                         pdf_input_source_path_or_url.startswith(('http://', 'https://'))
         if source_is_url:
+            yield yield_message("status", {"message": f"Downloading PDF from URL..."})
+            time.sleep(0.01)
             try:
                 response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
                 response.raise_for_status()
+                actual_pdf_input_for_images = response.content
+                is_input_bytes = True
+                yield yield_message("status", {"message": f"PDF downloaded from URL ({len(actual_pdf_input_for_images)/1024:.2f} KB)."})
                 time.sleep(0.01)
             except RequestsHTTPError as e:
+                logger.error(f"URL fetch HTTP error: {str(e)} (Status: {e.response.status_code if e.response else 'N/A'})", exc_info=True)
                 yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {e.response.status_code if e.response else 'N/A'}): {e.response.reason if e.response else str(e)}"})
                 return
             except requests.RequestException as e:
+                logger.error(f"URL fetch network error: {str(e)}", exc_info=True)
                 yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
                 return
         else:
+             actual_pdf_input_for_images = pdf_input_source_path_or_url
+             is_input_bytes = False
+             yield yield_message("status", {"message": f"Processing local PDF file..."})
+             time.sleep(0.01)
+        # ----- Direct Text Extraction (using pdfplumber) is REMOVED -----
+        # ----- Image Extraction and OCR -----
         if not check_poppler():
             yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
         else:
+            yield yield_message("status", {"message": "Starting image extraction and OCR..."})
+            # The "## Extracted Images" title is now more specific
+            yield yield_message("markdown_chunk", {"content": "## Extracted Images & OCR Text from PDF Pages\n\n"})
             if not HF_TOKEN:
                  yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})
             time.sleep(0.01)
+            extracted_pil_images_overall_count = 0
             try:
+                if actual_pdf_input_for_images:
+                    try: # Batched conversion attempt
                         pdf_info = None
+                        if is_input_bytes:
+                            pdf_info = pdf2image.pdfinfo_from_bytes(actual_pdf_input_for_images, userpw=None, poppler_path=None)
                         else:
+                            pdf_info = pdf2image.pdfinfo_from_path(actual_pdf_input_for_images, userpw=None, poppler_path=None)
                         num_image_pages = pdf_info.get("Pages", 0)
+                        yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for image conversion and OCR."})
                         batch_size = 1
                         for page_idx_start in range(1, num_image_pages + 1, batch_size):
                             page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
+                            yield yield_message("status", {"message": f"Converting PDF page(s) {page_idx_start}-{page_idx_end} to image(s)..."})
                             time.sleep(0.01)
                             page_images_pil = []
+                            if is_input_bytes:
+                                page_images_pil = convert_from_bytes(actual_pdf_input_for_images, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
                             else:
+                                page_images_pil = convert_from_path(actual_pdf_input_for_images, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
                             for img_idx_in_batch, img_pil in enumerate(page_images_pil):
                                 extracted_pil_images_overall_count += 1
+                                current_pdf_page_num = page_idx_start + img_idx_in_batch
                                 page_num_for_log = f"pdfpage_{current_pdf_page_num}"
                                 yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (from PDF page {current_pdf_page_num}) (OCR & Upload)..."})
                                 try:
                                     ocr_text = pytesseract.image_to_string(img_pil).strip()
                                     if ocr_text: yield yield_message("status", {"message": f"  OCR successful for image {extracted_pil_images_overall_count}."})
+                                    else: yield yield_message("status", {"message": f"  OCR complete for image {extracted_pil_images_overall_count} (no text found)."})
                                 except Exception as ocr_e:
                                     logger.error(f"OCR error for image {extracted_pil_images_overall_count}: {str(ocr_e)}")
                                     ocr_text = f"OCR failed: {str(ocr_e)}"
+                                image_md_chunk = f"### Image from PDF Page {current_pdf_page_num}\n"
                                 if HF_TOKEN:
+                                    image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_page_image", page_num_for_log)
                                     if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
+                                        image_md_chunk += f"![Image from PDF Page {current_pdf_page_num}]({image_url_or_error})\n"
                                         yield yield_message("status", {"message": f"  Image {extracted_pil_images_overall_count} uploaded."})
                                     else:
                                         image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Upload Error):** {str(image_url_or_error)}\n\n"
                                     image_md_chunk += f"**Image {extracted_pil_images_overall_count} (not uploaded due to missing HF_TOKEN)**\n"
                                 if ocr_text:
+                                    image_md_chunk += f"**OCR Text (from PDF Page {current_pdf_page_num}):**\n```\n{ocr_text}\n```\n\n"
+                                else:
+                                    image_md_chunk += f"_(No text detected by OCR for image from PDF page {current_pdf_page_num})_\n\n"
                                 yield yield_message("image_md", {"content": image_md_chunk})
                                 time.sleep(0.01)
                     except Exception as e_img_info:
                         logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
+                        yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk conversion."})
                         # Fallback to bulk conversion
                         bulk_images_pil = []
+                        if is_input_bytes:
+                            bulk_images_pil = convert_from_bytes(actual_pdf_input_for_images, dpi=150)
                         else:
+                            bulk_images_pil = convert_from_path(actual_pdf_input_for_images, dpi=150)
+                        yield yield_message("status", {"message": f"Fallback: Converted {len(bulk_images_pil)} PDF pages to images in bulk."})
                         for i, img_pil in enumerate(bulk_images_pil):
                             extracted_pil_images_overall_count +=1
+                            page_num_for_log = f"bulk_image_{i+1}"
+                            yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (bulk page {i+1}) (OCR & Upload)..."})
                             ocr_text = ""
                             try: ocr_text = pytesseract.image_to_string(img_pil).strip()
                             except Exception as e: ocr_text = f"OCR Error: {e}"
+                            image_md_chunk = f"### Image from PDF Page (Bulk {i+1})\n"
                             if HF_TOKEN:
+                                image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_page_image_fallback", page_num_for_log)
                                 if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
+                                    image_md_chunk += f"![Image {extracted_pil_images_overall_count} (Fallback)]({image_url_or_error})\n"
                                 else:
                                     image_md_chunk += f"**Upload Error:** {str(image_url_or_error)}\n"
+                            else:
+                                 image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Fallback - not uploaded)**\n"
+                            if ocr_text: image_md_chunk += f"**OCR Text (Bulk Page {i+1}):**\n```\n{ocr_text}\n```\n\n"
+                            else: image_md_chunk += f"_(No text detected by OCR for bulk image {i+1})_\n\n"
                             yield yield_message("image_md", {"content": image_md_chunk})
                             time.sleep(0.01)
                 else:
+                    yield yield_message("status", {"message": "No valid PDF input source provided for image extraction."})
             except Exception as e:
+                logger.error(f"Error during image extraction/OCR processing: {str(e)}", exc_info=True)
+                yield yield_message("error", {"message": f"Error during image extraction/OCR: {str(e)}"})
+        yield yield_message("final_status", {"message": "Image extraction and OCR processing complete."})
     except Exception as e:
         logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
                 filename = secure_filename(pdf_file.filename)
                 os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+                # Save to a temporary file that generate_pdf_conversion_stream can access by path
                 fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
+                os.close(fd) # Close the file descriptor from mkstemp
+                pdf_file.save(temp_path) # Save the uploaded file's content to this path
+                outer_temp_pdf_path = temp_path # Store for cleanup
                 logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
+                pdf_input_source_for_generator = outer_temp_pdf_path # Pass the path
                 yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
                 time.sleep(0.01)
                     yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
                     return
+                pdf_input_source_for_generator = unquoted_url # Pass the URL string
                 yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
                 time.sleep(0.01)
             else:
 # --- Main Execution ---
 if __name__ == '__main__':
     if not check_poppler():
+        logger.warning("Poppler utilities might not be installed correctly. Image processing might fail.")
     os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
     app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=True, threaded=True)