Spaces:

broadfield-dev
/

pdf2markdown

Sleeping

App Files Files Community

broadfield-dev committed on Jun 2

Commit

35151aa

verified ·

1 Parent(s): cf5a0c5

Update app.py

Browse files

Files changed (1) hide show

app.py +281 -251

app.py CHANGED Viewed

@@ -6,21 +6,29 @@ import subprocess
 from datetime import datetime
 import urllib.parse
 import tempfile
-from flask import Flask, request, render_template, redirect, url_for
-from werkzeug.utils import secure_filename # For secure file handling
 import requests
 import pdfplumber
 from pdf2image import convert_from_path, convert_from_bytes
 import pytesseract
 from PIL import Image
-from huggingface_hub import HfApi, create_repo
 # --- Flask App Initialization ---
 app = Flask(__name__)
-app.config['UPLOAD_FOLDER'] = tempfile.gettempdir() # Use system temp dir
-app.config['MAX_CONTENT_LENGTH'] = 30 * 1024 * 1024  # 30 MB limit for uploads
 # --- Logging Configuration ---
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -28,13 +36,19 @@ logger = logging.getLogger(__name__)
 # --- Hugging Face Configuration ---
 HF_TOKEN = os.getenv("HF_TOKEN")
-HF_DATASET_REPO_NAME = os.getenv("HF_DATASET_REPO_NAME", "pdf-images-extracted") # Allow override via env var
 hf_api = HfApi()
-# --- PDF Processing Helper Functions (Adapted from Gradio version) ---
 def check_poppler():
     try:
         result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
         version_info_log = result.stderr.strip() if result.stderr else result.stdout.strip()
@@ -51,20 +65,37 @@ def check_poppler():
         return False
 def ensure_hf_dataset():
     if not HF_TOKEN:
-        logger.warning("HF_TOKEN is not set. Cannot ensure Hugging Face dataset. Image uploads will fail.")
-        return "Error: HF_TOKEN is not set. Please configure it in Space secrets for image uploads."
     try:
         repo_id_obj = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
         logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
         return repo_id_obj.repo_id
     except Exception as e:
         logger.error(f"Hugging Face dataset error: {str(e)}", exc_info=True)
         return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
-def upload_image_to_hf(image_pil, filename_base):
     repo_id_or_error = ensure_hf_dataset()
     if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
         return repo_id_or_error
@@ -73,162 +104,199 @@ def upload_image_to_hf(image_pil, filename_base):
     temp_image_path = None
     try:
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-        repo_filename = f"images/{filename_base}_{timestamp}.png" # Path in repo
-        # Save PIL image to a temporary file to upload
         with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
             temp_image_path = tmp_file.name
         image_pil.save(temp_image_path, format="PNG")
         logger.info(f"Attempting to upload {temp_image_path} to {repo_id}/{repo_filename}")
         file_url = hf_api.upload_file(
-            path_or_fileobj=temp_image_path,
-            path_in_repo=repo_filename,
-            repo_id=repo_id,
-            repo_type="dataset",
-            token=HF_TOKEN
         )
         logger.info(f"Successfully uploaded image: {file_url}")
         return file_url
     except Exception as e:
-        logger.error(f"Image upload error for {filename_base}: {str(e)}", exc_info=True)
-        return f"Error uploading image {filename_base}: {str(e)}"
     finally:
         if temp_image_path and os.path.exists(temp_image_path):
-            try:
-                os.remove(temp_image_path)
-            except OSError as ose:
-                logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
-def extract_text_from_pdf(pdf_input_source): # pdf_input_source is URL string or local file path
-    try:
-        pdf_file_like_object = None
-        if isinstance(pdf_input_source, str) and pdf_input_source.startswith(('http://', 'https://')):
-            logger.info(f"Fetching PDF from URL for text extraction: {pdf_input_source}")
-            response = requests.get(pdf_input_source, stream=True, timeout=30)
-            response.raise_for_status()
-            pdf_file_like_object = io.BytesIO(response.content)
-            logger.info("PDF downloaded successfully from URL.")
-        elif isinstance(pdf_input_source, str) and os.path.exists(pdf_input_source): # Local file path
-            logger.info(f"Processing local PDF file for text extraction: {pdf_input_source}")
-            # pdfplumber.open can take a path directly
-            pdf_file_like_object = pdf_input_source
         else:
-            logger.error(f"Invalid pdf_input_source for text extraction: {pdf_input_source}")
-            return "Error: Invalid input for PDF text extraction (must be URL or valid file path)."
-        with pdfplumber.open(pdf_file_like_object) as pdf:
-            full_text = ""
-            for i, page in enumerate(pdf.pages):
-                page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
-                full_text += page_text + "\n\n"
-                tables = page.extract_tables()
-                if tables:
-                    for table_data in tables:
-                        if table_data:
-                            header = [" | ".join(str(cell) if cell is not None else "" for cell in table_data[0])]
-                            separator = [" | ".join(["---"] * len(table_data[0]))]
-                            body = [" | ".join(str(cell) if cell is not None else "" for cell in row) for row in table_data[1:]]
-                            table_md_lines = header + separator + body
-                            full_text += f"**Table:**\n" + "\n".join(table_md_lines) + "\n\n"
-        logger.info("Text and table extraction successful.")
-        return full_text.strip()
-    except requests.RequestException as e:
-        logger.error(f"URL fetch error for text extraction: {str(e)}", exc_info=True)
-        return f"Error fetching PDF from URL: {str(e)}"
-    except Exception as e:
-        logger.error(f"Text extraction error: {str(e)}", exc_info=True)
-        return f"Error extracting text: {str(e)}"
-def extract_images_from_pdf(pdf_input_source): # pdf_input_source is URL string or local file path
-    if not check_poppler():
-        return "Error: poppler-utils not found or not working correctly. Image extraction depends on it."
-    images_pil = []
     try:
-        if isinstance(pdf_input_source, str) and pdf_input_source.startswith(('http://', 'https://')):
-            logger.info(f"Fetching PDF from URL for image extraction: {pdf_input_source}")
-            response = requests.get(pdf_input_source, stream=True, timeout=30)
-            response.raise_for_status()
-            logger.info("PDF downloaded successfully from URL, converting to images.")
-            images_pil = convert_from_bytes(response.content, dpi=200)
-        elif isinstance(pdf_input_source, str) and os.path.exists(pdf_input_source): # Local file path
-            logger.info(f"Processing local PDF file for image extraction: {pdf_input_source}")
-            images_pil = convert_from_path(pdf_input_source, dpi=200)
         else:
-            logger.error(f"Invalid pdf_input_source for image extraction: {pdf_input_source}")
-            return "Error: Invalid input for PDF image extraction (must be URL or valid file path)."
-        logger.info(f"Successfully extracted {len(images_pil)} image(s) from PDF.")
-        return images_pil
-    except requests.RequestException as e:
-        logger.error(f"URL fetch error for image extraction: {str(e)}", exc_info=True)
-        return f"Error fetching PDF from URL for image extraction: {str(e)}"
     except Exception as e:
-        logger.error(f"Image extraction error: {str(e)}", exc_info=True)
-        return f"Error extracting images: {str(e)}"
-def format_to_markdown(text_content, images_input):
-    markdown_output = "# Extracted PDF Content\n\n"
-    if text_content.startswith("Error"): # If text extraction itself failed
-        markdown_output += f"**Text Extraction Note:**\n{text_content}\n\n"
-    else:
-        text_content = re.sub(r'\n\s*\n+', '\n\n', text_content.strip())
-        lines = text_content.split('\n')
-        is_in_list = False
-        for line_text in lines:
-            line_stripped = line_text.strip()
-            if not line_stripped:
-                markdown_output += "\n"
-                is_in_list = False
-                continue
-            list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
-            is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100
-            if is_heading_candidate and not list_match:
-                markdown_output += f"## {line_stripped}\n\n"
-                is_in_list = False
-            elif list_match:
-                list_item_text = list_match.group(1)
-                markdown_output += f"- {list_item_text}\n"
-                is_in_list = True
-            else:
-                if is_in_list: markdown_output += "\n"
-                markdown_output += f"{line_text}\n\n"
-                is_in_list = False
-        markdown_output = re.sub(r'\n\s*\n+', '\n\n', markdown_output.strip()) + "\n\n"
-    if isinstance(images_input, list) and images_input:
-        markdown_output += "## Extracted Images\n\n"
-        if not HF_TOKEN:
-            markdown_output += "**Note:** `HF_TOKEN` not set. Images were extracted but not uploaded to Hugging Face Hub.\n\n"
-        for i, img_pil in enumerate(images_input):
-            ocr_text = ""
-            try:
-                ocr_text = pytesseract.image_to_string(img_pil).strip()
-                logger.info(f"OCR for image {i+1} successful.")
-            except Exception as ocr_e:
-                logger.error(f"OCR error for image {i+1}: {str(ocr_e)}")
-                ocr_text = f"OCR failed: {str(ocr_e)}"
-            if HF_TOKEN: # Only attempt upload if token is present
-                image_filename_base = f"extracted_image_{i+1}"
-                image_url_or_error = upload_image_to_hf(img_pil, image_filename_base)
-                if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
-                    markdown_output += f"![Image {i+1}]({image_url_or_error})\n"
-                else:
-                    markdown_output += f"**Image {i+1} (Upload Error):** {str(image_url_or_error)}\n\n"
-            else: # No token, show placeholder or local info if we were saving them locally
-                 markdown_output += f"**Image {i+1} (not uploaded due to missing HF_TOKEN)**\n"
-            if ocr_text:
-                markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
-    elif isinstance(images_input, str) and images_input.startswith("Error"):
-        markdown_output += f"## Image Extraction Note\n\n{images_input}\n\n"
-    return markdown_output.strip()
 # --- Flask Routes ---
@@ -236,114 +304,76 @@ def format_to_markdown(text_content, images_input):
 def index():
     return render_template('index.html')
-@app.route('/process', methods=['POST'])
-def process_pdf_route():
     pdf_file = request.files.get('pdf_file')
     pdf_url = request.form.get('pdf_url', '').strip()
-    status_message = "Starting PDF processing..."
-    error_message = None
-    markdown_output = None
-    temp_pdf_path = None
-    pdf_input_source = None # This will be a URL string or a local file path
-    try:
-        if pdf_file and pdf_file.filename:
-            if not pdf_file.filename.lower().endswith('.pdf'):
-                raise ValueError("Uploaded file is not a PDF.")
-            filename = secure_filename(pdf_file.filename)
-            # Save to a temporary file
-            fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
-            os.close(fd) # close file descriptor from mkstemp
-            pdf_file.save(temp_pdf_path)
-            logger.info(f"Uploaded PDF saved to temporary path: {temp_pdf_path}")
-            pdf_input_source = temp_pdf_path
-            status_message = f"Processing uploaded PDF: {filename}"
-        elif pdf_url:
-            pdf_url = urllib.parse.unquote(pdf_url)
-            # Basic URL validation
-            if not (pdf_url.startswith('http://') or pdf_url.startswith('https://')):
-                raise ValueError("Invalid URL scheme. Must be http or https.")
-            if not pdf_url.lower().endswith('.pdf'):
-                 logger.warning(f"URL {pdf_url} does not end with .pdf. Proceeding with caution.")
-                # Allow proceeding but log warning, actual check is content-type or processing error
-            # Quick check with HEAD request (optional, but good practice)
-            try:
-                head_resp = requests.head(pdf_url, allow_redirects=True, timeout=10)
-                head_resp.raise_for_status()
-                content_type = head_resp.headers.get('content-type', '').lower()
-                if 'application/pdf' not in content_type:
-                    logger.warning(f"URL {pdf_url} content-type is '{content_type}', not 'application/pdf'.")
-                    # Depending on strictness, could raise ValueError here
-            except requests.RequestException as re:
-                logger.error(f"Failed HEAD request for URL {pdf_url}: {re}")
-                # Proceed, main request in extract functions will handle final failure
-            pdf_input_source = pdf_url
-            status_message = f"Processing PDF from URL: {pdf_url}"
-        else:
-            raise ValueError("No PDF file uploaded and no PDF URL provided.")
-        # --- Core Processing ---
-        status_message += "\nExtracting text..."
-        logger.info(status_message)
-        extracted_text = extract_text_from_pdf(pdf_input_source)
-        if isinstance(extracted_text, str) and extracted_text.startswith("Error"):
-            # Let format_to_markdown handle displaying this error within its structure
-            logger.error(f"Text extraction resulted in error: {extracted_text}")
-        status_message += "\nExtracting images..."
-        logger.info(status_message)
-        extracted_images = extract_images_from_pdf(pdf_input_source) # list of PIL images or error string
-        if isinstance(extracted_images, str) and extracted_images.startswith("Error"):
-            logger.error(f"Image extraction resulted in error: {extracted_images}")
-        status_message += "\nFormatting to Markdown..."
-        logger.info(status_message)
-        markdown_output = format_to_markdown(extracted_text, extracted_images)
-        status_message = "Processing complete."
-        if isinstance(extracted_text, str) and extracted_text.startswith("Error"):
-             status_message += f" (Text extraction issues: {extracted_text.split(':', 1)[1].strip()})"
-        if isinstance(extracted_images, str) and extracted_images.startswith("Error"):
-             status_message += f" (Image extraction issues: {extracted_images.split(':', 1)[1].strip()})"
-        if not HF_TOKEN and isinstance(extracted_images, list) and extracted_images:
-            status_message += " (Note: HF_TOKEN not set, images not uploaded to Hub)"
-    except ValueError as ve:
-        logger.error(f"Input validation error: {str(ve)}")
-        error_message = str(ve)
-        status_message = "Processing failed."
-    except Exception as e:
-        logger.error(f"An unexpected error occurred during processing: {str(e)}", exc_info=True)
-        error_message = f"An unexpected error occurred: {str(e)}"
-        status_message = "Processing failed due to an unexpected error."
-    finally:
-        if temp_pdf_path and os.path.exists(temp_pdf_path):
-            try:
-                os.remove(temp_pdf_path)
-                logger.info(f"Removed temporary PDF: {temp_pdf_path}")
-            except OSError as ose:
-                logger.error(f"Error removing temporary PDF {temp_pdf_path}: {ose}")
-    return render_template('index.html',
-                           markdown_output=markdown_output,
-                           status_message=status_message,
-                           error_message=error_message)
 # --- Main Execution ---
 if __name__ == '__main__':
-    # This is for local development. For Hugging Face Spaces, Gunicorn is used via Dockerfile CMD.
-    # Poppler check at startup for local dev convenience
-    if not check_poppler():
         logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")
-    # Ensure UPLOAD_FOLDER exists
     os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
-    app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=True)

 from datetime import datetime
 import urllib.parse
 import tempfile
+import json # For streaming JSON messages
+import time # For gevent.sleep
+from flask import Flask, request, render_template, Response, stream_with_context
+from werkzeug.utils import secure_filename
+# Ensure gevent is imported and monkey patched if needed for other libraries
+# that might not be gevent-friendly. For built-in libs and requests (with Gunicorn gevent worker),
+# this is often handled by Gunicorn.
+# from gevent import monkey
+# monkey.patch_all() # Apply this early if you suspect issues with other libs
 import requests
 import pdfplumber
 from pdf2image import convert_from_path, convert_from_bytes
 import pytesseract
 from PIL import Image
+from huggingface_hub import HfApi, create_repo, HfHubHTTPError
 # --- Flask App Initialization ---
 app = Flask(__name__)
+app.config['UPLOAD_FOLDER'] = tempfile.gettempdir()
+app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024  # 50 MB limit for uploads, adjust as needed
 # --- Logging Configuration ---
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 # --- Hugging Face Configuration ---
 HF_TOKEN = os.getenv("HF_TOKEN")
+HF_DATASET_REPO_NAME = os.getenv("HF_DATASET_REPO_NAME", "pdf-images-extracted")
 hf_api = HfApi()
+# --- Helper to yield messages for streaming ---
+def yield_message(type, data):
+    """Serialize one stream message as a newline-terminated JSON line.
+
+    Despite the name, this function returns the string; the caller is the
+    one that yields it.
+
+    Args:
+        type: Message kind stored under the "type" key. Callers in this file
+            pass strings such as "status", "error" and "markdown_chunk".
+        data: Mapping whose items are merged into the message. It is unpacked
+            after "type", so a "type" key inside `data` overrides `type`.
+
+    Returns:
+        The JSON document followed by a single newline character.
+    """
+    # Add a newline so client can easily split messages
+    return json.dumps({"type": type, **data}) + "\n"
+# --- PDF Processing Helper Functions (Adapted for Streaming) ---
 def check_poppler():
+    # (Same as before)
     try:
         result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
         version_info_log = result.stderr.strip() if result.stderr else result.stdout.strip()
         return False
 def ensure_hf_dataset():
+    """Ensure the image dataset repo exists and return its id, or an error string.
+
+    Returns:
+        One of three string shapes:
+        * `repo_id_obj.repo_id` as reported by `create_repo` on success;
+        * on HTTP 409, "<namespace>/<HF_DATASET_REPO_NAME>" when `whoami`
+          yields a name, otherwise "hf://datasets/<HF_DATASET_REPO_NAME>";
+        * a message starting with "Error" when the token is missing or the
+          Hub call fails for any other reason.
+    """
+    # (Same as before, but logs info useful for streaming if an error occurs)
     if not HF_TOKEN:
+        msg = "HF_TOKEN is not set. Cannot ensure Hugging Face dataset. Image uploads will fail."
+        logger.warning(msg)
+        return "Error: " + msg
     try:
         repo_id_obj = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
         logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
         return repo_id_obj.repo_id
+    except HfHubHTTPError as e:
+        # 409 is handled as "repo already exists" rather than as a failure.
+        # NOTE(review): exist_ok=True is already passed above, so this branch
+        # is presumably rarely reached - confirm against huggingface_hub docs.
+        if e.response.status_code == 409:
+             logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists.")
+             # Attempt to construct the full repo_id (namespace/repo_name)
+             try:
+                 user_info = hf_api.whoami(token=HF_TOKEN)
+                 namespace = user_info.get('name') if user_info else None
+                 if namespace:
+                     return f"{namespace}/{HF_DATASET_REPO_NAME}"
+             except Exception as whoami_e:
+                 logger.error(f"Could not determine namespace for existing repo via whoami: {whoami_e}")
+             # NOTE(review): this value does not start with "Error", so a caller
+             # that only checks that prefix will use it as a repo id - confirm
+             # that the upload call accepts this form.
+             return f"hf://datasets/{HF_DATASET_REPO_NAME}" # Fallback, might not be full id
+        logger.error(f"Hugging Face dataset error (HTTP {e.response.status_code}): {str(e)}")
+        return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
     except Exception as e:
         logger.error(f"Hugging Face dataset error: {str(e)}", exc_info=True)
         return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
+def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
+    # (Adapted to potentially yield status during this sub-process if it were longer)
+    # For now, it's synchronous but part of the larger stream.
     repo_id_or_error = ensure_hf_dataset()
     if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
         return repo_id_or_error
     temp_image_path = None
     try:
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
         with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
             temp_image_path = tmp_file.name
         image_pil.save(temp_image_path, format="PNG")
         logger.info(f"Attempting to upload {temp_image_path} to {repo_id}/{repo_filename}")
         file_url = hf_api.upload_file(
+            path_or_fileobj=temp_image_path, path_in_repo=repo_filename,
+            repo_id=repo_id, repo_type="dataset", token=HF_TOKEN
         )
         logger.info(f"Successfully uploaded image: {file_url}")
         return file_url
     except Exception as e:
+        logger.error(f"Image upload error for {filename_base}{page_num_for_log}: {str(e)}", exc_info=True)
+        return f"Error uploading image {filename_base}{page_num_for_log}: {str(e)}"
     finally:
         if temp_image_path and os.path.exists(temp_image_path):
+            try: os.remove(temp_image_path)
+            except OSError as ose: logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
+def format_page_text_to_markdown_chunk(page_text_content):
+    """Format one page's extracted text as a Markdown chunk.
+
+    Each non-blank line is classified in this order:
+      * all-uppercase, between 6 and 99 characters, and not a list item:
+        emitted as a "## " heading;
+      * starts with "1." style numbering or one of "*", "+", "-" followed by
+        whitespace: emitted as a "- " bullet (numbering is not preserved);
+      * anything else: emitted unchanged as its own paragraph.
+
+    Args:
+        page_text_content: Raw text of a single page.
+
+    Returns:
+        The Markdown chunk with runs of blank lines collapsed, always ending
+        in exactly one blank line ("\\n\\n").
+    """
+    chunk_md = ""
+    # Normalize newlines: multiple consecutive newlines become a single blank line (two \n chars)
+    page_text_content = re.sub(r'\n\s*\n+', '\n\n', page_text_content.strip())
+    lines = page_text_content.split('\n')
+    # Tracks whether the previous emitted line was a bullet, so the list can be
+    # closed with a blank line before the next paragraph.
+    is_in_list = False
+    for line_text in lines:
+        line_stripped = line_text.strip()
+        if not line_stripped:
+            chunk_md += "\n"
+            is_in_list = False
+            continue
+        list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
+        # Heuristic: a short-ish line written entirely in capitals is a heading.
+        is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100
+        if is_heading_candidate and not list_match:
+            chunk_md += f"## {line_stripped}\n\n"
+            is_in_list = False
+        elif list_match:
+            list_item_text = list_match.group(1)
+            chunk_md += f"- {list_item_text}\n"
+            is_in_list = True
         else:
+            if is_in_list: chunk_md += "\n"
+            # The unstripped line is emitted here, so leading whitespace from
+            # the extractor is kept.
+            chunk_md += f"{line_text}\n\n"
+            is_in_list = False
+    # Collapse the extra blank lines produced above, then terminate the chunk.
+    return re.sub(r'\n\s*\n+', '\n\n', chunk_md.strip()) + "\n\n"
+# --- Main PDF Processing Logic (Generator Function for Streaming) ---
+def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
+    """
+    Processes the PDF incrementally and yields status messages and markdown content.
+    `pdf_input_source_path_or_url` is a local file path or a URL string.
+
+    Every yielded item is a string built by `yield_message`. The message
+    types emitted here are "markdown_replace" (initial title), "status",
+    "markdown_chunk" (page text, tables, section headers), "image_md"
+    (one per rasterized page), "error" and "final_status".
+
+    Flow: a URL source is downloaded once and reused for both stages; text
+    and tables are extracted page by page; then, if Poppler is available,
+    pages are rasterized, OCR'd and (when HF_TOKEN is set) uploaded. A failure
+    in the text stage is reported and does not stop the image stage; a failed
+    download ends the stream.
+    """
     try:
+        # Initial Markdown Title
+        yield yield_message("markdown_replace", {"content": "# Extracted PDF Content\n\n"})
+        time.sleep(0.01) # Give gevent a chance to yield
+        # 1. Text and Table Extraction (Page by Page)
+        yield yield_message("status", {"message": "Opening PDF for text extraction..."})
+        time.sleep(0.01)
+        source_is_url = isinstance(pdf_input_source_path_or_url, str) and \
+                        pdf_input_source_path_or_url.startswith(('http://', 'https://'))
+        pdf_handle_for_text = None
+        pdf_bytes_for_images = None # Store bytes if downloaded from URL for image extraction
+        if source_is_url:
+            try:
+                # NOTE(review): stream=True has little effect here because
+                # `.content` below reads the whole body into memory.
+                response = requests.get(pdf_input_source_path_or_url, stream=True, timeout=60) # Increased timeout
+                response.raise_for_status()
+                pdf_bytes_for_images = response.content # Read all content for pdf2image
+                pdf_handle_for_text = io.BytesIO(pdf_bytes_for_images) # Use BytesIO for pdfplumber
+                yield yield_message("status", {"message": f"PDF downloaded from URL ({len(pdf_bytes_for_images)/1024:.2f} KB)."})
+                time.sleep(0.01)
+            except requests.RequestException as e:
+                logger.error(f"URL fetch error for PDF processing: {str(e)}", exc_info=True)
+                yield yield_message("error", {"message": f"Error fetching PDF from URL: {str(e)}"})
+                return # Stop generation
+        else: # Local file path
+             pdf_handle_for_text = pdf_input_source_path_or_url # pdfplumber takes path
+        total_text_pages = 0
+        try:
+            with pdfplumber.open(pdf_handle_for_text) as pdf:
+                total_text_pages = len(pdf.pages)
+                yield yield_message("status", {"message": f"Found {total_text_pages} page(s) for text extraction."})
+                time.sleep(0.01)
+                for i, page in enumerate(pdf.pages):
+                    yield yield_message("status", {"message": f"Extracting text from page {i+1}/{total_text_pages}..."})
+                    time.sleep(0.01) # gevent yield
+                    page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
+                    page_tables_md = ""
+                    tables = page.extract_tables()
+                    if tables:
+                        for table_idx, table_data in enumerate(tables):
+                            if table_data:
+                                yield yield_message("status", {"message": f"  Processing table {table_idx+1} on page {i+1}..."})
+                                # First row is used as the header; None cells become empty strings.
+                                header = [" | ".join(str(cell) if cell is not None else "" for cell in table_data[0])]
+                                separator = [" | ".join(["---"] * len(table_data[0]))]
+                                body = [" | ".join(str(cell) if cell is not None else "" for cell in row) for row in table_data[1:]]
+                                table_md_lines = header + separator + body
+                                page_tables_md += f"**Table (Page {i+1}):**\n" + "\n".join(table_md_lines) + "\n\n"
+                    # Page text is sent first, then that page's tables as a separate chunk.
+                    formatted_page_text_md = format_page_text_to_markdown_chunk(page_text)
+                    yield yield_message("markdown_chunk", {"content": formatted_page_text_md})
+                    if page_tables_md:
+                        yield yield_message("markdown_chunk", {"content": page_tables_md})
+                    time.sleep(0.01) # gevent yield
+        except Exception as e:
+            logger.error(f"Error during PDF text/table extraction: {str(e)}", exc_info=True)
+            yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
+            # Continue to image extraction if possible, or return based on severity
+        # 2. Image Extraction and OCR
+        if not check_poppler():
+            yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
         else:
+            yield yield_message("status", {"message": "Starting image extraction..."})
+            yield yield_message("markdown_chunk", {"content": "## Extracted Images\n\n"})
+            if not HF_TOKEN:
+                 yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})
+            time.sleep(0.01)
+            extracted_pil_images = []
+            try:
+                # A URL source whose download was empty matches neither branch,
+                # leaving the list empty.
+                if source_is_url and pdf_bytes_for_images:
+                    # Use the already downloaded bytes
+                    extracted_pil_images = convert_from_bytes(pdf_bytes_for_images, dpi=150) # Lower DPI for speed/memory
+                elif not source_is_url: # local file path
+                    extracted_pil_images = convert_from_path(pdf_input_source_path_or_url, dpi=150)
+                yield yield_message("status", {"message": f"Found {len(extracted_pil_images)} image(s) in PDF (these are rasterized pages for now)."})
+                time.sleep(0.01)
+                # TODO: Implement more granular image extraction if pdf2image supports it,
+                # or if you integrate a library that can extract embedded images directly.
+                # For now, convert_from_path/bytes often gives full pages as images.
+                for i, img_pil in enumerate(extracted_pil_images):
+                    page_num_for_log = f"page_{i+1}" # Assuming one image per page from convert_from_path
+                    yield yield_message("status", {"message": f"Processing image {i+1}/{len(extracted_pil_images)} (OCR & Upload)..."})
+                    time.sleep(0.01)
+                    ocr_text = ""
+                    try:
+                        ocr_text = pytesseract.image_to_string(img_pil).strip()
+                        if ocr_text:
+                            yield yield_message("status", {"message": f"  OCR successful for image {i+1}."})
+                    except Exception as ocr_e:
+                        # The failure text is kept in ocr_text so it appears in
+                        # the Markdown output in place of the OCR result.
+                        logger.error(f"OCR error for image {i+1}: {str(ocr_e)}")
+                        ocr_text = f"OCR failed: {str(ocr_e)}"
+                    image_md_chunk = ""
+                    if HF_TOKEN:
+                        # The upload helper returns either a URL or a string starting with "Error".
+                        image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image", page_num_for_log)
+                        if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
+                            image_md_chunk += f"![Image {i+1}]({image_url_or_error})\n"
+                            yield yield_message("status", {"message": f"  Image {i+1} uploaded."})
+                        else:
+                            image_md_chunk += f"**Image {i+1} (Upload Error):** {str(image_url_or_error)}\n\n"
+                            yield yield_message("error", {"message": f"Failed to upload image {i+1}: {str(image_url_or_error)}"})
+                    else:
+                        image_md_chunk += f"**Image {i+1} (not uploaded due to missing HF_TOKEN)**\n"
+                    if ocr_text:
+                        image_md_chunk += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
+                    yield yield_message("image_md", {"content": image_md_chunk})
+                    time.sleep(0.01) # gevent yield
+            except Exception as e:
+                logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
+                yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})
+        # Sent even when an earlier stage reported an error.
+        yield yield_message("final_status", {"message": "All processing stages complete."})
     except Exception as e:
+        logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
+        yield yield_message("error", {"message": f"Critical processing error: {str(e)}"})
 # --- Flask Routes ---
 def index():
     return render_template('index.html')
+@app.route('/process-stream', methods=['POST'])
+def process_pdf_stream():
+    """Stream the conversion of an uploaded or remote PDF as NDJSON.
+
+    Reads `pdf_file` (multipart upload) or `pdf_url` (form field); an upload
+    takes precedence when both are present. The response body is a sequence
+    of JSON lines produced by `yield_message`: validation errors, progress
+    from `generate_pdf_conversion_stream`, and finally a cleanup notice when
+    a temporary upload file was involved.
+
+    Returns:
+        A streaming `Response` with mimetype 'application/x-ndjson'.
+    """
     pdf_file = request.files.get('pdf_file')
     pdf_url = request.form.get('pdf_url', '').strip()
+    temp_pdf_path = None # To store path of uploaded file for cleanup
+    pdf_input_source_for_generator = None
+    def stream_processor():
+        nonlocal temp_pdf_path # Make it accessible in this inner function for cleanup
+        nonlocal pdf_input_source_for_generator
+        # The cleanup notice is built inside `finally` but yielded only after
+        # it. Yielding from `finally` while the generator is being closed
+        # (e.g. the client disconnected) makes Python raise
+        # "RuntimeError: generator ignored GeneratorExit".
+        cleanup_message = None
+        try:
+            if pdf_file and pdf_file.filename:
+                if not pdf_file.filename.lower().endswith('.pdf'):
+                    yield yield_message("error", {"message": "Uploaded file is not a PDF."})
+                    return
+                filename = secure_filename(pdf_file.filename)
+                # Save to a temporary file (ensure UPLOAD_FOLDER is writable by app user)
+                os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+                fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
+                os.close(fd)
+                pdf_file.save(temp_pdf_path)
+                logger.info(f"Uploaded PDF saved to temporary path: {temp_pdf_path}")
+                pdf_input_source_for_generator = temp_pdf_path
+                yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
+                time.sleep(0.01)
+            elif pdf_url:
+                unquoted_url = urllib.parse.unquote(pdf_url)
+                if not (unquoted_url.startswith('http://') or unquoted_url.startswith('https://')):
+                    yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
+                    return
+                # Consider a light check for .pdf extension, but content-type is more reliable
+                pdf_input_source_for_generator = unquoted_url
+                yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
+                time.sleep(0.01)
+            else:
+                yield yield_message("error", {"message": "No PDF file uploaded and no PDF URL provided."})
+                return
+            # Yield from the main generator
+            for message_part in generate_pdf_conversion_stream(pdf_input_source_for_generator):
+                yield message_part
+                # time.sleep(0.01) # Allow gevent to switch context, important for streaming
+        except Exception as e:
+            # GeneratorExit is not an Exception subclass, so a client
+            # disconnect bypasses this handler and goes straight to `finally`.
+            logger.error(f"Error setting up stream or in initial validation: {str(e)}", exc_info=True)
+            yield yield_message("error", {"message": f"Setup error: {str(e)}"})
+        finally:
+            # Always remove the uploaded temp file, including on early close.
+            if temp_pdf_path and os.path.exists(temp_pdf_path):
+                try:
+                    os.remove(temp_pdf_path)
+                    logger.info(f"Cleaned up temporary PDF: {temp_pdf_path}")
+                    cleanup_message = yield_message("status", {"message": "Cleaned up temporary file."})
+                except OSError as ose:
+                    logger.error(f"Error removing temporary PDF {temp_pdf_path}: {ose}")
+                    cleanup_message = yield_message("error", {"message": f"Could not clean temp file: {ose}"})
+        # Reached on normal completion and after a handled error. Skipped when
+        # the generator is closed early and on the validation `return`s above,
+        # none of which leave a temp file behind.
+        if cleanup_message is not None:
+            yield cleanup_message
+    # Using stream_with_context for proper handling of request context within the generator
+    return Response(stream_with_context(stream_processor()), mimetype='application/x-ndjson')
 # --- Main Execution ---
 if __name__ == '__main__':
+    if not check_poppler(): # Check Poppler at startup for local dev
         logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")
     os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+    # For local dev, Flask's built-in server is fine. Gunicorn handles production.
+    # The 'threaded=True' or using gevent server locally can also help test streaming.
+    # Debug mode enables the interactive Werkzeug debugger, which allows
+    # arbitrary code execution. Because the server binds to 0.0.0.0, debug is
+    # opt-in via FLASK_DEBUG=1 instead of being always on.
+    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes", "on")
+    app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=debug_enabled, threaded=True)