Spaces:

broadfield-dev
/

pdf2markdown

Sleeping

App Files Files Community

pdf2markdown / app.py

broadfield-dev

Update app.py

35151aa verified 3 months ago

raw

history blame

19.5 kB

	import os
	import io
	import re
	import logging
	import subprocess
	from datetime import datetime
	import urllib.parse
	import tempfile
	import json # For streaming JSON messages
	import time # For gevent.sleep

	from flask import Flask, request, render_template, Response, stream_with_context
	from werkzeug.utils import secure_filename

	# Ensure gevent is imported and monkey patched if needed for other libraries
	# that might not be gevent-friendly. For built-in libs and requests (with Gunicorn gevent worker),
	# this is often handled by Gunicorn.
	# from gevent import monkey
	# monkey.patch_all() # Apply this early if you suspect issues with other libs

	import requests
	import pdfplumber
	from pdf2image import convert_from_path, convert_from_bytes
	import pytesseract
	from PIL import Image
	from huggingface_hub import HfApi, create_repo, HfHubHTTPError

	# --- Flask App Initialization ---
	app = Flask(__name__)
	app.config.update(
	    # Uploaded PDFs and intermediate PNG files are written to the system temp directory.
	    UPLOAD_FOLDER=tempfile.gettempdir(),
	    # 50 MB limit for uploads, adjust as needed.
	    MAX_CONTENT_LENGTH=50 * 1024 * 1024,
	)

	# --- Logging Configuration ---
	logging.basicConfig(
	    format="%(asctime)s - %(levelname)s - %(message)s",
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)

	# --- Hugging Face Configuration ---
	# Both values come from the environment; HF_TOKEN has no default and may be None.
	HF_TOKEN = os.environ.get("HF_TOKEN")
	HF_DATASET_REPO_NAME = os.environ.get("HF_DATASET_REPO_NAME", "pdf-images-extracted")
	hf_api = HfApi()

	# --- Helper to yield messages for streaming ---
	def yield_message(type, data):
	    """Serialize one stream event as a single newline-terminated JSON line.

	    Args:
	        type: Event kind placed under the ``"type"`` key.
	        data: Mapping of extra fields merged into the event; a ``"type"``
	            key in ``data`` overrides the ``type`` argument.

	    Returns:
	        The JSON text followed by ``"\\n"`` so a client can split the
	        stream into messages on newlines.
	    """
	    event = {"type": type}
	    event.update(data)
	    serialized = json.dumps(event)
	    return f"{serialized}\n"

	# --- PDF Processing Helper Functions (Adapted for Streaming) ---

	def check_poppler():
	    """Report whether the Poppler ``pdftoppm`` command can be launched.

	    Runs ``pdftoppm -v`` and logs the first line of its output (stderr if it
	    is non-empty, otherwise stdout).  The exit status is not inspected:
	    being able to start the command counts as success.

	    Returns:
	        True when the command ran; False when it was not found or any other
	        exception occurred while running it.
	    """
	    try:
	        completed = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
	        version_text = (completed.stderr or completed.stdout).strip()
	        if not version_text:
	            logger.info("Poppler 'pdftoppm -v' ran. Assuming Poppler is present.")
	        else:
	            # A non-empty stripped string always has a first line.
	            first_line = version_text.splitlines()[0]
	            logger.info(f"Poppler version check: {first_line}")
	        return True
	    except FileNotFoundError:
	        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in PATH.")
	        return False
	    except Exception as e:
	        logger.error(f"An unexpected error occurred during Poppler check: {str(e)}")
	        return False

	def ensure_hf_dataset():
	    """Create the configured Hugging Face dataset repo if needed and return its id.

	    Failures are reported through the return value rather than raised.

	    Returns:
	        The repo id string on success, or a string starting with
	        ``"Error"`` describing the failure.  Callers distinguish the two
	        cases by that prefix.
	    """
	    if not HF_TOKEN:
	        reason = "HF_TOKEN is not set. Cannot ensure Hugging Face dataset. Image uploads will fail."
	        logger.warning(reason)
	        return "Error: " + reason

	    try:
	        repo = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
	        logger.info(f"Dataset repo ensured: {repo.repo_id}")
	        return repo.repo_id
	    except HfHubHTTPError as http_err:
	        status = http_err.response.status_code
	        if status != 409:
	            logger.error(f"Hugging Face dataset error (HTTP {status}): {str(http_err)}")
	            return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(http_err)}"

	        # HTTP 409: the repo already exists, so rebuild its id as namespace/repo_name.
	        logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists.")
	        try:
	            account = hf_api.whoami(token=HF_TOKEN)
	            owner = account.get('name') if account else None
	            if owner:
	                return f"{owner}/{HF_DATASET_REPO_NAME}"
	        except Exception as whoami_e:
	            logger.error(f"Could not determine namespace for existing repo via whoami: {whoami_e}")
	        # Fallback, might not be full id
	        return f"hf://datasets/{HF_DATASET_REPO_NAME}"
	    except Exception as e:
	        logger.error(f"Hugging Face dataset error: {str(e)}", exc_info=True)
	        return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"


	def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
	    """Save an image as a temporary PNG and upload it to the dataset repo.

	    Args:
	        image_pil: Image object exposing ``save(path, format=...)``.
	        filename_base: Prefix for the file name inside the repo.
	        page_num_for_log: Label inserted into the repo file name and used in
	            log and error messages.

	    Returns:
	        Whatever ``hf_api.upload_file`` returns on success, otherwise a
	        string starting with ``"Error"``.  The temporary PNG is removed in
	        every case.
	    """
	    dataset_ref = ensure_hf_dataset()
	    if isinstance(dataset_ref, str) and dataset_ref.startswith("Error"):
	        return dataset_ref

	    scratch_path = None
	    try:
	        # Microsecond timestamp keeps repo file names unique across uploads.
	        stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
	        path_in_repo = f"images/{filename_base}_{page_num_for_log}_{stamp}.png"

	        with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as scratch:
	            scratch_path = scratch.name
	            image_pil.save(scratch_path, format="PNG")

	        logger.info(f"Attempting to upload {scratch_path} to {dataset_ref}/{path_in_repo}")
	        uploaded = hf_api.upload_file(
	            path_or_fileobj=scratch_path,
	            path_in_repo=path_in_repo,
	            repo_id=dataset_ref,
	            repo_type="dataset",
	            token=HF_TOKEN,
	        )
	        logger.info(f"Successfully uploaded image: {uploaded}")
	        return uploaded
	    except Exception as e:
	        logger.error(f"Image upload error for {filename_base}{page_num_for_log}: {str(e)}", exc_info=True)
	        return f"Error uploading image {filename_base}{page_num_for_log}: {str(e)}"
	    finally:
	        # delete=False above means the temporary file must be removed by hand.
	        if scratch_path and os.path.exists(scratch_path):
	            try:
	                os.remove(scratch_path)
	            except OSError as ose:
	                logger.error(f"Error removing temp image file {scratch_path}: {ose}")


	# Matches an ordered ("1. text") or unordered ("- text", "+ text", "* text")
	# list item at the start of an already-stripped line; group 1 is the item text.
	_LIST_ITEM_RE = re.compile(r'^(?:\d+\.|[-+*])\s+(.*)')


	def format_page_text_to_markdown_chunk(page_text_content):
	    """Format a single page's text content into a markdown chunk.

	    Each line is classified in this order:

	    * blank line            -> paragraph break, ends any running list
	    * list item             -> ``- <text>`` (numbered items become bullets too)
	    * all-caps line whose stripped length is between 6 and 99 characters
	                            -> ``## <line>`` heading
	    * anything else         -> its own paragraph (original line text kept)

	    Args:
	        page_text_content: Raw text of one page.

	    Returns:
	        The markdown text, with runs of blank lines collapsed to one and
	        exactly two trailing newlines.  Empty input yields ``"\\n\\n"``.
	    """
	    # Normalize newlines: multiple consecutive newlines become a single blank line.
	    normalized = re.sub(r'\n\s*\n+', '\n\n', page_text_content.strip())

	    parts = []
	    in_list = False
	    for raw_line in normalized.split('\n'):
	        stripped = raw_line.strip()
	        if not stripped:
	            parts.append("\n")
	            in_list = False
	            continue

	        # List detection runs before the heading test so an upper-case
	        # bullet ("- TODO ITEM") is not promoted to a heading.
	        item = _LIST_ITEM_RE.match(stripped)
	        if item:
	            parts.append(f"- {item.group(1)}\n")
	            in_list = True
	        elif stripped.isupper() and 5 < len(stripped) < 100:
	            parts.append(f"## {stripped}\n\n")
	            in_list = False
	        else:
	            if in_list:
	                # Blank line so the paragraph is not absorbed into the list.
	                parts.append("\n")
	            parts.append(f"{raw_line}\n\n")
	            in_list = False

	    return re.sub(r'\n\s*\n+', '\n\n', "".join(parts).strip()) + "\n\n"


	# --- Main PDF Processing Logic (Generator Function for Streaming) ---

	def _table_to_markdown_lines(table_data):
	    """Render one extracted table as the lines of a Markdown pipe table.

	    ``table_data`` is a non-empty list of rows, each row a list of cell
	    values (``None`` for an empty cell).  The first row becomes the header.
	    """
	    def render_row(cells):
	        rendered = []
	        for cell in cells:
	            text = "" if cell is None else str(cell)
	            # A raw "|" would start a new column and a raw newline would end
	            # the row, so neutralise both inside cell text.
	            rendered.append(text.replace("|", "\\|").replace("\n", " "))
	        return "| " + " | ".join(rendered) + " |"

	    header_row = table_data[0]
	    lines = [render_row(header_row), render_row(["---"] * len(header_row))]
	    lines.extend(render_row(row) for row in table_data[1:])
	    return lines


	def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
	    """Process a PDF incrementally, yielding status messages and markdown content.

	    Stages, in order:

	    1. If the source is an http(s) URL, download it fully into memory.
	    2. Extract text and tables page by page with pdfplumber.
	    3. Rasterize the pages with pdf2image, OCR each image with pytesseract
	       and, when ``HF_TOKEN`` is set, upload it to the dataset repo.

	    A failure in stage 2 is reported and processing continues with stage 3;
	    a failed download ends the stream.

	    Args:
	        pdf_input_source_path_or_url: Local file path or URL string.

	    Yields:
	        Strings produced by ``yield_message`` with the types
	        ``markdown_replace``, ``status``, ``markdown_chunk``, ``image_md``,
	        ``error`` and ``final_status``.
	    """
	    try:
	        # Initial Markdown Title
	        yield yield_message("markdown_replace", {"content": "# Extracted PDF Content\n\n"})
	        time.sleep(0.01)  # Brief pause between messages (intended as a cooperative yield point)

	        # 1. Text and Table Extraction (Page by Page)
	        yield yield_message("status", {"message": "Opening PDF for text extraction..."})
	        time.sleep(0.01)

	        source_is_url = isinstance(pdf_input_source_path_or_url, str) and \
	            pdf_input_source_path_or_url.startswith(('http://', 'https://'))

	        pdf_handle_for_text = None
	        pdf_bytes_for_images = None  # Downloaded bytes, reused for image extraction

	        if source_is_url:
	            try:
	                response = requests.get(pdf_input_source_path_or_url, stream=True, timeout=60)
	                response.raise_for_status()
	                pdf_bytes_for_images = response.content  # Read all content for pdf2image
	                pdf_handle_for_text = io.BytesIO(pdf_bytes_for_images)  # pdfplumber reads from this
	                yield yield_message("status", {"message": f"PDF downloaded from URL ({len(pdf_bytes_for_images)/1024:.2f} KB)."})
	                time.sleep(0.01)
	            except requests.RequestException as e:
	                logger.error(f"URL fetch error for PDF processing: {str(e)}", exc_info=True)
	                yield yield_message("error", {"message": f"Error fetching PDF from URL: {str(e)}"})
	                return  # Stop generation
	        else:
	            # Local file path: pdfplumber opens it directly.
	            pdf_handle_for_text = pdf_input_source_path_or_url

	        total_text_pages = 0
	        try:
	            with pdfplumber.open(pdf_handle_for_text) as pdf:
	                total_text_pages = len(pdf.pages)
	                yield yield_message("status", {"message": f"Found {total_text_pages} page(s) for text extraction."})
	                time.sleep(0.01)

	                for page_number, page in enumerate(pdf.pages, start=1):
	                    yield yield_message("status", {"message": f"Extracting text from page {page_number}/{total_text_pages}..."})
	                    time.sleep(0.01)

	                    page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""

	                    table_chunks = []
	                    for table_idx, table_data in enumerate(page.extract_tables() or [], start=1):
	                        if not table_data:
	                            continue
	                        yield yield_message("status", {"message": f" Processing table {table_idx} on page {page_number}..."})
	                        # The blank line after the caption keeps the table from
	                        # being folded into the caption paragraph.
	                        table_chunks.append(
	                            f"Table (Page {page_number}):\n\n"
	                            + "\n".join(_table_to_markdown_lines(table_data))
	                            + "\n\n"
	                        )
	                    page_tables_md = "".join(table_chunks)

	                    formatted_page_text_md = format_page_text_to_markdown_chunk(page_text)

	                    yield yield_message("markdown_chunk", {"content": formatted_page_text_md})
	                    if page_tables_md:
	                        yield yield_message("markdown_chunk", {"content": page_tables_md})
	                    time.sleep(0.01)
	        except Exception as e:
	            logger.error(f"Error during PDF text/table extraction: {str(e)}", exc_info=True)
	            yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
	            # Deliberately fall through: image extraction may still succeed.

	        # 2. Image Extraction and OCR
	        if not check_poppler():
	            yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
	        else:
	            yield yield_message("status", {"message": "Starting image extraction..."})
	            yield yield_message("markdown_chunk", {"content": "## Extracted Images\n\n"})
	            if not HF_TOKEN:
	                yield yield_message("markdown_chunk", {"content": "Note: `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})

	            time.sleep(0.01)
	            extracted_pil_images = []
	            try:
	                if source_is_url and pdf_bytes_for_images:
	                    # Use the already downloaded bytes
	                    extracted_pil_images = convert_from_bytes(pdf_bytes_for_images, dpi=150)  # Lower DPI for speed/memory
	                elif not source_is_url:
	                    extracted_pil_images = convert_from_path(pdf_input_source_path_or_url, dpi=150)

	                image_count = len(extracted_pil_images)
	                yield yield_message("status", {"message": f"Found {image_count} image(s) in PDF (these are rasterized pages for now)."})
	                time.sleep(0.01)

	                # TODO: Implement more granular image extraction if pdf2image supports it,
	                # or if you integrate a library that can extract embedded images directly.
	                # For now, convert_from_path/bytes often gives full pages as images.

	                for image_number, img_pil in enumerate(extracted_pil_images, start=1):
	                    page_num_for_log = f"page_{image_number}"  # Assuming one image per page
	                    yield yield_message("status", {"message": f"Processing image {image_number}/{image_count} (OCR & Upload)..."})
	                    time.sleep(0.01)

	                    ocr_text = ""
	                    try:
	                        ocr_text = pytesseract.image_to_string(img_pil).strip()
	                        if ocr_text:
	                            yield yield_message("status", {"message": f" OCR successful for image {image_number}."})
	                    except Exception as ocr_e:
	                        logger.error(f"OCR error for image {image_number}: {str(ocr_e)}")
	                        # The failure text is shown in place of the OCR result.
	                        ocr_text = f"OCR failed: {str(ocr_e)}"

	                    image_md_chunk = ""
	                    if HF_TOKEN:
	                        image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image", page_num_for_log)
	                        if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
	                            image_md_chunk += f"![Image {image_number}]({image_url_or_error})\n"
	                            yield yield_message("status", {"message": f" Image {image_number} uploaded."})
	                        else:
	                            image_md_chunk += f"Image {image_number} (Upload Error): {str(image_url_or_error)}\n\n"
	                            yield yield_message("error", {"message": f"Failed to upload image {image_number}: {str(image_url_or_error)}"})
	                    else:
	                        image_md_chunk += f"Image {image_number} (not uploaded due to missing HF_TOKEN)\n"

	                    if ocr_text:
	                        image_md_chunk += f"Image {image_number} OCR Text:\n```\n{ocr_text}\n```\n\n"

	                    yield yield_message("image_md", {"content": image_md_chunk})
	                    time.sleep(0.01)

	            except Exception as e:
	                logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
	                yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})

	        yield yield_message("final_status", {"message": "All processing stages complete."})

	    except Exception as e:
	        logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
	        yield yield_message("error", {"message": f"Critical processing error: {str(e)}"})


	# --- Flask Routes ---

	@app.route('/', methods=['GET'])
	def index():
	    """Handle ``GET /`` by rendering the ``index.html`` template."""
	    page = render_template('index.html')
	    return page

	@app.route('/process-stream', methods=['POST'])
	def process_pdf_stream():
	    """Handle ``POST /process-stream``: convert a PDF and stream the progress.

	    The PDF comes either from the uploaded form file ``pdf_file`` or from
	    the form field ``pdf_url``; the upload wins when both are present.

	    Returns:
	        A streaming ``Response`` (``application/x-ndjson``) whose lines are
	        the messages built by ``yield_message``.
	    """
	    pdf_file = request.files.get('pdf_file')
	    pdf_url = request.form.get('pdf_url', '').strip()

	    def stream_processor():
	        temp_pdf_path = None     # Path of the saved upload, kept for cleanup
	        cleanup_message = None   # Outcome of the cleanup, sent after the finally block

	        try:
	            if pdf_file and pdf_file.filename:
	                if not pdf_file.filename.lower().endswith('.pdf'):
	                    yield yield_message("error", {"message": "Uploaded file is not a PDF."})
	                    return

	                filename = secure_filename(pdf_file.filename)
	                # Save to a temporary file (ensure UPLOAD_FOLDER is writable by app user)
	                os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
	                fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
	                os.close(fd)
	                pdf_file.save(temp_pdf_path)
	                logger.info(f"Uploaded PDF saved to temporary path: {temp_pdf_path}")
	                pdf_input_source = temp_pdf_path
	                yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
	                time.sleep(0.01)

	            elif pdf_url:
	                # NOTE(review): the form value is percent-decoded here before use;
	                # confirm the client really sends an encoded URL, otherwise this
	                # alters URLs that legitimately contain escapes such as %23.
	                unquoted_url = urllib.parse.unquote(pdf_url)
	                if not unquoted_url.startswith(('http://', 'https://')):
	                    yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
	                    return
	                # NOTE(review): the URL is user-supplied and fetched server-side
	                # with no host restriction; consider blocking internal addresses.

	                pdf_input_source = unquoted_url
	                yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
	                time.sleep(0.01)
	            else:
	                yield yield_message("error", {"message": "No PDF file uploaded and no PDF URL provided."})
	                return

	            # Delegate to the main generator; closing this generator closes it too.
	            yield from generate_pdf_conversion_stream(pdf_input_source)

	        except Exception as e:
	            logger.error(f"Error setting up stream or in initial validation: {str(e)}", exc_info=True)
	            yield yield_message("error", {"message": f"Setup error: {str(e)}"})
	        finally:
	            # Never yield inside this block: when the generator is closed early
	            # (for example the client disconnects) a yield here raises
	            # "RuntimeError: generator ignored GeneratorExit".
	            if temp_pdf_path and os.path.exists(temp_pdf_path):
	                try:
	                    os.remove(temp_pdf_path)
	                    logger.info(f"Cleaned up temporary PDF: {temp_pdf_path}")
	                    cleanup_message = yield_message("status", {"message": "Cleaned up temporary file."})
	                except OSError as ose:
	                    logger.error(f"Error removing temporary PDF {temp_pdf_path}: {ose}")
	                    cleanup_message = yield_message("error", {"message": f"Could not clean temp file: {ose}"})

	        # Reached only when the stream ran to completion, so yielding is safe.
	        if cleanup_message is not None:
	            yield cleanup_message

	    # Using stream_with_context for proper handling of request context within the generator
	    return Response(stream_with_context(stream_processor()), mimetype='application/x-ndjson')


	# --- Main Execution ---
	if __name__ == '__main__':
	    # Local/dev entry point; a production WSGI server imports `app` instead.
	    if not check_poppler():  # Check Poppler at startup for local dev
	        logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")
	    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

	    # The server binds to all interfaces, so the interactive debugger (which
	    # allows arbitrary code execution) must not be on by default.  Opt in
	    # explicitly with FLASK_DEBUG=1 when developing locally.
	    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}

	    # threaded=True lets the dev server handle other requests while one
	    # response is still streaming.
	    app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=debug_enabled, threaded=True)