Spaces:

broadfield-dev
/

pdf2markdown

Sleeping

App Files Files Community

broadfield-dev committed on Jun 2

Commit

dc24da7

verified ·

1 Parent(s): dbea75b

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -3

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ import re
 from datetime import datetime
 import urllib.parse
 import logging
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -21,6 +22,16 @@ HF_TOKEN = os.getenv("HF_TOKEN")  # Set in Hugging Face Spaces Secrets
 REPO_NAME = "pdf-images-extracted"  # Hugging Face dataset repo
 hf_api = HfApi()
 def ensure_hf_dataset():
     """Create or get Hugging Face dataset repository."""
     try:
@@ -76,6 +87,9 @@ def extract_text_from_pdf(pdf_input):
 def extract_images_from_pdf(pdf_input):
     """Extract images from PDF (URL or file) and convert to PIL images."""
     try:
         if isinstance(pdf_input, str):  # URL case
             logger.info(f"Downloading PDF from URL: {pdf_input}")
@@ -88,8 +102,6 @@ def extract_images_from_pdf(pdf_input):
         return images
     except Exception as e:
         logger.error(f"Error extracting images: {str(e)}")
-        if "poppler" in str(e).lower():
-            return "Error: Poppler not found. Ensure poppler-utils is installed and in PATH. In Hugging Face Spaces, poppler-utils should be pre-installed; contact support if this persists."
         return f"Error extracting images: {str(e)}"
 def format_to_markdown(text, images):
@@ -129,9 +141,13 @@ def format_to_markdown(text, images):
 def process_pdf(pdf_input, pdf_url):
     """Main function to process PDF input (file or URL) and generate Markdown."""
     if not HF_TOKEN:
         return "Error: HF_TOKEN not set in Spaces Secrets."
     # Decode URL-encoded string if provided
     if pdf_url and pdf_url.strip():
         pdf_url = urllib.parse.unquote(pdf_url)
@@ -166,7 +182,8 @@ iface = gr.Interface(
     ],
     outputs=gr.Markdown(label="Markdown Output"),
     title="PDF to Markdown Converter",
-    description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets. Note: Requires poppler-utils and tesseract-ocr, which are pre-installed in Hugging Face Spaces.",
 )
 if __name__ == "__main__":

 from datetime import datetime
 import urllib.parse
 import logging
+import subprocess
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 REPO_NAME = "pdf-images-extracted"  # Hugging Face dataset repo
 hf_api = HfApi()
+def check_poppler():
+    """Check if poppler-utils is installed."""
+    try:
+        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
+        logger.info(f"Poppler version: {result.stdout}")
+        return True
+    except FileNotFoundError:
+        logger.error("Poppler not found in PATH.")
+        return False
 def ensure_hf_dataset():
     """Create or get Hugging Face dataset repository."""
     try:
 def extract_images_from_pdf(pdf_input):
     """Extract images from PDF (URL or file) and convert to PIL images."""
+    if not check_poppler():
+        return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
     try:
         if isinstance(pdf_input, str):  # URL case
             logger.info(f"Downloading PDF from URL: {pdf_input}")
         return images
     except Exception as e:
         logger.error(f"Error extracting images: {str(e)}")
         return f"Error extracting images: {str(e)}"
 def format_to_markdown(text, images):
 def process_pdf(pdf_input, pdf_url):
     """Main function to process PDF input (file or URL) and generate Markdown."""
+    logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
     if not HF_TOKEN:
         return "Error: HF_TOKEN not set in Spaces Secrets."
+    # Log poppler status
+    logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
     # Decode URL-encoded string if provided
     if pdf_url and pdf_url.strip():
         pdf_url = urllib.parse.unquote(pdf_url)
     ],
     outputs=gr.Markdown(label="Markdown Output"),
     title="PDF to Markdown Converter",
+    description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets. Uses Docker to ensure poppler-utils and tesseract-ocr are installed.",
+    flagging_dir="/tmp/flagged"  # Set writable flagging directory
 )
 if __name__ == "__main__":