Spaces:

broadfield-dev
/

pdf2markdown

Sleeping

App Files Files Community

broadfield-dev committed on Jun 2

Commit

155ac2a

verified ·

1 Parent(s): 0e0f376

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -1

app.py CHANGED Viewed

@@ -10,6 +10,11 @@ from huggingface_hub import HfApi, create_repo
 import re
 from datetime import datetime
 import urllib.parse
 # Initialize Hugging Face API
 HF_TOKEN = os.getenv("HF_TOKEN")  # Set in Hugging Face Spaces Secrets
@@ -22,6 +27,7 @@ def ensure_hf_dataset():
         repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
         return repo_id
     except Exception as e:
         return f"Error creating dataset repo: {str(e)}"
 def upload_image_to_hf(image, filename):
@@ -46,6 +52,7 @@ def upload_image_to_hf(image, filename):
         os.remove(temp_path)
         return file_url
     except Exception as e:
         return f"Error uploading image: {str(e)}"
 def extract_text_from_pdf(pdf_input):
@@ -64,19 +71,25 @@ def extract_text_from_pdf(pdf_input):
             text += page_text + "\n\n"
         return text
     except Exception as e:
         return f"Error extracting text: {str(e)}"
 def extract_images_from_pdf(pdf_input):
     """Extract images from PDF (URL or file) and convert to PIL images."""
     try:
         if isinstance(pdf_input, str):  # URL case
             response = requests.get(pdf_input, stream=True)
             response.raise_for_status()
             images = convert_from_bytes(response.content)
         else:  # File upload case
             images = convert_from_path(pdf_input.name)
         return images
     except Exception as e:
         return f"Error extracting images: {str(e)}"
 def format_to_markdown(text, images):
@@ -122,11 +135,13 @@ def process_pdf(pdf_input, pdf_url):
     # Decode URL-encoded string if provided
     if pdf_url and pdf_url.strip():
         pdf_url = urllib.parse.unquote(pdf_url)
         try:
             response = requests.head(pdf_url, allow_redirects=True)
             response.raise_for_status()
             pdf_input = pdf_url
         except requests.RequestException as e:
             return f"Error accessing URL: {str(e)}"
     elif not pdf_input:
         return "Error: Please provide a PDF file or URL."
@@ -151,7 +166,7 @@ iface = gr.Interface(
     ],
     outputs=gr.Markdown(label="Markdown Output"),
     title="PDF to Markdown Converter",
-    description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
 )
 if __name__ == "__main__":

 import re
 from datetime import datetime
 import urllib.parse
+import logging
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # Initialize Hugging Face API
 HF_TOKEN = os.getenv("HF_TOKEN")  # Set in Hugging Face Spaces Secrets
         repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
         return repo_id
     except Exception as e:
+        logger.error(f"Error creating dataset repo: {str(e)}")
         return f"Error creating dataset repo: {str(e)}"
 def upload_image_to_hf(image, filename):
         os.remove(temp_path)
         return file_url
     except Exception as e:
+        logger.error(f"Error uploading image: {str(e)}")
         return f"Error uploading image: {str(e)}"
 def extract_text_from_pdf(pdf_input):
             text += page_text + "\n\n"
         return text
     except Exception as e:
+        logger.error(f"Error extracting text: {str(e)}")
         return f"Error extracting text: {str(e)}"
 def extract_images_from_pdf(pdf_input):
     """Extract images from PDF (URL or file) and convert to PIL images."""
     try:
         if isinstance(pdf_input, str):  # URL case
+            logger.info(f"Downloading PDF from URL: {pdf_input}")
             response = requests.get(pdf_input, stream=True)
             response.raise_for_status()
             images = convert_from_bytes(response.content)
         else:  # File upload case
+            logger.info(f"Processing uploaded PDF: {pdf_input.name}")
             images = convert_from_path(pdf_input.name)
         return images
     except Exception as e:
+        logger.error(f"Error extracting images: {str(e)}")
+        if "poppler" in str(e).lower():
+            return "Error: Poppler not found. Ensure poppler-utils is installed and in PATH. In Hugging Face Spaces, poppler-utils should be pre-installed; contact support if this persists."
         return f"Error extracting images: {str(e)}"
 def format_to_markdown(text, images):
     # Decode URL-encoded string if provided
     if pdf_url and pdf_url.strip():
         pdf_url = urllib.parse.unquote(pdf_url)
+        logger.info(f"Decoded URL: {pdf_url}")
         try:
             response = requests.head(pdf_url, allow_redirects=True)
             response.raise_for_status()
             pdf_input = pdf_url
         except requests.RequestException as e:
+            logger.error(f"Error accessing URL: {str(e)}")
             return f"Error accessing URL: {str(e)}"
     elif not pdf_input:
         return "Error: Please provide a PDF file or URL."
     ],
     outputs=gr.Markdown(label="Markdown Output"),
     title="PDF to Markdown Converter",
+    description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets. Note: Requires poppler-utils and tesseract-ocr, which are pre-installed in Hugging Face Spaces.",
 )
 if __name__ == "__main__":