Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,7 @@ import re
|
|
11 |
from datetime import datetime
|
12 |
import urllib.parse
|
13 |
import logging
|
|
|
14 |
|
15 |
# Set up logging
|
16 |
logging.basicConfig(level=logging.INFO)
|
@@ -21,6 +22,16 @@ HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
|
|
21 |
REPO_NAME = "pdf-images-extracted" # Hugging Face dataset repo
|
22 |
hf_api = HfApi()
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
def ensure_hf_dataset():
|
25 |
"""Create or get Hugging Face dataset repository."""
|
26 |
try:
|
@@ -76,6 +87,9 @@ def extract_text_from_pdf(pdf_input):
|
|
76 |
|
77 |
def extract_images_from_pdf(pdf_input):
|
78 |
"""Extract images from PDF (URL or file) and convert to PIL images."""
|
|
|
|
|
|
|
79 |
try:
|
80 |
if isinstance(pdf_input, str): # URL case
|
81 |
logger.info(f"Downloading PDF from URL: {pdf_input}")
|
@@ -88,8 +102,6 @@ def extract_images_from_pdf(pdf_input):
|
|
88 |
return images
|
89 |
except Exception as e:
|
90 |
logger.error(f"Error extracting images: {str(e)}")
|
91 |
-
if "poppler" in str(e).lower():
|
92 |
-
return "Error: Poppler not found. Ensure poppler-utils is installed and in PATH. In Hugging Face Spaces, poppler-utils should be pre-installed; contact support if this persists."
|
93 |
return f"Error extracting images: {str(e)}"
|
94 |
|
95 |
def format_to_markdown(text, images):
|
@@ -129,9 +141,13 @@ def format_to_markdown(text, images):
|
|
129 |
|
130 |
def process_pdf(pdf_input, pdf_url):
|
131 |
"""Main function to process PDF input (file or URL) and generate Markdown."""
|
|
|
132 |
if not HF_TOKEN:
|
133 |
return "Error: HF_TOKEN not set in Spaces Secrets."
|
134 |
|
|
|
|
|
|
|
135 |
# Decode URL-encoded string if provided
|
136 |
if pdf_url and pdf_url.strip():
|
137 |
pdf_url = urllib.parse.unquote(pdf_url)
|
@@ -166,7 +182,8 @@ iface = gr.Interface(
|
|
166 |
],
|
167 |
outputs=gr.Markdown(label="Markdown Output"),
|
168 |
title="PDF to Markdown Converter",
|
169 |
-
description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.
|
|
|
170 |
)
|
171 |
|
172 |
if __name__ == "__main__":
|
|
|
11 |
from datetime import datetime
|
12 |
import urllib.parse
|
13 |
import logging
|
14 |
+
import subprocess
|
15 |
|
16 |
# Set up logging
|
17 |
logging.basicConfig(level=logging.INFO)
|
|
|
22 |
REPO_NAME = "pdf-images-extracted" # Hugging Face dataset repo
|
23 |
hf_api = HfApi()
|
24 |
|
25 |
+
def check_poppler():
    """Check whether poppler-utils (``pdftoppm``) can be launched from PATH.

    Runs ``pdftoppm -v`` and logs the version banner it reports.

    Returns:
        bool: True if the executable could be started, False if it is
        missing from PATH or could not be run.
    """
    try:
        result = subprocess.run(
            ["pdftoppm", "-v"],
            capture_output=True,
            text=True,
            timeout=10,  # a version query should return immediately
        )
    except FileNotFoundError:
        logger.error("Poppler not found in PATH.")
        return False
    except (OSError, subprocess.TimeoutExpired) as e:
        # Present but not runnable (permissions, broken binary, hang):
        # report as unavailable instead of raising out of a boolean check.
        logger.error("Poppler check failed: %s", e)
        return False

    # pdftoppm prints its version banner on stderr, not stdout; fall back to
    # stdout in case a particular build behaves differently.
    version = (result.stderr or result.stdout).strip()
    logger.info(f"Poppler version: {version}")
    return True
|
34 |
+
|
35 |
def ensure_hf_dataset():
|
36 |
"""Create or get Hugging Face dataset repository."""
|
37 |
try:
|
|
|
87 |
|
88 |
def extract_images_from_pdf(pdf_input):
|
89 |
"""Extract images from PDF (URL or file) and convert to PIL images."""
|
90 |
+
if not check_poppler():
|
91 |
+
return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
|
92 |
+
|
93 |
try:
|
94 |
if isinstance(pdf_input, str): # URL case
|
95 |
logger.info(f"Downloading PDF from URL: {pdf_input}")
|
|
|
102 |
return images
|
103 |
except Exception as e:
|
104 |
logger.error(f"Error extracting images: {str(e)}")
|
|
|
|
|
105 |
return f"Error extracting images: {str(e)}"
|
106 |
|
107 |
def format_to_markdown(text, images):
|
|
|
141 |
|
142 |
def process_pdf(pdf_input, pdf_url):
|
143 |
"""Main function to process PDF input (file or URL) and generate Markdown."""
|
144 |
+
logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
|
145 |
if not HF_TOKEN:
|
146 |
return "Error: HF_TOKEN not set in Spaces Secrets."
|
147 |
|
148 |
+
# Log poppler status
|
149 |
+
logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
|
150 |
+
|
151 |
# Decode URL-encoded string if provided
|
152 |
if pdf_url and pdf_url.strip():
|
153 |
pdf_url = urllib.parse.unquote(pdf_url)
|
|
|
182 |
],
|
183 |
outputs=gr.Markdown(label="Markdown Output"),
|
184 |
title="PDF to Markdown Converter",
|
185 |
+
description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets. Uses Docker to ensure poppler-utils and tesseract-ocr are installed.",
|
186 |
+
flagging_dir="/tmp/flagged" # Set writable flagging directory
|
187 |
)
|
188 |
|
189 |
if __name__ == "__main__":
|