broadfield-dev committed on
Commit
dc24da7
·
verified ·
1 Parent(s): dbea75b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -3
app.py CHANGED
@@ -11,6 +11,7 @@ import re
11
  from datetime import datetime
12
  import urllib.parse
13
  import logging
 
14
 
15
  # Set up logging
16
  logging.basicConfig(level=logging.INFO)
@@ -21,6 +22,16 @@ HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
21
  REPO_NAME = "pdf-images-extracted" # Hugging Face dataset repo
22
  hf_api = HfApi()
23
 
 
 
 
 
 
 
 
 
 
 
24
  def ensure_hf_dataset():
25
  """Create or get Hugging Face dataset repository."""
26
  try:
@@ -76,6 +87,9 @@ def extract_text_from_pdf(pdf_input):
76
 
77
  def extract_images_from_pdf(pdf_input):
78
  """Extract images from PDF (URL or file) and convert to PIL images."""
 
 
 
79
  try:
80
  if isinstance(pdf_input, str): # URL case
81
  logger.info(f"Downloading PDF from URL: {pdf_input}")
@@ -88,8 +102,6 @@ def extract_images_from_pdf(pdf_input):
88
  return images
89
  except Exception as e:
90
  logger.error(f"Error extracting images: {str(e)}")
91
- if "poppler" in str(e).lower():
92
- return "Error: Poppler not found. Ensure poppler-utils is installed and in PATH. In Hugging Face Spaces, poppler-utils should be pre-installed; contact support if this persists."
93
  return f"Error extracting images: {str(e)}"
94
 
95
  def format_to_markdown(text, images):
@@ -129,9 +141,13 @@ def format_to_markdown(text, images):
129
 
130
  def process_pdf(pdf_input, pdf_url):
131
  """Main function to process PDF input (file or URL) and generate Markdown."""
 
132
  if not HF_TOKEN:
133
  return "Error: HF_TOKEN not set in Spaces Secrets."
134
 
 
 
 
135
  # Decode URL-encoded string if provided
136
  if pdf_url and pdf_url.strip():
137
  pdf_url = urllib.parse.unquote(pdf_url)
@@ -166,7 +182,8 @@ iface = gr.Interface(
166
  ],
167
  outputs=gr.Markdown(label="Markdown Output"),
168
  title="PDF to Markdown Converter",
169
- description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets. Note: Requires poppler-utils and tesseract-ocr, which are pre-installed in Hugging Face Spaces.",
 
170
  )
171
 
172
  if __name__ == "__main__":
 
11
  from datetime import datetime
12
  import urllib.parse
13
  import logging
14
+ import subprocess
15
 
16
  # Set up logging
17
  logging.basicConfig(level=logging.INFO)
 
22
  REPO_NAME = "pdf-images-extracted" # Hugging Face dataset repo
23
  hf_api = HfApi()
24
 
25
def check_poppler():
    """Check if poppler-utils is installed and log its version.

    Probes for the ``pdftoppm`` executable by running ``pdftoppm -v``.

    Returns:
        bool: True if ``pdftoppm`` could be launched, False if it is missing
        from PATH or could not be executed.
    """
    try:
        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
    except FileNotFoundError:
        logger.error("Poppler not found in PATH.")
        return False
    except OSError as exc:
        # e.g. the binary exists but is not executable; it is unusable either way.
        logger.error("Poppler could not be executed: %s", exc)
        return False

    # poppler tools print the `-v` banner to stderr, so stdout alone is empty;
    # fall back to stderr so the version actually reaches the log.
    version = (result.stdout or result.stderr).strip()
    logger.info(f"Poppler version: {version}")
    return True
34
+
35
  def ensure_hf_dataset():
36
  """Create or get Hugging Face dataset repository."""
37
  try:
 
87
 
88
  def extract_images_from_pdf(pdf_input):
89
  """Extract images from PDF (URL or file) and convert to PIL images."""
90
+ if not check_poppler():
91
+ return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
92
+
93
  try:
94
  if isinstance(pdf_input, str): # URL case
95
  logger.info(f"Downloading PDF from URL: {pdf_input}")
 
102
  return images
103
  except Exception as e:
104
  logger.error(f"Error extracting images: {str(e)}")
 
 
105
  return f"Error extracting images: {str(e)}"
106
 
107
  def format_to_markdown(text, images):
 
141
 
142
  def process_pdf(pdf_input, pdf_url):
143
  """Main function to process PDF input (file or URL) and generate Markdown."""
144
+ logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
145
  if not HF_TOKEN:
146
  return "Error: HF_TOKEN not set in Spaces Secrets."
147
 
148
+ # Log poppler status
149
+ logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
150
+
151
  # Decode URL-encoded string if provided
152
  if pdf_url and pdf_url.strip():
153
  pdf_url = urllib.parse.unquote(pdf_url)
 
182
  ],
183
  outputs=gr.Markdown(label="Markdown Output"),
184
  title="PDF to Markdown Converter",
185
+ description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets. Uses Docker to ensure poppler-utils and tesseract-ocr are installed.",
186
+ flagging_dir="/tmp/flagged" # Set writable flagging directory
187
  )
188
 
189
  if __name__ == "__main__":