broadfield-dev committed on
Commit
155ac2a
·
verified ·
1 Parent(s): 0e0f376

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -1
app.py CHANGED
@@ -10,6 +10,11 @@ from huggingface_hub import HfApi, create_repo
10
  import re
11
  from datetime import datetime
12
  import urllib.parse
 
 
 
 
 
13
 
14
  # Initialize Hugging Face API
15
  HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
@@ -22,6 +27,7 @@ def ensure_hf_dataset():
22
  repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
23
  return repo_id
24
  except Exception as e:
 
25
  return f"Error creating dataset repo: {str(e)}"
26
 
27
  def upload_image_to_hf(image, filename):
@@ -46,6 +52,7 @@ def upload_image_to_hf(image, filename):
46
  os.remove(temp_path)
47
  return file_url
48
  except Exception as e:
 
49
  return f"Error uploading image: {str(e)}"
50
 
51
  def extract_text_from_pdf(pdf_input):
@@ -64,19 +71,25 @@ def extract_text_from_pdf(pdf_input):
64
  text += page_text + "\n\n"
65
  return text
66
  except Exception as e:
 
67
  return f"Error extracting text: {str(e)}"
68
 
69
  def extract_images_from_pdf(pdf_input):
70
  """Extract images from PDF (URL or file) and convert to PIL images."""
71
  try:
72
  if isinstance(pdf_input, str): # URL case
 
73
  response = requests.get(pdf_input, stream=True)
74
  response.raise_for_status()
75
  images = convert_from_bytes(response.content)
76
  else: # File upload case
 
77
  images = convert_from_path(pdf_input.name)
78
  return images
79
  except Exception as e:
 
 
 
80
  return f"Error extracting images: {str(e)}"
81
 
82
  def format_to_markdown(text, images):
@@ -122,11 +135,13 @@ def process_pdf(pdf_input, pdf_url):
122
  # Decode URL-encoded string if provided
123
  if pdf_url and pdf_url.strip():
124
  pdf_url = urllib.parse.unquote(pdf_url)
 
125
  try:
126
  response = requests.head(pdf_url, allow_redirects=True)
127
  response.raise_for_status()
128
  pdf_input = pdf_url
129
  except requests.RequestException as e:
 
130
  return f"Error accessing URL: {str(e)}"
131
  elif not pdf_input:
132
  return "Error: Please provide a PDF file or URL."
@@ -151,7 +166,7 @@ iface = gr.Interface(
151
  ],
152
  outputs=gr.Markdown(label="Markdown Output"),
153
  title="PDF to Markdown Converter",
154
- description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
155
  )
156
 
157
  if __name__ == "__main__":
 
10
  import re
11
  from datetime import datetime
12
  import urllib.parse
13
+ import logging
14
+
15
+ # Set up logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
 
19
  # Initialize Hugging Face API
20
  HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
 
27
  repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
28
  return repo_id
29
  except Exception as e:
30
+ logger.error(f"Error creating dataset repo: {str(e)}")
31
  return f"Error creating dataset repo: {str(e)}"
32
 
33
  def upload_image_to_hf(image, filename):
 
52
  os.remove(temp_path)
53
  return file_url
54
  except Exception as e:
55
+ logger.error(f"Error uploading image: {str(e)}")
56
  return f"Error uploading image: {str(e)}"
57
 
58
  def extract_text_from_pdf(pdf_input):
 
71
  text += page_text + "\n\n"
72
  return text
73
  except Exception as e:
74
+ logger.error(f"Error extracting text: {str(e)}")
75
  return f"Error extracting text: {str(e)}"
76
 
77
  def extract_images_from_pdf(pdf_input):
78
  """Extract images from PDF (URL or file) and convert to PIL images."""
79
  try:
80
  if isinstance(pdf_input, str): # URL case
81
+ logger.info(f"Downloading PDF from URL: {pdf_input}")
82
  response = requests.get(pdf_input, stream=True)
83
  response.raise_for_status()
84
  images = convert_from_bytes(response.content)
85
  else: # File upload case
86
+ logger.info(f"Processing uploaded PDF: {pdf_input.name}")
87
  images = convert_from_path(pdf_input.name)
88
  return images
89
  except Exception as e:
90
+ logger.error(f"Error extracting images: {str(e)}")
91
+ if "poppler" in str(e).lower():
92
+ return "Error: Poppler not found. Ensure poppler-utils is installed and in PATH. In Hugging Face Spaces, poppler-utils should be pre-installed; contact support if this persists."
93
  return f"Error extracting images: {str(e)}"
94
 
95
  def format_to_markdown(text, images):
 
135
  # Decode URL-encoded string if provided
136
  if pdf_url and pdf_url.strip():
137
  pdf_url = urllib.parse.unquote(pdf_url)
138
+ logger.info(f"Decoded URL: {pdf_url}")
139
  try:
140
  response = requests.head(pdf_url, allow_redirects=True)
141
  response.raise_for_status()
142
  pdf_input = pdf_url
143
  except requests.RequestException as e:
144
+ logger.error(f"Error accessing URL: {str(e)}")
145
  return f"Error accessing URL: {str(e)}"
146
  elif not pdf_input:
147
  return "Error: Please provide a PDF file or URL."
 
166
  ],
167
  outputs=gr.Markdown(label="Markdown Output"),
168
  title="PDF to Markdown Converter",
169
+ description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets. Note: Requires poppler-utils and tesseract-ocr, which are pre-installed in Hugging Face Spaces.",
170
  )
171
 
172
  if __name__ == "__main__":