broadfield-dev committed on
Commit
ae3cd0d
·
verified ·
1 Parent(s): c062555

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +249 -107
app.py CHANGED
@@ -19,177 +19,319 @@ logger = logging.getLogger(__name__)
19
 
20
  # Initialize Hugging Face API
21
  HF_TOKEN = os.getenv("HF_TOKEN")
22
- REPO_NAME = "pdf-images-extracted"
23
  hf_api = HfApi()
24
 
25
  def check_poppler():
26
  try:
27
  result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
28
- logger.info(f"Poppler version: {result.stdout}")
 
 
 
 
 
 
 
 
29
  return True
30
  except FileNotFoundError:
31
- logger.error("Poppler not found in PATH.")
 
 
 
32
  return False
33
 
34
  def ensure_hf_dataset():
35
  try:
36
  if not HF_TOKEN:
37
- raise ValueError("HF_TOKEN is not set")
38
- repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
39
- logger.info(f"Dataset repo: {repo_id}")
40
- return repo_id
 
 
 
 
 
 
41
  except Exception as e:
42
- logger.error(f"Dataset error: {str(e)}")
43
- return f"Error: Failed to access dataset: {str(e)}"
 
 
 
 
 
 
 
 
44
 
45
- def upload_image_to_hf(image, filename):
46
- repo_id = ensure_hf_dataset()
47
- if isinstance(repo_id, str) and repo_id.startswith("Error"):
48
- return repo_id
49
  try:
50
- temp_path = f"/tmp/temp_{filename}.png"
 
 
 
 
51
  image.save(temp_path, format="PNG")
 
 
 
52
  file_url = hf_api.upload_file(
53
  path_or_fileobj=temp_path,
54
- path_in_repo=f"images/{filename}.png",
55
  repo_id=repo_id,
56
  repo_type="dataset",
57
- token=HF_TOKEN
58
  )
59
  os.remove(temp_path)
60
- logger.info(f"Uploaded image: {file_url}")
61
  return file_url
62
  except Exception as e:
63
- logger.error(f"Image upload error: {str(e)}")
64
- return f"Error uploading image: {str(e)}"
 
 
 
 
 
 
65
 
66
- def extract_text_from_pdf(pdf_input):
67
  try:
68
- if isinstance(pdf_input, str):
69
- response = requests.get(pdf_input, stream=True, timeout=10)
 
70
  response.raise_for_status()
71
- pdf_file = io.BytesIO(response.content)
72
- else:
73
- pdf_file = pdf_input
74
- with pdfplumber.open(pdf_file) as pdf:
75
- text = ""
76
- for page in pdf.pages:
77
- page_text = page.extract_text(layout=True) or ""
78
- text += page_text + "\n\n"
 
 
 
 
 
 
79
  tables = page.extract_tables()
80
- for table in tables:
81
- text += "**Table:**\n" + "\n".join([" | ".join(str(cell) for cell in row) for row in table]) + "\n\n"
82
- return text
 
 
 
 
 
 
 
83
  except Exception as e:
84
- logger.error(f"Text extraction error: {str(e)}")
85
  return f"Error extracting text: {str(e)}"
86
 
87
- def extract_images_from_pdf(pdf_input):
88
  if not check_poppler():
89
- return "Error: poppler-utils not found."
 
90
  try:
91
- if isinstance(pdf_input, str):
92
- response = requests.get(pdf_input, stream=True, timeout=10)
 
 
93
  response.raise_for_status()
94
- images = convert_from_bytes(response.content)
95
- else:
96
- images = convert_from_path(pdf_input.name)
 
 
 
 
 
 
 
 
97
  return images
98
  except Exception as e:
99
- logger.error(f"Image extraction error: {str(e)}")
100
  return f"Error extracting images: {str(e)}"
101
 
102
- def format_to_markdown(text, images):
103
  markdown_output = "# Extracted PDF Content\n\n"
104
- text = re.sub(r'\n\s*\n+', '\n\n', text.strip())
105
- lines = text.split("\n")
106
- for line in lines:
107
- if line.isupper() and len(line) > 5:
108
- markdown_output += f"## {line}\n\n"
109
- elif re.match(r'^\s*[\d\-*+]\.\s+', line):
110
- markdown_output += f"- {line.strip()[2:]}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  else:
112
- markdown_output += f"{line}\n\n"
113
- if isinstance(images, list) and images:
 
 
 
 
 
 
114
  markdown_output += "## Extracted Images\n\n"
115
- for i, image in enumerate(images):
116
- ocr_text = pytesseract.image_to_string(image).strip()
117
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
118
- filename = f"image_{i}_{timestamp}"
119
- image_url = upload_image_to_hf(image, filename)
120
- if not image_url.startswith("Error"):
121
- markdown_output += f"![Image {i+1}]({image_url})\n"
122
- if ocr_text:
 
 
 
 
 
 
 
123
  markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
124
- else:
125
- markdown_output += f"**Image {i+1} Error:** {image_url}\n\n"
126
- return markdown_output
 
 
 
 
 
 
 
 
 
 
127
 
128
- def process_pdf(pdf_input, pdf_url):
129
- status = "Starting PDF processing..."
130
- logger.info(status)
131
  if not HF_TOKEN:
132
- status = "Error: HF_TOKEN not set."
133
- logger.error(status)
134
- return status, status
135
- if pdf_url and pdf_url.strip():
136
- pdf_url = urllib.parse.unquote(pdf_url)
137
- status = f"Downloading PDF from URL: {pdf_url}"
138
- logger.info(status)
 
 
 
 
 
 
139
  try:
140
- response = requests.head(pdf_url, allow_redirects=True, timeout=5)
 
141
  response.raise_for_status()
142
- pdf_input = pdf_url
 
 
 
 
 
 
143
  except requests.RequestException as e:
144
- status = f"Error accessing URL: {str(e)}"
145
- logger.error(status)
146
- return status, status
147
- elif not pdf_input:
148
- status = "Error: No PDF provided."
149
- logger.error(status)
150
- return status, status
151
- status = "Extracting text..."
152
- logger.info(status)
153
- text = extract_text_from_pdf(pdf_input)
154
- if isinstance(text, str) and text.startswith("Error"):
155
- status = "Text extraction failed."
156
- logger.error(status)
157
- return text, status
158
- status = "Extracting images..."
159
- logger.info(status)
160
- images = extract_images_from_pdf(pdf_input)
161
- if isinstance(images, str) and images.startswith("Error"):
162
- status = "Image extraction failed."
163
- logger.error(status)
164
- return images, status
165
- status = "Formatting output..."
166
- logger.info(status)
167
- markdown_output = format_to_markdown(text, images)
168
- status = "Processing complete."
169
- logger.info(status)
170
- return markdown_output, status
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  # Gradio Interface
173
  iface = gr.Interface(
174
  fn=process_pdf,
175
  inputs=[
176
  gr.File(label="Upload PDF File", file_types=[".pdf"]),
177
- gr.Textbox(label="PDF URL", placeholder="Enter PDF URL (e.g., https://example.com/file.pdf)"),
178
  ],
179
  outputs=[
180
  gr.Markdown(label="Markdown Output"),
181
  gr.Textbox(label="Processing Status", interactive=False),
182
  ],
183
  title="PDF to Markdown Converter",
184
- description="Convert a PDF file or URL to Markdown. Extracts text, images, and tables, with images uploaded to a Hugging Face dataset. Supports URL-encoded strings. Requires HF_TOKEN in Spaces Secrets.",
185
- allow_flagging="never"
 
 
 
 
186
  )
187
 
188
  if __name__ == "__main__":
189
  logger.info("Starting Gradio app...")
190
  try:
191
- iface.launch(server_name="0.0.0.0", server_port=7860, share=True)
 
 
192
  logger.info("Gradio app started successfully.")
193
  except Exception as e:
194
- logger.error(f"Failed to start Gradio app: {str(e)}")
 
195
  raise
 
19
 
20
# Initialize Hugging Face API
# HF_TOKEN is read from the environment; the functions below check it before
# touching the Hub.
HF_TOKEN = os.getenv("HF_TOKEN")
# Dataset repo that receives extracted page images. It can be overridden with
# the HF_REPO_NAME environment variable; an unset or empty value falls back to
# the historical default, so existing deployments behave as before.
REPO_NAME = os.getenv("HF_REPO_NAME") or "pdf-images-extracted"
hf_api = HfApi()
24
 
25
def check_poppler():
    """Report whether Poppler's ``pdftoppm`` executable can be run.

    Runs ``pdftoppm -v`` and logs the first line of whatever it printed
    (stderr is preferred, stdout is the fallback). The exit status is not
    inspected; being able to start the program is what is checked.

    Returns:
        bool: True when the command ran to completion within the time
        limit. False when the executable is missing or any other error
        (including a timeout) occurs while running it.
    """
    try:
        # Bounded wait: a wedged binary must not stall the caller forever.
        # subprocess.TimeoutExpired is handled by the generic branch below.
        result = subprocess.run(
            ["pdftoppm", "-v"],
            capture_output=True,
            text=True,
            timeout=15,
        )
    except FileNotFoundError:
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in PATH.")
        return False
    except Exception as e:  # Any other failure while launching or waiting
        logger.error(f"An unexpected error occurred during Poppler check: {str(e)}")
        return False

    version_info_log = (result.stderr or result.stdout or "").strip()
    if version_info_log:
        # Only the first line is logged to keep the log compact.
        logger.info(f"Poppler version check: {version_info_log.splitlines()[0]}")
    else:
        logger.info("Poppler 'pdftoppm -v' ran, but no version output on stdout/stderr. Poppler is likely present.")
    return True
44
 
45
def ensure_hf_dataset():
    """Make sure the dataset repo named by ``REPO_NAME`` exists.

    Returns:
        The ``repo_id`` attribute of the object returned by ``create_repo``
        on success, otherwise a string starting with ``"Error"`` that
        describes what went wrong. Callers tell the two apart by that
        prefix.
    """
    if not HF_TOKEN:
        # Safeguard: without a token no dataset operation can succeed.
        logger.error("HF_TOKEN is not set. Cannot ensure Hugging Face dataset.")
        return "Error: HF_TOKEN is not set. Please configure it in Space secrets."

    try:
        # exist_ok=True makes this call safe to repeat for an existing repo.
        created = create_repo(
            repo_id=REPO_NAME,
            token=HF_TOKEN,
            repo_type="dataset",
            exist_ok=True,
        )
        dataset_id = created.repo_id
        logger.info(f"Dataset repo ensured: {dataset_id}")
        return dataset_id
    except Exception as exc:
        logger.error(f"Hugging Face dataset error: {str(exc)}")
        return f"Error: Failed to access or create dataset '{REPO_NAME}': {str(exc)}"
61
+
62
def upload_image_to_hf(image, filename_base):
    """Save ``image`` as a PNG and upload it to the dataset repo.

    Args:
        image: Object exposing ``save(path, format="PNG")``
            (presumably a PIL image - confirm against callers).
        filename_base: File name without extension. A microsecond timestamp
            and ``.png`` are appended so repeated uploads do not collide.

    Returns:
        Whatever ``hf_api.upload_file`` returns on success, otherwise a
        string starting with ``"Error"`` (either forwarded from
        ``ensure_hf_dataset`` or describing the upload failure).
    """
    # Function-scope import: the file's import block is not visible here.
    import tempfile

    repo_id_or_error = ensure_hf_dataset()
    if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
        return repo_id_or_error  # Forward the error message unchanged
    repo_id = repo_id_or_error

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    repo_filename = f"images/{filename_base}_{timestamp}.png"
    temp_path = os.path.join(tempfile.gettempdir(), f"{filename_base}_{timestamp}.png")

    try:
        image.save(temp_path, format="PNG")
        logger.info(f"Attempting to upload {temp_path} to {repo_id}/{repo_filename}")
        file_url = hf_api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=repo_filename,
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        logger.info(f"Successfully uploaded image: {file_url}")
        return file_url
    except Exception as e:
        logger.error(f"Image upload error for {filename_base}: {str(e)}")
        return f"Error uploading image {filename_base}: {str(e)}"
    finally:
        # Cleanup runs on both paths. A failure to delete the temp file is
        # logged but must not turn a successful upload into an error.
        if os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError as ose:
                logger.error(f"Error removing temp file {temp_path}: {ose}")
99
 
100
def _table_to_markdown(table_data):
    """Render one extracted table (a list of rows of cells) as a Markdown table."""

    def clean(cell):
        # Newlines and pipes inside a cell would break the row structure.
        if cell is None:
            return ""
        return str(cell).replace("\n", " ").replace("|", "\\|")

    rows = [[clean(cell) for cell in row] for row in table_data if row]
    if not rows:
        return ""
    # Pad ragged rows so every line has the same number of cells.
    width = max(len(row) for row in rows)
    lines = ["| " + " | ".join(row + [""] * (width - len(row))) + " |" for row in rows]
    # The first row is treated as the header; the separator goes right after it.
    lines.insert(1, "| " + " | ".join(["---"] * width) + " |")
    return "\n".join(lines)


def extract_text_from_pdf(pdf_input_source):
    """Extract page text and tables from a PDF.

    Args:
        pdf_input_source: An ``http://``/``https://`` URL string, which is
            downloaded first, or anything ``pdfplumber.open`` accepts
            directly (a local path string or an open file object).

    Returns:
        str: The text of every page, each followed by a blank line and by
        its tables rendered as Markdown under a ``**Table:**`` label. On
        any failure a string starting with ``"Error extracting text:"`` is
        returned instead of raising.
    """
    try:
        is_url = isinstance(pdf_input_source, str) and pdf_input_source.lower().startswith(
            ("http://", "https://")
        )
        if is_url:
            logger.info(f"Fetching PDF from URL for text extraction: {pdf_input_source}")
            # The whole body is needed anyway, so the response is not streamed.
            response = requests.get(pdf_input_source, timeout=20)
            response.raise_for_status()
            pdf_file_like_object = io.BytesIO(response.content)
            logger.info("PDF downloaded successfully from URL.")
        else:
            if isinstance(pdf_input_source, str):
                source_name = pdf_input_source
            else:
                source_name = getattr(pdf_input_source, 'name', 'N/A')
            logger.info(f"Processing uploaded PDF file for text extraction: {source_name}")
            pdf_file_like_object = pdf_input_source

        chunks = []
        with pdfplumber.open(pdf_file_like_object) as pdf:
            for i, page in enumerate(pdf.pages):
                logger.debug(f"Extracting text from page {i+1}")
                page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
                chunks.append(page_text + "\n\n")  # Blank line separates pages

                logger.debug(f"Extracting tables from page {i+1}")
                for table_idx, table_data in enumerate(page.extract_tables() or []):
                    logger.debug(f"Processing table {table_idx+1} on page {i+1}")
                    if not table_data:
                        continue
                    table_md = _table_to_markdown(table_data)
                    if table_md:
                        chunks.append(f"**Table:**\n\n{table_md}\n\n")

        logger.info("Text and table extraction successful.")
        return "".join(chunks)
    except Exception as e:
        logger.error(f"Text extraction error: {str(e)}", exc_info=True)
        return f"Error extracting text: {str(e)}"
134
 
135
def extract_images_from_pdf(pdf_input_source):
    """Render a PDF to images at 200 dpi.

    Args:
        pdf_input_source: An ``http://``/``https://`` URL string (downloaded
            and passed to ``convert_from_bytes``), a local path string, or
            an object whose ``name`` attribute holds a local path (both
            passed to ``convert_from_path``).

    Returns:
        The value returned by the conversion call on success, otherwise a
        string starting with ``"Error"``. Nothing is raised to the caller.
    """
    if not check_poppler():
        return "Error: poppler-utils not found or not working correctly. Image extraction depends on it."

    try:
        is_url = isinstance(pdf_input_source, str) and pdf_input_source.lower().startswith(
            ("http://", "https://")
        )
        if is_url:
            logger.info(f"Fetching PDF from URL for image extraction: {pdf_input_source}")
            # The whole body is needed anyway, so the response is not streamed.
            response = requests.get(pdf_input_source, timeout=20)
            response.raise_for_status()
            logger.info("PDF downloaded successfully, converting to images.")
            images = convert_from_bytes(response.content, dpi=200)
        else:
            # A plain string that is not a URL is taken to be a local path;
            # otherwise the path is read from the object's ``name``.
            if isinstance(pdf_input_source, str):
                file_path = pdf_input_source
            else:
                file_path = getattr(pdf_input_source, 'name', None)
            if not file_path:
                logger.error("Uploaded PDF file has no name attribute, cannot process for images.")
                return "Error: Could not get path from uploaded PDF file for image extraction."
            logger.info(f"Processing uploaded PDF file for image extraction: {file_path}")
            images = convert_from_path(file_path, dpi=200)

        logger.info(f"Successfully extracted {len(images)} image(s) from PDF.")
        return images
    except Exception as e:
        logger.error(f"Image extraction error: {str(e)}", exc_info=True)
        return f"Error extracting images: {str(e)}"
160
 
161
def format_to_markdown(text_content, images_list):
    """Turn extracted text and page images into one Markdown document.

    Text handling, line by line:
      * list markers (``1.``, ``*``, ``-``, ``+``) become ``- item`` lines;
      * lines that start with ``|`` or contain `` | `` are kept as
        consecutive table rows so a Markdown table stays intact;
      * ALL-CAPS lines of 6 to 99 characters become ``##`` headings;
      * everything else becomes its own paragraph.
    A blank line is inserted when a list or table block ends, so the next
    paragraph is not absorbed into it.

    For every image, OCR is attempted and the image is uploaded via
    ``upload_image_to_hf``; the resulting link (or the upload error) and
    the OCR text (or the OCR failure note) are appended.

    Args:
        text_content: Extracted text; ``None`` is treated as empty.
        images_list: List of images. Anything that is not a non-empty list
            produces no image section.

    Returns:
        str: The Markdown document, without leading/trailing whitespace.
    """
    # Collapse runs of blank lines into a single blank line.
    text_content = re.sub(r'\n\s*\n+', '\n\n', (text_content or "").strip())

    pieces = ["# Extracted PDF Content\n\n"]
    open_block = None  # "list" or "table" while inside such a block
    for line_text in text_content.split('\n'):
        line_stripped = line_text.strip()
        if not line_stripped:
            pieces.append("\n")
            open_block = None
            continue

        list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
        is_table_row = line_stripped.startswith("|") or " | " in line_stripped
        is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100

        if list_match:
            kind, rendered = "list", f"- {list_match.group(1)}\n"
        elif is_table_row:
            # Rows must stay on consecutive lines or the table will not render.
            kind, rendered = "table", f"{line_stripped}\n"
        elif is_heading_candidate:
            kind, rendered = None, f"## {line_stripped}\n\n"
        else:
            kind, rendered = None, f"{line_text}\n\n"

        if open_block is not None and kind != open_block:
            pieces.append("\n")  # Close the previous list/table block
        pieces.append(rendered)
        open_block = kind

    # Consolidate any excess blank lines produced above.
    body = re.sub(r'\n\s*\n+', '\n\n', "".join(pieces).strip())
    sections = [body, "\n\n"]

    if isinstance(images_list, list) and images_list:
        sections.append("## Extracted Images\n\n")
        for i, img_pil in enumerate(images_list):
            ocr_text = ""
            ocr_error = None  # Kept separate so real OCR text is never mistaken for a failure
            try:
                ocr_text = pytesseract.image_to_string(img_pil).strip()
                logger.info(f"OCR for image {i+1} successful.")
            except Exception as ocr_e:
                logger.error(f"OCR error for image {i+1}: {str(ocr_e)}")
                ocr_error = f"OCR failed: {str(ocr_e)}"

            image_url_or_error = upload_image_to_hf(img_pil, f"extracted_image_{i+1}")

            if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
                sections.append(f"![Image {i+1}]({image_url_or_error})\n\n")
                if ocr_error:
                    sections.append(f"**Image {i+1} OCR Note:** {ocr_error}\n\n")
                elif ocr_text:
                    sections.append(f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n")
            else:
                # Upload failed, or ensure_hf_dataset reported an error.
                sections.append(f"**Image {i+1} (Upload Error):** {image_url_or_error}\n\n")

    return "".join(sections).strip()
221
+
222
+
223
def process_pdf(pdf_file_upload, pdf_url_input):
    """Convert a PDF (uploaded file or URL) into Markdown.

    The URL takes precedence when both inputs are given. A URL is first
    probed with a HEAD request and rejected unless its Content-Type
    contains ``application/pdf``.

    Args:
        pdf_file_upload: The value delivered by the file input (a file
            object or a path string, depending on the UI configuration),
            or a falsy value when nothing was uploaded.
        pdf_url_input: URL text, possibly URL-encoded; may be empty/None.

    Returns:
        tuple[str, str]: ``(markdown_or_error, status)``. A missing
        HF_TOKEN and a failed image extraction do not abort processing;
        they are reported as notes appended to the final status.
    """
    current_status = "Starting PDF processing..."
    logger.info(current_status)

    notes = []  # Non-fatal problems carried into the final status message

    if not HF_TOKEN:
        # Text extraction can still proceed; only image uploads will fail.
        token_note = "Error: HF_TOKEN is not set. Please set it in Space secrets for image uploads."
        logger.error(token_note)
        notes.append(token_note)

    pdf_input_source = None

    if pdf_url_input and pdf_url_input.strip():
        resolved_url = urllib.parse.unquote(pdf_url_input.strip())
        current_status = f"Attempting to download PDF from URL: {resolved_url}"
        logger.info(current_status)
        try:
            # HEAD request: cheap check of reachability and content type.
            response = requests.head(resolved_url, allow_redirects=True, timeout=10)
            response.raise_for_status()
            content_type = response.headers.get('content-type', '').lower()
            if 'application/pdf' not in content_type:
                current_status = f"Error: URL does not point to a PDF file (Content-Type: {content_type})."
                logger.error(current_status)
                return current_status, current_status
            pdf_input_source = resolved_url
            logger.info("PDF URL validated.")
        except requests.RequestException as e:
            current_status = f"Error accessing URL '{resolved_url}': {str(e)}"
            logger.error(current_status)
            return current_status, current_status
    elif pdf_file_upload:
        pdf_input_source = pdf_file_upload
        # The upload may be a file object with ``name`` or a bare path string.
        upload_name = getattr(pdf_file_upload, "name", pdf_file_upload)
        current_status = f"Processing uploaded PDF file: {upload_name}"
        logger.info(current_status)
    else:
        current_status = "Error: No PDF file uploaded and no PDF URL provided."
        logger.error(current_status)
        return current_status, current_status

    current_status = "Extracting text and tables from PDF..."
    logger.info(current_status)
    extracted_text = extract_text_from_pdf(pdf_input_source)
    if isinstance(extracted_text, str) and extracted_text.startswith("Error extracting text:"):
        current_status = f"Text extraction failed. {extracted_text}"
        logger.error(current_status)
        return extracted_text, current_status

    # Text extraction may have moved the read pointer of a file object;
    # rewind it before the image pass. A closed or unseekable handle is not
    # fatal here.
    if hasattr(pdf_input_source, 'seek') and not isinstance(pdf_input_source, str):
        try:
            pdf_input_source.seek(0)
        except (ValueError, OSError) as seek_error:
            logger.warning(f"Could not rewind uploaded PDF: {seek_error}")

    current_status = "Extracting images from PDF..."
    logger.info(current_status)
    extracted_images = extract_images_from_pdf(pdf_input_source)
    if isinstance(extracted_images, str) and extracted_images.startswith("Error"):
        # Keep going: the text is still useful without images.
        image_note = f"Image extraction failed or partially failed. {extracted_images}"
        logger.warning(image_note)
        notes.append(image_note)
        extracted_images = []

    current_status = "Formatting content to Markdown..."
    logger.info(current_status)
    markdown_result = format_to_markdown(extracted_text, extracted_images)

    current_status = " ".join(["PDF processing complete."] + notes)
    logger.info(current_status)
    return markdown_result, current_status
306
 
307
  # Gradio Interface
308
# Two inputs (file upload, URL) map to process_pdf's two parameters; the two
# outputs receive its (markdown, status) return value.
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF File", file_types=[".pdf"]),
        gr.Textbox(label="Or Enter PDF URL", placeholder="e.g., https://example.com/file.pdf"),
    ],
    outputs=[
        gr.Markdown(label="Markdown Output"),
        gr.Textbox(label="Processing Status", interactive=False),
    ],
    title="PDF to Markdown Converter",
    description="Convert a PDF (uploaded file or URL) to Markdown. Extracts text, tables, and images. Images are uploaded to a Hugging Face dataset. Requires HF_TOKEN in Spaces Secrets for image functionality.",
    allow_flagging="never",
    examples=[
        # The scheme separator was missing ("https.arxiv.org"), which made
        # this example an invalid URL.
        [None, "https://arxiv.org/pdf/1706.03762.pdf"],  # Attention is All You Need
        [None, "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"],  # A simple dummy PDF
    ],
)
326
 
327
if __name__ == "__main__":
    logger.info("Starting Gradio app...")
    # share=False: no public share link is requested from Gradio; the app
    # listens on all interfaces on port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860, "share": False}
    try:
        iface.launch(**launch_options)
        logger.info("Gradio app started successfully.")
    except Exception as e:
        # Log with traceback, then re-raise so the script exits on failure.
        logger.exception(f"Failed to start Gradio app: {str(e)}")
        raise