avans06 committed
Commit 69656bf · 1 Parent(s): 0c0ff58

Add direct HTML text input mode

Implements a new feature allowing users to convert HTML by pasting it directly into a textarea.
This provides an alternative to the URL crawler for offline or single-file conversions.
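
For context, the core of the new mode is a single pypandoc.convert_text call (wrapped as convert_html_text_to_md_string in app.py). The sketch below is illustrative rather than a copy of the app code: the helper name is made up, but the input/output formats and extra_args mirror the commit, and it assumes Pandoc is available (e.g. via pip install pypandoc_binary).

import pypandoc

def html_text_to_markdown(html_content: str, aggressive: bool = True) -> str:
    """Convert an HTML string to GitHub-flavored Markdown (illustrative helper)."""
    # Aggressive mode drops raw HTML and forces ATX (#) headings; both modes disable line wrapping.
    to_format = 'gfm-raw_html+hard_line_breaks' if aggressive else 'gfm+hard_line_breaks'
    extra_args = ['--wrap=none', '--markdown-headings=atx'] if aggressive else ['--wrap=none']
    return pypandoc.convert_text(
        source=html_content,
        to=to_format,
        format='html+smart',
        extra_args=extra_args,
    )

print(html_text_to_markdown("<h1>Title</h1><p>This is a <b>paragraph</b>.</p>"))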

Files changed (3)
  1. README.md +1 -1
  2. app.py +298 -225
  3. requirements.txt +1 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 📝
  colorFrom: green
  colorTo: indigo
  sdk: gradio
- sdk_version: 5.29.0
+ sdk_version: 5.34.2
  app_file: app.py
  pinned: false
  license: mit
app.py CHANGED
@@ -100,6 +100,32 @@ def convert_html_to_md(html_content, output_md_path, pandoc_output_format, pando
100
  logging.warning(f"Could not remove empty/failed output file {output_md_path}: {remove_err}")
101
  return False
102
103
 
104
  def create_zip_archive(source_dir, output_zip_path):
105
  """Creates a ZIP archive from the contents of source_dir."""
@@ -116,260 +142,253 @@ def create_zip_archive(source_dir, output_zip_path):
116
  except Exception as e:
117
  logging.error(f"Failed to create ZIP archive {output_zip_path}: {e}")
118
  return False
119
-
120
- # --- Main Gradio Function ---
121
- def process_conversion_request(start_url_str, restrict_path, use_aggressive_conversion, progress=gr.Progress(track_tqdm=True)):
122
- """The main function triggered by the Gradio interface."""
123
 
124
- # --- 0. Check Pandoc via pypandoc ---
125
  if not check_pandoc_available():
126
- return "Error: pypandoc could not find a Pandoc executable. Please ensure Pandoc is installed or install `pypandoc_binary`.", None
127
-
128
- # --- 1. Validate URL and Determine Restriction Path ---
129
- start_url_str = start_url_str.strip()
130
- start_path_dir_for_restriction = None # Initialize restriction path base
131
-
132
- if not start_url_str:
133
- return "Error: Starting URL cannot be empty.", None
134
- try:
135
- parsed_start_url = urlparse(start_url_str)
136
- if not parsed_start_url.scheme or not parsed_start_url.netloc:
137
- raise ValueError("Invalid URL format (missing scheme or domain).")
138
- base_netloc = parsed_start_url.netloc
139
- base_scheme = parsed_start_url.scheme
140
-
141
- # Calculate the base directory path for comparison if restriction is enabled
142
- start_path_cleaned = parsed_start_url.path.strip('/')
143
- if start_path_cleaned: # If not root path
144
- # Use os.path.dirname to get the directory part
145
- # dirname('main/index.html') -> 'main'
146
- # dirname('main') -> '' (This needs correction if start URL is like /main/)
147
- # Let's adjust: if no '/' it means it's the first level dir or a root file
148
- if '/' not in start_path_cleaned and '.' not in start_path_cleaned:
149
- start_path_dir_for_restriction = start_path_cleaned # e.g. 'main'
150
- else:
151
- start_path_dir_for_restriction = os.path.dirname(start_path_cleaned) # e.g. 'main' from main/index.html, or '' from /index.html
152
- if start_path_dir_for_restriction == '': # Handle case like /index.html correctly
153
- start_path_dir_for_restriction = None # Treat like root, don't restrict path based on this
154
-
155
- except ValueError as e:
156
- return f"Error: Invalid starting URL '{start_url_str}': {e}", None
157
-
158
- # Log restriction status
159
- restriction_msg = f"Path restriction enabled: limiting to paths starting like '{start_path_dir_for_restriction}/'." if restrict_path and start_path_dir_for_restriction else "Path restriction disabled or starting from root."
160
 
161
  # --- Determine Pandoc Settings based on Checkbox ---
162
  # wrap=none, Prevent auto-wrapping lines
163
  if use_aggressive_conversion:
164
  pandoc_format_to_use = 'gfm-raw_html+hard_line_breaks'
165
  pandoc_args_to_use = ['--wrap=none', '--markdown-headings=atx']
166
- conversion_mode_msg = "Using aggressive Markdown conversion (less raw HTML, ATX headers)."
167
  else:
168
  # Using gfm+hard_line_breaks ensures GitHub compatibility and respects single newlines
169
  pandoc_format_to_use = 'gfm+hard_line_breaks'
170
  pandoc_args_to_use = ['--wrap=none']
171
- conversion_mode_msg = "Using standard Markdown conversion (may preserve more raw HTML)."
172
-
173
  logging.info(conversion_mode_msg) # Log the mode
174
 
175
- # --- 2. Setup Temporary Directory & Crawler ---
176
- staging_dir = tempfile.mkdtemp(prefix="md_convert_")
177
- logging.info(f"Created temporary staging directory: {staging_dir}")
178
- output_zip_file = None
179
-
180
- urls_to_process = Queue()
181
- processed_urls = set() # Still needed to avoid duplicates
182
- failed_urls = set()
183
- converted_count = 0
184
- url_count_estimate = 1 # Total unique URLs discovered so far (starts with the first one)
185
- dequeued_count = 0
186
-
187
- urls_to_process.put(start_url_str)
188
- processed_urls.add(start_url_str) # Add start URL here
189
-
190
- log_messages = ["Process started...", restriction_msg, conversion_mode_msg]
191
-
192
- try:
193
- # --- 3. Crawl and Convert Loop ---
194
- while not urls_to_process.empty():
195
- # --- Get URL and Increment Dequeued Count ---
196
- current_url = urls_to_process.get()
197
- dequeued_count += 1 # Increment when an item is taken for processing
198
-
199
- # --- Update Progress Bar ---
200
- # Calculate progress based on dequeued items vs. total discovered
201
- # Denominator is the total number of unique URLs added to processed_urls/queue so far
202
- denominator = max(1, url_count_estimate) # url_count_estimate increases when new links are found
203
- current_progress_value = dequeued_count / denominator
204
-
205
- # Update Gradio progress - use dequeued_count for user display
206
- # Display: Processed X / Total_Discovered Y
207
- progress(current_progress_value, desc=f"Processing {dequeued_count}/{url_count_estimate}. Queue: {urls_to_process.qsize()}")
208
-
209
- # --- Process the current URL ---
210
- log_message = f"\nProcessing ({dequeued_count}/{url_count_estimate}): {current_url}"
211
- logging.info(log_message)
212
- log_messages.append(log_message)
213
-
214
- # --- 3a. Fetch HTML ---
215
- time.sleep(POLITENESS_DELAY)
216
- html_content = fetch_html(current_url)
217
- if not html_content:
218
- failed_urls.add(current_url)
219
- log_message = f" -> Failed to fetch content."
220
- logging.warning(log_message)
221
- log_messages.append(log_message)
222
- continue
223
-
224
- # --- 3b. Determine Output Path ---
225
- parsed_current_url = urlparse(current_url)
226
- # Get the path part of the URL, removing leading/trailing slashes
227
- url_path_segment = parsed_current_url.path.strip('/') # e.g., "main/index.html", "HEAD/index.html", ""
228
- # If the path is empty (domain root like https://example.com/), use 'index' as the base name
229
- if not url_path_segment:
230
- path_in_zip_base = 'index'
231
- else:
232
- path_in_zip_base = url_path_segment # e.g., "main/index.html", "HEAD/index.html"
233
-
234
- # Now, determine the final .md filename based on the path base
235
- if path_in_zip_base.lower().endswith('.html'):
236
- relative_md_filename = os.path.splitext(path_in_zip_base)[0] + ".md"
237
- elif path_in_zip_base.endswith('/'): # Should not happen often with strip('/') but handle defensively
238
- # If URL was like /docs/, path_in_zip_base would be 'docs' after strip.
239
- # This case is less likely needed now, but safe to keep.
240
- relative_md_filename = os.path.join(path_in_zip_base, "index.md")
241
- else:
242
- # If it's not empty and doesn't end with .html, assume it's a directory path
243
- # Append 'index.md' to treat it like accessing a directory index
244
- # e.g., if URL path was /main, url_path_segment is 'main', output becomes 'main/index.md'
245
- # If URL path was /path/to/file (no .html), output becomes 'path/to/file.md' if '.' in basename, else 'path/to/file/index.md'
246
- basename = os.path.basename(path_in_zip_base)
247
- if '.' in basename: # Check if it looks like a file without .html extension
248
- relative_md_filename = path_in_zip_base + ".md"
249
- else: # Assume it's a directory reference
250
- relative_md_filename = os.path.join(path_in_zip_base, "index.md")
251
-
252
- # Construct full path within the temporary staging directory
253
- output_md_full_path = os.path.join(staging_dir, relative_md_filename)
254
- output_md_dir = os.path.dirname(output_md_full_path)
255
 
256
- # Create directories if they don't exist (check if output_md_dir is not empty)
257
  try:
258
- if output_md_dir and not os.path.exists(output_md_dir):
259
- os.makedirs(output_md_dir)
260
- except OSError as e:
261
- log_message = f" -> Error creating directory {output_md_dir}: {e}. Skipping conversion for this URL."
262
- logging.error(log_message)
263
- log_messages.append(log_message)
264
- failed_urls.add(current_url)
265
- continue # Skip to next URL
266
-
267
- # --- 3c. Convert HTML to Markdown ---
268
- if convert_html_to_md(html_content, output_md_full_path, pandoc_format_to_use, pandoc_args_to_use):
269
- converted_count += 1
270
- log_message = f" -> Converted successfully to {os.path.relpath(output_md_full_path, staging_dir)}"
271
  logging.info(log_message)
272
  log_messages.append(log_message)
273
- else:
274
- failed_urls.add(current_url)
275
- log_message = f" -> Conversion failed."
276
- logging.warning(log_message)
277
- log_messages.append(log_message)
278
-
279
- # --- 3d. Find and Add New Links ---
280
- try:
281
  soup = BeautifulSoup(html_content, 'lxml')
282
- links_found_this_page = 0
283
- links_skipped_due_to_path = 0
284
  for link in soup.find_all('a', href=True):
285
- href = link['href']
286
- absolute_url = urljoin(current_url, href)
287
- absolute_url = urlparse(absolute_url)._replace(fragment="").geturl()
288
  parsed_absolute_url = urlparse(absolute_url)
289
-
290
  # Basic Filtering (scheme, domain, looks like html)
291
  is_valid_target = (
292
  parsed_absolute_url.scheme == base_scheme and
293
- parsed_absolute_url.netloc == base_netloc and
294
- (not parsed_absolute_url.path or
295
- parsed_absolute_url.path == '/' or
296
- parsed_absolute_url.path.lower().endswith('.html') or
297
- '.' not in os.path.basename(parsed_absolute_url.path.rstrip('/')) # Include directory links
298
- )
299
- )
300
-
301
- if not is_valid_target:
302
- continue # Skip invalid links early
303
 
304
  # --- Path Restriction Check ---
305
  path_restricted = False
306
  # Only apply if checkbox is checked AND we derived a non-root restriction path
307
- if restrict_path and start_path_dir_for_restriction is not None:
308
- candidate_path_clean = parsed_absolute_url.path.strip('/')
309
  # Check if the cleaned candidate path starts with the restriction dir + '/'
310
  # OR if the candidate path is exactly the restriction dir (e.g. /main matching main)
311
- if not (candidate_path_clean.startswith(start_path_dir_for_restriction + '/') or \
312
- candidate_path_clean == start_path_dir_for_restriction):
313
  path_restricted = True
314
- links_skipped_due_to_path += 1
315
  # --- End Path Restriction Check ---
316
-
317
  # Add to queue only if NOT restricted and NOT already processed
318
  if not path_restricted and absolute_url not in processed_urls:
319
  processed_urls.add(absolute_url) # Add to set immediately
320
  urls_to_process.put(absolute_url)
321
- links_found_this_page += 1
322
  url_count_estimate += 1
323
 
324
- # Log link discovery summary for the page
325
- log_links_msg = f" -> Found {links_found_this_page} new link(s) to process."
326
- if links_skipped_due_to_path > 0:
327
- log_links_msg += f" Skipped {links_skipped_due_to_path} link(s) due to path restriction."
328
- logging.info(log_links_msg)
329
- log_messages.append(log_links_msg)
330
- except Exception as e:
331
- log_message = f" -> Error parsing links on {current_url}: {e}"
332
- logging.error(log_message)
333
- log_messages.append(log_message)
334
 
335
- # --- 4. Create ZIP Archive ---
336
- progress(1.0, desc="Zipping files...")
337
- log_messages.append("\nCrawling complete. Creating ZIP file...")
338
- yield "\n".join(log_messages), None
339
-
340
- with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as temp_zip:
341
- output_zip_path = temp_zip.name
342
-
343
- if create_zip_archive(staging_dir, output_zip_path):
344
- log_messages.append(f"\nProcess finished successfully!")
345
- log_messages.append(f"Converted {converted_count} pages using {'aggressive' if use_aggressive_conversion else 'standard'} mode.") # Inform user of mode used
346
- if failed_urls:
347
- log_messages.append(f"Failed to process {len(failed_urls)} URLs (check logs).")
348
- log_messages.append(f"ZIP file ready: {os.path.basename(output_zip_path)}")
349
- yield "\n".join(log_messages), output_zip_path
350
- else:
351
- log_messages.append("\nError: Failed to create the final ZIP archive.")
352
- yield "\n".join(log_messages), None
353
 
354
- except KeyboardInterrupt:
355
- log_messages.append("\nProcess interrupted by user.")
356
- yield "\n".join(log_messages), None
357
- except Exception as e:
358
- log_messages.append(f"\nAn unexpected error occurred: {e}")
359
- logging.error("Unhandled exception in process_conversion_request:")
360
- logging.error(traceback.format_exc())
361
- yield "\n".join(log_messages), None
362
- finally:
363
- # --- 5. Cleanup ---
364
- if os.path.exists(staging_dir):
365
- try:
366
  shutil.rmtree(staging_dir)
367
  logging.info(f"Cleaned up temporary directory: {staging_dir}")
368
- except Exception as e:
369
- logging.error(f"Error cleaning up temporary directory {staging_dir}: {e}")
370
 
371
  css = """
372
  textarea[rows]:not([rows="1"]) {
373
  overflow-y: auto !important;
374
  scrollbar-width: thin !important;
375
  }
@@ -384,45 +403,99 @@ textarea[rows]:not([rows="1"])::-webkit-scrollbar-thumb {
384
  """
385
 
386
  # --- Gradio UI Definition ---
387
- with gr.Blocks(title="HTML Docs to Markdown Converter", css=css) as demo:
388
  gr.Markdown(
389
  """
390
- # HTML Documentation to Markdown Converter (via pypandoc)
391
- Enter the starting `index.html` URL of an online documentation site.
392
- The script will crawl internal HTML links, convert pages to Markdown, and package results into a ZIP file.
393
- **Requires `pip install pypandoc_binary`**.
394
  """
395
  )
396
 
397
- with gr.Row():
398
  url_input = gr.Textbox(
399
  label="Starting Index HTML URL",
400
  placeholder="e.g., https://dghs-imgutils.deepghs.org/main/index.html"
401
  )
402
-
403
- with gr.Row():
404
  restrict_path_checkbox = gr.Checkbox(
405
  label="Restrict crawl to starting path structure (e.g., if start is '/main/index.html', only crawl '/main/...' URLs)",
406
  value=True # Default to restricting path
407
  )
408
  aggressive_md_checkbox = gr.Checkbox(
409
  label="Aggressive Markdown conversion (disable raw HTML, use ATX headers)",
410
  value=True # Default to aggressive conversion
411
  )
412
-
413
  with gr.Row():
414
  start_button = gr.Button("Start Conversion", variant="primary")
415
 
416
- with gr.Row():
417
- log_output = gr.Textbox(label="Progress Logs", lines=15, interactive=False, show_copy_button=True)
418
-
419
- with gr.Row():
420
- zip_output = gr.File(label="Download Markdown ZIP")
421
 
 
422
  start_button.click(
423
  fn=process_conversion_request,
424
- inputs=[url_input, restrict_path_checkbox, aggressive_md_checkbox],
425
- outputs=[log_output, zip_output],
426
  show_progress="full"
427
  )
428
 
 
100
  logging.warning(f"Could not remove empty/failed output file {output_md_path}: {remove_err}")
101
  return False
102
 
103
+ # --- Function for direct HTML to Markdown conversion ---
104
+ def convert_html_text_to_md_string(html_content, pandoc_output_format, pandoc_extra_args):
105
+ """
106
+ Converts an HTML string directly to a Markdown string using pypandoc.
107
+ """
108
+ if not html_content or not html_content.strip():
109
+ logging.warning("Input HTML content is empty. Conversion skipped.")
110
+ return None, "Error: HTML content cannot be empty."
111
+
112
+ input_format = 'html+smart'
113
+ try:
114
+ logging.debug(f"pypandoc converting text to {pandoc_output_format} with args: {pandoc_extra_args}")
115
+ output_md = pypandoc.convert_text(
116
+ source=html_content,
117
+ to=pandoc_output_format,
118
+ format=input_format,
119
+ extra_args=pandoc_extra_args,
120
+ encoding='utf-8'
121
+ )
122
+ logging.info("Successfully converted HTML text to Markdown string.")
123
+ return output_md, "Conversion successful."
124
+ except Exception as e:
125
+ error_msg = f"Error during pypandoc conversion: {e}"
126
+ logging.error(error_msg)
127
+ logging.error(traceback.format_exc())
128
+ return None, error_msg
129
 
130
  def create_zip_archive(source_dir, output_zip_path):
131
  """Creates a ZIP archive from the contents of source_dir."""
 
142
  except Exception as e:
143
  logging.error(f"Failed to create ZIP archive {output_zip_path}: {e}")
144
  return False
145
 
146
+ # --- Main Gradio Function (handles both modes) ---
147
+ # The function now handles both URL and direct HTML text input.
148
+ # It needs to be a generator (`yield`) to support progress updates in URL mode.
149
+ def process_conversion_request(
150
+ input_type, start_url_str, html_text_input,
151
+ restrict_path, use_aggressive_conversion,
152
+ progress=gr.Progress(track_tqdm=True)
153
+ ):
154
+ """The main function triggered by the Gradio interface, handling both modes."""
155
+ # --- 0. Check Pandoc Availability ---
156
  if not check_pandoc_available():
157
+ error_msg = "Error: Pandoc executable not found. Please ensure Pandoc is installed or run `pip install pypandoc_binary`."
158
+ # Yield a final state for all outputs
159
+ yield error_msg, None, gr.Markdown(visible=False), None
160
+ return
161
 
162
  # --- Determine Pandoc Settings based on Checkbox ---
163
  # wrap=none, Prevent auto-wrapping lines
164
  if use_aggressive_conversion:
165
  pandoc_format_to_use = 'gfm-raw_html+hard_line_breaks'
166
  pandoc_args_to_use = ['--wrap=none', '--markdown-headings=atx']
167
+ conversion_mode_msg = "Using aggressive conversion mode (disabling raw HTML, using ATX headers)."
168
  else:
169
  # Using gfm+hard_line_breaks ensures GitHub compatibility and respects single newlines
170
  pandoc_format_to_use = 'gfm+hard_line_breaks'
171
  pandoc_args_to_use = ['--wrap=none']
172
+ conversion_mode_msg = "Using standard conversion mode (may preserve more raw HTML)."
173
  logging.info(conversion_mode_msg) # Log the mode
174
 
175
+ # --- MODE 1: Convert from URL ---
176
+ if input_type == "Convert from URL":
177
+ staging_dir = None # Initialize to ensure it exists for the finally block
178
+ try:
179
+ # --- 1. Validate URL and Determine Restriction Path ---
180
+ start_url_str = start_url_str.strip()
181
+ if not start_url_str:
182
+ yield "Error: Starting URL cannot be empty.", None, gr.Markdown(visible=False), None
183
+ return
184
 
 
185
  try:
186
+ parsed_start_url = urlparse(start_url_str)
187
+ if not parsed_start_url.scheme or not parsed_start_url.netloc:
188
+ raise ValueError("Invalid URL format (missing scheme or domain).")
189
+ base_netloc = parsed_start_url.netloc
190
+ base_scheme = parsed_start_url.scheme
191
+
192
+ # Calculate the base directory path for comparison if restriction is enabled
193
+ start_path_cleaned = parsed_start_url.path.strip('/')
194
+ start_path_dir_for_restriction = None # Initialize restriction path base
195
+ if start_path_cleaned: # If not root path
196
+ # Use os.path.dirname to get the directory part
197
+ # dirname('main/index.html') -> 'main'
198
+ # dirname('main') -> '' (This needs correction if start URL is like /main/)
199
+ # Let's adjust: if no '/' it means it's the first level dir or a root file
200
+ if '/' not in start_path_cleaned and '.' not in start_path_cleaned:
201
+ start_path_dir_for_restriction = start_path_cleaned # e.g. 'main'
202
+ else:
203
+ start_path_dir_for_restriction = os.path.dirname(start_path_cleaned) # e.g. 'main' from main/index.html, or '' from /index.html
204
+ if start_path_dir_for_restriction == '': # Handle case like /index.html correctly
205
+ start_path_dir_for_restriction = None # Treat like root, don't restrict path based on this
206
+
207
+ except ValueError as e:
208
+ yield f"Error: Invalid starting URL '{start_url_str}': {e}", None, gr.Markdown(visible=False), None
209
+ return
210
+
211
+ # Log restriction status
212
+ restriction_msg = f"Path restriction enabled: limiting to paths starting like '{start_path_dir_for_restriction}/'." if restrict_path and start_path_dir_for_restriction else "Path restriction disabled or starting from root."
213
+
214
+ # --- 2. Setup Temporary Directory & Crawler ---
215
+ staging_dir = tempfile.mkdtemp(prefix="md_convert_")
216
+ logging.info(f"Created temporary directory: {staging_dir}")
217
+
218
+ urls_to_process = Queue()
219
+ processed_urls = set() # Still needed to avoid duplicates
220
+ urls_to_process.put(start_url_str)
221
+ processed_urls.add(start_url_str) # Add start URL here
222
+ failed_urls = set()
223
+ converted_count = 0
224
+ url_count_estimate = 1 # Total unique URLs discovered so far (starts with the first one)
225
+ dequeued_count = 0
226
+
227
+ log_messages = ["Process started...", restriction_msg, conversion_mode_msg]
228
+ yield "\n".join(log_messages), None, gr.Markdown(visible=False), None
229
+
230
+ # --- 3. Crawl and Convert Loop ---
231
+ while not urls_to_process.empty():
232
+ # --- Get URL and Increment Dequeued Count ---
233
+ current_url = urls_to_process.get()
234
+ dequeued_count += 1 # Increment when an item is taken for processing
235
+
236
+ # --- Update Progress Bar ---
237
+ # Calculate progress based on dequeued items vs. total discovered
238
+ # Denominator is the total number of unique URLs added to processed_urls/queue so far
239
+ denominator = max(1, url_count_estimate) # url_count_estimate increases when new links are found
240
+ current_progress_value = dequeued_count / denominator
241
+
242
+ # Update Gradio progress - use dequeued_count for user display
243
+ # Display: Processed X / Total_Discovered Y
244
+ progress(current_progress_value, desc=f"Processing {dequeued_count}/{url_count_estimate}. Queue: {urls_to_process.qsize()}")
245
+
246
+ # --- Process the current URL ---
247
+ log_message = f"\nProcessing ({dequeued_count}/{url_count_estimate}): {current_url}"
248
  logging.info(log_message)
249
  log_messages.append(log_message)
250
+
251
+ # Fetch HTML
252
+ time.sleep(POLITENESS_DELAY)
253
+ html_content = fetch_html(current_url)
254
+ if not html_content:
255
+ failed_urls.add(current_url)
256
+ log_message = f" -> Failed to fetch content."
257
+ logging.warning(log_message)
258
+ log_messages.append(log_message)
259
+ continue
260
+
261
+ # Determine Output Path
262
+ parsed_current_url = urlparse(current_url)
263
+ # Get the path part of the URL, removing leading/trailing slashes
264
+ url_path_segment = parsed_current_url.path.strip('/') or 'index' # e.g., "main/index.html", "HEAD/index.html", ""
265
+
266
+ # Now, determine the final .md filename based on the path base
267
+ if url_path_segment.lower().endswith('.html'):
268
+ relative_md_filename = os.path.splitext(url_path_segment)[0] + ".md"
269
+ else:
270
+ # If it's not empty and doesn't end with .html, assume it's a directory path
271
+ # Append 'index.md' to treat it like accessing a directory index
272
+ # e.g., if URL path was /main, url_path_segment is 'main', output becomes 'main/index.md'
273
+ # If URL path was /path/to/file (no .html), output becomes 'path/to/file.md' if '.' in basename, else 'path/to/file/index.md'
274
+ basename = os.path.basename(url_path_segment)
275
+ if '.' in basename: # Check if it looks like a file without .html extension
276
+ relative_md_filename = url_path_segment + ".md"
277
+ else: # Assume it's a directory reference
278
+ relative_md_filename = os.path.join(url_path_segment, "index.md")
279
+
280
+ # Construct full path within the temporary staging directory
281
+ output_md_full_path = os.path.join(staging_dir, relative_md_filename)
282
+ os.makedirs(os.path.dirname(output_md_full_path), exist_ok=True)
283
+
284
+ # Convert HTML to Markdown
285
+ if convert_html_to_md(html_content, output_md_full_path, pandoc_format_to_use, pandoc_args_to_use):
286
+ converted_count += 1
287
+ log_messages.append(f" -> Converted successfully to {os.path.relpath(output_md_full_path, staging_dir)}")
288
+ else:
289
+ failed_urls.add(current_url)
290
+ log_messages.append(" -> Conversion failed.")
291
+
292
+ # Find and Add New Links
293
  soup = BeautifulSoup(html_content, 'lxml')
294
  for link in soup.find_all('a', href=True):
295
+ absolute_url = urljoin(current_url, link['href']).split('#', 1)[0]
296
  parsed_absolute_url = urlparse(absolute_url)
297
+
298
  # Basic Filtering (scheme, domain, looks like html)
299
  is_valid_target = (
300
  parsed_absolute_url.scheme == base_scheme and
301
+ parsed_absolute_url.netloc == base_netloc)
302
 
303
+ if not is_valid_target: continue # Skip invalid links early
304
+
305
  # --- Path Restriction Check ---
306
  path_restricted = False
307
  # Only apply if checkbox is checked AND we derived a non-root restriction path
308
+ if restrict_path and start_path_dir_for_restriction:
309
+ candidate_path = parsed_absolute_url.path.strip('/')
310
  # Check if the cleaned candidate path starts with the restriction dir + '/'
311
  # OR if the candidate path is exactly the restriction dir (e.g. /main matching main)
312
+ if not (candidate_path.startswith(start_path_dir_for_restriction + '/') or candidate_path == start_path_dir_for_restriction):
313
  path_restricted = True
314
  # --- End Path Restriction Check ---
315
+
316
  # Add to queue only if NOT restricted and NOT already processed
317
  if not path_restricted and absolute_url not in processed_urls:
318
  processed_urls.add(absolute_url) # Add to set immediately
319
  urls_to_process.put(absolute_url)
320
  url_count_estimate += 1
321
 
322
+ # --- 4. Create ZIP Archive ---
323
+ progress(1.0, desc="Zipping files...")
324
+ log_messages.append("\nCrawling complete. Creating ZIP file...")
325
+ yield "\n".join(log_messages), None, gr.Markdown(visible=False), None
326
 
327
+ with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as temp_zip:
328
+ output_zip_path = temp_zip.name
329
 
330
+ if create_zip_archive(staging_dir, output_zip_path):
331
+ log_messages.append(f"\nProcess finished successfully!")
332
+ log_messages.append(f"Converted {converted_count} pages using {'aggressive' if use_aggressive_conversion else 'standard'} mode.")
333
+ if failed_urls:
334
+ log_messages.append(f"Failed to process {len(failed_urls)} URLs.")
335
+ yield "\n".join(log_messages), output_zip_path, gr.Markdown(visible=False), None
336
+ else:
337
+ log_messages.append("\nError: Failed to create the final ZIP archive.")
338
+ yield "\n".join(log_messages), None, gr.Markdown(visible=False), None
339
+
340
+ except Exception as e:
341
+ error_log = f"\nAn unexpected error occurred: {e}\n{traceback.format_exc()}"
342
+ logging.error(error_log)
343
+ yield error_log, None, gr.Markdown(visible=False), None
344
+ finally:
345
+ # --- Cleanup ---
346
+ if staging_dir and os.path.exists(staging_dir):
347
  shutil.rmtree(staging_dir)
348
  logging.info(f"Cleaned up temporary directory: {staging_dir}")
349
+
350
+ # --- MODE 2: Convert from HTML Text ---
351
+ elif input_type == "Convert from HTML Text":
352
+ log_messages = [f"Process started...", conversion_mode_msg]
353
+
354
+ if not html_text_input or not html_text_input.strip():
355
+ log_messages.append("Error: HTML content cannot be empty.")
356
+ yield "\n".join(log_messages), None, gr.Markdown(visible=False), None
357
+ return
358
+
359
+ progress(0.5, desc="Converting HTML text...")
360
+
361
+ # Use the dedicated string conversion function
362
+ markdown_output, status_msg = convert_html_text_to_md_string(
363
+ html_text_input, pandoc_format_to_use, pandoc_args_to_use
364
+ )
365
+
366
+ log_messages.append(status_msg)
367
+ progress(1.0, desc="Complete")
368
+
369
+ if markdown_output is not None:
370
+ # Create a temporary file for download
371
+ with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix=".md", delete=False) as temp_md:
372
+ temp_md.write(markdown_output)
373
+ temp_md_path = temp_md.name
374
+
375
+ log_messages.append("\nMarkdown has been generated. You can preview it below or download the file.")
376
+
377
+ # Yield the final state: update logs, clear zip, show markdown preview, provide md file
378
+ yield ("\n".join(log_messages),
379
+ None,
380
+ gr.Markdown(value=markdown_output, visible=True),
381
+ temp_md_path)
382
+ else:
383
+ # Conversion failed, show logs and hide/clear other outputs
384
+ yield ("\n".join(log_messages),
385
+ None,
386
+ gr.Markdown(visible=False),
387
+ None)
388
 
389
  css = """
390
  textarea[rows]:not([rows="1"]) {
391
+ height: 250px; /* Give the HTML input box a fixed height */
392
  overflow-y: auto !important;
393
  scrollbar-width: thin !important;
394
  }
 
403
  """
404
 
405
  # --- Gradio UI Definition ---
406
+ with gr.Blocks(title="HTML to Markdown Converter", css=css) as demo:
407
  gr.Markdown(
408
  """
409
+ # HTML to Markdown Converter (via pypandoc)
410
+ Choose an input method:
411
+ 1. **Convert from URL**: Enter the starting `index.html` URL of an online documentation site. The script will crawl internal links, convert pages to Markdown, and package the results into a ZIP file.
412
+ 2. **Convert from HTML Text**: Paste raw HTML source code directly to convert it into a single Markdown output.
413
+
414
+ **This tool requires `pip install pypandoc_binary` to function correctly.**
415
  """
416
  )
417
 
418
+ # --- Input type selector ---
419
+ input_type_radio = gr.Radio(
420
+ ["Convert from URL", "Convert from HTML Text"],
421
+ label="Input Type",
422
+ value="Convert from URL"
423
+ )
424
+
425
+ # --- URL Mode UI ---
426
+ with gr.Column(visible=True) as url_mode_ui:
427
  url_input = gr.Textbox(
428
  label="Starting Index HTML URL",
429
  placeholder="e.g., https://dghs-imgutils.deepghs.org/main/index.html"
430
  )
431
  restrict_path_checkbox = gr.Checkbox(
432
  label="Restrict crawl to starting path structure (e.g., if start is '/main/index.html', only crawl '/main/...' URLs)",
433
  value=True # Default to restricting path
434
  )
435
+
436
+ # --- HTML Text Mode UI ---
437
+ with gr.Column(visible=False) as text_mode_ui:
438
+ html_text_input = gr.Textbox(
439
+ label="Paste HTML Source Code Here",
440
+ lines=10, # Give it a decent initial size
441
+ placeholder="<html><body><h1>Title</h1><p>This is a paragraph.</p></body></html>"
442
+ )
443
+
444
+ # --- Common Options ---
445
+ with gr.Row():
446
  aggressive_md_checkbox = gr.Checkbox(
447
  label="Aggressive Markdown conversion (disable raw HTML, use ATX headers)",
448
  value=True # Default to aggressive conversion
449
  )
450
+
451
  with gr.Row():
452
  start_button = gr.Button("Start Conversion", variant="primary")
453
 
454
+ # --- URL Mode Outputs ---
455
+ with gr.Column(visible=True) as url_mode_outputs:
456
+ log_output = gr.Textbox(label="Progress Logs", lines=15, interactive=False, show_copy_button=True)
457
+ zip_output = gr.File(label="Download Markdown Archive (ZIP)")
458
+
459
+ # --- HTML Text Mode Outputs ---
460
+ with gr.Column(visible=False) as text_mode_outputs:
461
+ gr.Markdown("---")
462
+ gr.Markdown("### Markdown Conversion Result")
463
+ md_output_display = gr.Markdown(label="Preview") # Preview the result
464
+ md_output_file = gr.File(label="Download Markdown File (.md)") # Download the single file
465
+
466
+ # --- UI Logic to switch between modes ---
467
+ def update_ui_visibility(input_type):
468
+ is_url_mode = (input_type == "Convert from URL")
469
+ return {
470
+ url_mode_ui: gr.update(visible=is_url_mode),
471
+ text_mode_ui: gr.update(visible=not is_url_mode),
472
+ url_mode_outputs: gr.update(visible=is_url_mode),
473
+ text_mode_outputs: gr.update(visible=not is_url_mode),
474
+ }
475
+
476
+ input_type_radio.change(
477
+ fn=update_ui_visibility,
478
+ inputs=input_type_radio,
479
+ outputs=[url_mode_ui, text_mode_ui, url_mode_outputs, text_mode_outputs]
480
+ )
481
 
482
+ # --- Button click event wiring ---
483
  start_button.click(
484
  fn=process_conversion_request,
485
+ inputs=[
486
+ input_type_radio,
487
+ url_input,
488
+ html_text_input,
489
+ restrict_path_checkbox,
490
+ aggressive_md_checkbox
491
+ ],
492
+ # The function now needs to update all possible outputs
493
+ outputs=[
494
+ log_output,
495
+ zip_output,
496
+ md_output_display,
497
+ md_output_file
498
+ ],
499
  show_progress="full"
500
  )
501
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
- gradio==5.29.0
+ gradio==5.34.2
  requests
  beautifulsoup4
  lxml