# Spaces: Sleeping  (hosting-page status text captured along with the file; not part of the script)
import argparse | |
import os | |
from pathlib import Path | |
import trafilatura | |
def _convert_single_file(input_file: Path, output_file: Path, source_label: str) -> None:
    """Convert one HTML file to Markdown, reporting (not raising) any failure.

    Args:
        input_file: HTML file to read (decoded as UTF-8).
        output_file: Markdown file to write (encoded as UTF-8).
        source_label: Path text embedded in the HTML comment placed at the
            top of the generated Markdown.
    """
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            html_content = f.read()

        markdown_content = trafilatura.extract(html_content, output_format="markdown")
        if not markdown_content:
            # A falsy result means there is nothing to write; no output file is created.
            print(f"Warning: Could not extract content from {input_file}")
            return

        # Record where the content came from at the top of the Markdown file.
        path_header = f"<!-- Original URL path: {source_label} -->\n\n"
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(path_header + markdown_content)
        print(f"Converted: {input_file} -> {output_file}")
    except Exception as e:
        # Deliberately broad: one unreadable or malformed file must not abort
        # the rest of the batch, so the failure is reported and skipped.
        print(f"Error processing {input_file}: {str(e)}")


def convert_html_to_markdown(input_dir: str, output_dir: str) -> None:
    """
    Convert all HTML files in the input directory (including subdirectories) to Markdown files
    while preserving the directory structure.

    Every directory found under ``input_dir`` is mirrored under ``output_dir``,
    whether or not it contains HTML files. Files whose names end in ``.html``
    or ``.htm`` (matched case-insensitively) are converted; each result is
    written next to its mirrored location with the extension replaced by
    ``.md``. Per-file failures are printed and skipped rather than raised.

    If ``output_dir`` lies inside ``input_dir``, the output directory itself
    is excluded from the walk so it is not mirrored into itself.

    Args:
        input_dir (str): Path to the input directory containing HTML files
        output_dir (str): Path to the output directory where Markdown files will be saved
    """
    html_suffixes = (".html", ".htm")

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    # Resolved once so the walk can recognise the output directory regardless
    # of how the two paths were spelled (relative, "..", symlinks).
    resolved_output = output_path.resolve()

    for root, dirs, files in os.walk(input_path):
        root_path = Path(root)

        # os.walk honours in-place edits of ``dirs`` in top-down mode; dropping
        # the output directory here stops it being walked and re-mirrored
        # (which would otherwise create out/out, out/out/out, ... on each run).
        dirs[:] = [
            name for name in dirs if (root_path / name).resolve() != resolved_output
        ]

        # Mirror the current directory under the output root.
        rel_path = root_path.relative_to(input_path)
        current_output_dir = output_path / rel_path
        current_output_dir.mkdir(parents=True, exist_ok=True)

        for file in files:
            # Lower-case before matching so ".HTML" / ".Htm" are not skipped.
            if not file.lower().endswith(html_suffixes):
                continue

            input_file = root_path / file
            output_file = current_output_dir / f"{file.rsplit('.', 1)[0]}.md"
            # Forward slashes regardless of platform, for a stable header.
            relative_path = (rel_path / file).as_posix()
            _convert_single_file(input_file, output_file, relative_path)
def main():
    """Command-line entry point: read the two directory arguments and convert.

    Expects two positional arguments, ``input_dir`` and ``output_dir``, and
    passes them to ``convert_html_to_markdown``.
    """
    cli = argparse.ArgumentParser(
        description="Convert HTML files to Markdown using trafilatura"
    )
    # Both arguments are required positionals, declared in call order.
    positional_args = (
        ("input_dir", "Input directory containing HTML files"),
        ("output_dir", "Output directory for Markdown files"),
    )
    for arg_name, help_text in positional_args:
        cli.add_argument(arg_name, help=help_text)

    options = cli.parse_args()
    convert_html_to_markdown(options.input_dir, options.output_dir)
# Run the command-line interface only when this file is executed as a script,
# so importing the module does not trigger argument parsing.
if __name__ == "__main__":
    main()