oracle-demo / html_to_markdown.py
sasan's picture
Upload folder using huggingface_hub
617df14 verified
raw
history blame
3.07 kB
import argparse
import os
from pathlib import Path
import trafilatura
def convert_html_to_markdown(input_dir: str, output_dir: str):
    """
    Convert all HTML files in the input directory (including subdirectories)
    to Markdown files while preserving the directory structure.

    Every directory found under ``input_dir`` is mirrored under ``output_dir``
    (even if it contains no HTML). Files whose name ends in ``.html`` or
    ``.htm`` (case-insensitive) are read as UTF-8, passed to
    ``trafilatura.extract`` with ``output_format="markdown"``, and written next
    to their mirrored location with a ``.md`` extension. Each written file is
    prefixed with an HTML comment recording the source path relative to
    ``input_dir`` (always with forward slashes).

    Processing is best-effort: a file that cannot be read, converted or written
    is reported on stdout and skipped; the remaining files are still processed.

    Args:
        input_dir (str): Path to the input directory containing HTML files
        output_dir (str): Path to the output directory where Markdown files
            will be saved. It may live inside ``input_dir``; in that case it
            is excluded from the walk so it is never mirrored into itself.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)

    # Resolved once so the walk can recognise the output tree when it is
    # nested inside the input tree.
    resolved_output = output_path.resolve()

    for root, dirs, files in os.walk(input_path):
        root_path = Path(root)

        # Prune in place: os.walk (top-down) only descends into the names that
        # remain in ``dirs``. Without this, an output directory located inside
        # the input directory would be walked and mirrored into itself.
        dirs[:] = [
            name for name in dirs
            if (root_path / name).resolve() != resolved_output
        ]

        # Mirror the current directory under the output root
        rel_path = root_path.relative_to(input_path)
        current_output_dir = output_path / rel_path
        current_output_dir.mkdir(parents=True, exist_ok=True)

        for file in files:
            # Case-insensitive so that e.g. "INDEX.HTML" is not silently skipped
            if not file.lower().endswith((".html", ".htm")):
                continue

            input_file = root_path / file
            output_file = current_output_dir / f"{file.rsplit('.', 1)[0]}.md"

            # Broad catch is deliberate: one bad file (unreadable, not UTF-8,
            # extractor failure) must not abort the whole batch.
            try:
                with open(input_file, "r", encoding="utf-8") as f:
                    html_content = f.read()

                markdown_content = trafilatura.extract(
                    html_content, output_format="markdown"
                )
                if not markdown_content:
                    print(f"Warning: Could not extract content from {input_file}")
                    continue

                # as_posix() yields forward slashes on every platform without
                # touching literal backslashes in POSIX file names.
                relative_path = (rel_path / file).as_posix()
                path_header = f"<!-- Original URL path: {relative_path} -->\n\n"

                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(path_header + markdown_content)
                print(f"Converted: {input_file} -> {output_file}")
            except Exception as e:
                print(f"Error processing {input_file}: {e}")
def main():
    """Command-line entry point.

    Reads two positional arguments, ``input_dir`` and ``output_dir``, from the
    command line and hands them to ``convert_html_to_markdown``.
    """
    cli = argparse.ArgumentParser(
        description="Convert HTML files to Markdown using trafilatura"
    )
    # Positional arguments, registered in the order they are expected
    positional_args = (
        ("input_dir", "Input directory containing HTML files"),
        ("output_dir", "Output directory for Markdown files"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)

    options = cli.parse_args()
    convert_html_to_markdown(options.input_dir, options.output_dir)


# Run the CLI only when executed as a script, not when imported
if __name__ == "__main__":
    main()