File size: 3,073 Bytes
617df14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import argparse
import os
from pathlib import Path

import trafilatura


def convert_html_to_markdown(input_dir: str, output_dir: str) -> None:
    """
    Convert all HTML files in the input directory (including subdirectories) to Markdown files
    while preserving the directory structure.

    Every directory found under ``input_dir`` is mirrored under ``output_dir``
    (even when it contains no HTML files). Each ``*.html`` / ``*.htm`` file
    (extension matched case-insensitively) is passed to ``trafilatura.extract``
    and written as ``<name>.md`` with an HTML comment on the first line
    recording the file's path relative to ``input_dir``.

    Failures are reported per file on stdout and never abort the run: a file
    for which trafilatura returns no content produces a warning, and any
    exception raised while reading, converting or writing a file produces an
    error message.

    If ``output_dir`` is located inside ``input_dir`` it is excluded from the
    walk, so the output tree is not mirrored into itself.

    Args:
        input_dir (str): Path to the input directory containing HTML files
        output_dir (str): Path to the output directory where Markdown files will be saved
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)

    # Resolved once so the output directory can be recognised during the walk
    # regardless of how the two paths were spelled by the caller.
    output_resolved = output_path.resolve()

    for root, dirs, files in os.walk(input_path):
        root_path = Path(root)

        # Prune in place (os.walk honours edits to `dirs` in top-down mode):
        # without this, an output directory nested in the input tree would be
        # walked too and mirrored into itself (out/out/...).
        dirs[:] = [d for d in dirs if (root_path / d).resolve() != output_resolved]

        # Mirror the current directory under the output root
        rel_path = root_path.relative_to(input_path)
        current_output_dir = output_path / rel_path
        current_output_dir.mkdir(parents=True, exist_ok=True)

        for file in files:
            # Compare case-insensitively so "INDEX.HTML" is not silently skipped
            if not file.lower().endswith((".html", ".htm")):
                continue

            input_file = root_path / file
            output_file = current_output_dir / f"{file.rsplit('.', 1)[0]}.md"

            try:
                with open(input_file, "r", encoding="utf-8") as f:
                    html_content = f.read()

                markdown_content = trafilatura.extract(
                    html_content, output_format="markdown"
                )

                if not markdown_content:
                    print(f"Warning: Could not extract content from {input_file}")
                    continue

                # Forward-slash form of the source path, independent of the OS
                relative_path = (rel_path / file).as_posix()

                # Add the path information at the top of the markdown content
                path_header = f"<!-- Original URL path: {relative_path} -->\n\n"

                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(path_header + markdown_content)
                print(f"Converted: {input_file} -> {output_file}")

            except Exception as e:
                # Deliberately broad: this is the per-file boundary of a batch
                # job, so one unreadable or malformed file must not stop the rest.
                print(f"Error processing {input_file}: {str(e)}")


def main():
    """Parse the command line and run the HTML-to-Markdown conversion.

    Expects two positional arguments, ``input_dir`` and ``output_dir``.
    If ``input_dir`` is not an existing directory the program exits through
    ``parser.error`` (usage message on stderr, exit status 2) before anything
    is created on disk.
    """
    parser = argparse.ArgumentParser(
        description="Convert HTML files to Markdown using trafilatura"
    )
    parser.add_argument("input_dir", help="Input directory containing HTML files")
    parser.add_argument("output_dir", help="Output directory for Markdown files")

    args = parser.parse_args()

    # The conversion walks the tree with os.walk, which yields nothing for a
    # missing path; without this check a mistyped input directory would look
    # like a successful run that simply produced an empty output directory.
    if not Path(args.input_dir).is_dir():
        parser.error(
            f"input directory does not exist or is not a directory: {args.input_dir}"
        )

    convert_html_to_markdown(args.input_dir, args.output_dir)


# Run the command-line interface only when this file is executed as a script,
# so importing the module does not trigger argument parsing.
if __name__ == "__main__":
    main()