import argparse
import os
from pathlib import Path

import trafilatura

# Lower-case file name endings that are treated as HTML input.
_HTML_SUFFIXES = (".html", ".htm")


def _convert_file(input_file: Path, output_file: Path, relative_path: str) -> None:
    """Convert one HTML file to Markdown and report the outcome on stdout.

    Args:
        input_file: HTML file to read (decoded as UTF-8).
        output_file: Markdown file to write (encoded as UTF-8).
        relative_path: Path of the HTML file relative to the input root, using
            forward slashes; recorded in a header at the top of the output.
    """
    # Per-file boundary: a failure on one file (unreadable, not valid UTF-8,
    # extraction error, unwritable target) is reported and must not abort the
    # rest of the batch.
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            html_content = f.read()

        markdown_content = trafilatura.extract(html_content, output_format="markdown")

        # Nothing extractable: warn and write no output file.
        if not markdown_content:
            print(f"Warning: Could not extract content from {input_file}")
            return

        # Record where the content came from. An HTML comment is used so the
        # header does not show up when the Markdown is rendered.
        path_header = f"<!-- Source: {relative_path} -->\n\n"

        with open(output_file, "w", encoding="utf-8") as f:
            f.write(path_header + markdown_content)
        print(f"Converted: {input_file} -> {output_file}")
    except Exception as e:
        print(f"Error processing {input_file}: {e}")


def convert_html_to_markdown(input_dir: str, output_dir: str) -> None:
    """Convert every HTML file under ``input_dir`` to Markdown in ``output_dir``.

    The input tree is walked recursively and its directory structure is
    recreated under the output directory (including directories that contain
    no HTML files). Files whose name ends in ``.html`` or ``.htm`` (matched
    case-insensitively) are converted; ``page.html`` becomes ``page.md``.
    Each written file starts with a comment naming the source file's path
    relative to ``input_dir``.

    Progress, warnings and per-file errors are printed to stdout; a failure
    on one file does not stop the run.

    Args:
        input_dir: Path to the input directory containing HTML files.
        output_dir: Path to the output directory where Markdown files will be
            saved. Created if it does not exist. If it lies inside
            ``input_dir`` it is skipped while walking.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    output_path.mkdir(parents=True, exist_ok=True)
    output_resolved = output_path.resolve()

    for root, dirs, files in os.walk(input_path):
        root_path = Path(root)

        # Prune the output directory from the walk (in-place edit of ``dirs``
        # is how os.walk is told not to descend). Otherwise an output
        # directory nested inside the input would be mirrored into itself,
        # one level deeper on every run.
        dirs[:] = [d for d in dirs if (root_path / d).resolve() != output_resolved]

        # Mirror the current directory under the output root.
        rel_path = root_path.relative_to(input_path)
        current_output_dir = output_path / rel_path
        current_output_dir.mkdir(parents=True, exist_ok=True)

        for file in files:
            if not file.lower().endswith(_HTML_SUFFIXES):
                continue

            # Replace only the final extension: "a.b.html" -> "a.b.md".
            output_file = current_output_dir / f"{file.rsplit('.', 1)[0]}.md"
            relative_path = (rel_path / file).as_posix()
            _convert_file(root_path / file, output_file, relative_path)


def main() -> None:
    """Parse the command-line arguments and run the conversion."""
    parser = argparse.ArgumentParser(
        description="Convert HTML files to Markdown using trafilatura"
    )
    parser.add_argument("input_dir", help="Input directory containing HTML files")
    parser.add_argument("output_dir", help="Output directory for Markdown files")

    args = parser.parse_args()

    # os.walk silently yields nothing for a missing directory, which would
    # look like a successful run; report the mistake instead.
    if not Path(args.input_dir).is_dir():
        parser.error(f"input_dir is not a directory: {args.input_dir}")

    convert_html_to_markdown(args.input_dir, args.output_dir)


if __name__ == "__main__":
    main()