|
|
|
""" |
|
HTML to Markdown Converter |
|
|
|
This script converts HTML files to Markdown format. |
|
Usage: python html_to_md.py input.html [output.md] |
|
If no output file is specified, it will use the input filename with .md extension. |
|
""" |
|
|
|
import sys |
|
import os |
|
import argparse |
|
import html2text |
|
import requests |
|
from urllib.parse import urlparse |
|
|
|
def is_url(path): |
|
"""Check if the given path is a URL.""" |
|
parsed = urlparse(path) |
|
return parsed.scheme != '' and parsed.netloc != '' |
|
|
|
def convert_html_to_markdown(html_content, **options): |
|
"""Convert HTML content to Markdown.""" |
|
converter = html2text.HTML2Text() |
|
|
|
|
|
converter.ignore_links = options.get('ignore_links', False) |
|
converter.ignore_images = options.get('ignore_images', False) |
|
converter.ignore_tables = options.get('ignore_tables', False) |
|
converter.body_width = options.get('body_width', 0) |
|
converter.unicode_snob = options.get('unicode_snob', True) |
|
converter.wrap_links = options.get('wrap_links', False) |
|
converter.inline_links = options.get('inline_links', True) |
|
|
|
|
|
return converter.handle(html_content) |
|
|
|
def main(): |
|
parser = argparse.ArgumentParser(description='Convert HTML to Markdown') |
|
parser.add_argument('input', help='Input HTML file or URL') |
|
parser.add_argument('output', nargs='?', help='Output Markdown file (optional)') |
|
parser.add_argument('--ignore-links', action='store_true', help='Ignore links in the HTML') |
|
parser.add_argument('--ignore-images', action='store_true', help='Ignore images in the HTML') |
|
parser.add_argument('--ignore-tables', action='store_true', help='Ignore tables in the HTML') |
|
parser.add_argument('--body-width', type=int, default=0, help='Wrap text at this width (0 for no wrapping)') |
|
parser.add_argument('--unicode', action='store_true', help='Use Unicode characters instead of ASCII approximations') |
|
parser.add_argument('--wrap-links', action='store_true', help='Wrap links in angle brackets') |
|
parser.add_argument('--reference-links', action='store_true', help='Use reference style links instead of inline links') |
|
|
|
args = parser.parse_args() |
|
|
|
|
|
if is_url(args.input): |
|
try: |
|
response = requests.get(args.input) |
|
response.raise_for_status() |
|
html_content = response.text |
|
except requests.exceptions.RequestException as e: |
|
print(f"Error fetching URL: {e}", file=sys.stderr) |
|
return 1 |
|
else: |
|
try: |
|
with open(args.input, 'r', encoding='utf-8') as f: |
|
html_content = f.read() |
|
except IOError as e: |
|
print(f"Error reading file: {e}", file=sys.stderr) |
|
return 1 |
|
|
|
|
|
options = { |
|
'ignore_links': args.ignore_links, |
|
'ignore_images': args.ignore_images, |
|
'ignore_tables': args.ignore_tables, |
|
'body_width': args.body_width, |
|
'unicode_snob': args.unicode, |
|
'wrap_links': args.wrap_links, |
|
'inline_links': not args.reference_links, |
|
} |
|
|
|
|
|
markdown_content = convert_html_to_markdown(html_content, **options) |
|
|
|
|
|
if args.output: |
|
output_file = args.output |
|
else: |
|
if is_url(args.input): |
|
|
|
url_parts = urlparse(args.input) |
|
base_name = os.path.basename(url_parts.path) or 'index' |
|
if not base_name.endswith('.html'): |
|
base_name += '.html' |
|
output_file = os.path.splitext(base_name)[0] + '.md' |
|
else: |
|
|
|
output_file = os.path.splitext(args.input)[0] + '.md' |
|
|
|
|
|
try: |
|
with open(output_file, 'w', encoding='utf-8') as f: |
|
f.write(markdown_content) |
|
print(f"Conversion successful! Output saved to: {output_file}") |
|
except IOError as e: |
|
print(f"Error writing file: {e}", file=sys.stderr) |
|
return 1 |
|
|
|
return 0 |
|
|
|
if __name__ == "__main__": |
|
sys.exit(main()) |