File size: 5,072 Bytes
6c9ac8f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
# Modified from:
# https://github.com/allenai/allennlp/blob/main/scripts/check_links.py
import argparse
import logging
import os
import pathlib
import re
import sys
from multiprocessing.dummy import Pool
from typing import NamedTuple, Optional, Tuple
import requests
from mmengine.logging import MMLogger
def parse_args():
parser = argparse.ArgumentParser(
description='Goes through all the inline-links '
'in markdown files and reports the breakages')
parser.add_argument(
'--num-threads',
type=int,
default=100,
help='Number of processes to confirm the link')
parser.add_argument('--https-proxy', type=str, help='https proxy')
parser.add_argument(
'--out',
type=str,
default='link_reports.txt',
help='output path of reports')
args = parser.parse_args()
return args
OK_STATUS_CODES = (
200,
401, # the resource exists but may require some sort of login.
403, # ^ same
405, # HEAD method not allowed.
# the resource exists, but our default 'Accept-' header may not
# match what the server can provide.
406,
)
class MatchTuple(NamedTuple):
source: str
name: str
link: str
def check_link(
match_tuple: MatchTuple,
http_session: requests.Session,
logger: logging = None) -> Tuple[MatchTuple, bool, Optional[str]]:
reason: Optional[str] = None
if match_tuple.link.startswith('http'):
result_ok, reason = check_url(match_tuple, http_session)
else:
result_ok = check_path(match_tuple)
if logger is None:
print(f" {'✓' if result_ok else '✗'} {match_tuple.link}")
else:
logger.info(f" {'✓' if result_ok else '✗'} {match_tuple.link}")
return match_tuple, result_ok, reason
def check_url(match_tuple: MatchTuple,
http_session: requests.Session) -> Tuple[bool, str]:
"""Check if a URL is reachable."""
try:
result = http_session.head(
match_tuple.link, timeout=5, allow_redirects=True)
return (
result.ok or result.status_code in OK_STATUS_CODES,
f'status code = {result.status_code}',
)
except (requests.ConnectionError, requests.Timeout):
return False, 'connection error'
def check_path(match_tuple: MatchTuple) -> bool:
"""Check if a file in this repository exists."""
relative_path = match_tuple.link.split('#')[0]
full_path = os.path.join(
os.path.dirname(str(match_tuple.source)), relative_path)
return os.path.exists(full_path)
def main():
args = parse_args()
# setup logger
logger = MMLogger.get_instance(name='mmdet', log_file=args.out)
# setup https_proxy
if args.https_proxy:
os.environ['https_proxy'] = args.https_proxy
# setup http_session
http_session = requests.Session()
for resource_prefix in ('http://', 'https://'):
http_session.mount(
resource_prefix,
requests.adapters.HTTPAdapter(
max_retries=5,
pool_connections=20,
pool_maxsize=args.num_threads),
)
logger.info('Finding all markdown files in the current directory...')
project_root = (pathlib.Path(__file__).parent / '..').resolve()
markdown_files = project_root.glob('**/*.md')
all_matches = set()
url_regex = re.compile(r'\[([^!][^\]]+)\]\(([^)(]+)\)')
for markdown_file in markdown_files:
with open(markdown_file) as handle:
for line in handle.readlines():
matches = url_regex.findall(line)
for name, link in matches:
if 'localhost' not in link:
all_matches.add(
MatchTuple(
source=str(markdown_file),
name=name,
link=link))
logger.info(f' {len(all_matches)} markdown files found')
logger.info('Checking to make sure we can retrieve each link...')
with Pool(processes=args.num_threads) as pool:
results = pool.starmap(check_link, [(match, http_session, logger)
for match in list(all_matches)])
# collect unreachable results
unreachable_results = [(match_tuple, reason)
for match_tuple, success, reason in results
if not success]
if unreachable_results:
logger.info('================================================')
logger.info(f'Unreachable links ({len(unreachable_results)}):')
for match_tuple, reason in unreachable_results:
logger.info(' > Source: ' + match_tuple.source)
logger.info(' Name: ' + match_tuple.name)
logger.info(' Link: ' + match_tuple.link)
if reason is not None:
logger.info(' Reason: ' + reason)
sys.exit(1)
logger.info('No Unreachable link found.')
if __name__ == '__main__':
main()
|