File size: 5,072 Bytes
6c9ac8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# Modified from:
# https://github.com/allenai/allennlp/blob/main/scripts/check_links.py

import argparse
import logging
import os
import pathlib
import re
import sys
from multiprocessing.dummy import Pool
from typing import NamedTuple, Optional, Tuple

import requests
from mmengine.logging import MMLogger


def parse_args(argv: Optional[list] = None) -> argparse.Namespace:
    """Parse the command-line options of the link checker.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script
            entry point relies on.

    Returns:
        Namespace with ``num_threads``, ``https_proxy`` and ``out``.
    """
    parser = argparse.ArgumentParser(
        description='Goes through all the inline-links '
        'in markdown files and reports the breakages')
    # The pool used by this script is imported from
    # ``multiprocessing.dummy``, so the workers are threads.
    parser.add_argument(
        '--num-threads',
        type=int,
        default=100,
        help='Number of threads used to check the links')
    parser.add_argument('--https-proxy', type=str, help='https proxy')
    parser.add_argument(
        '--out',
        type=str,
        default='link_reports.txt',
        help='output path of reports')
    return parser.parse_args(argv)


# HTTP status codes for which a link is still reported as reachable.
OK_STATUS_CODES = (
    200,
    # 401 / 403: the resource exists but may require some sort of login.
    401,
    403,
    # 405: HEAD method not allowed.
    405,
    # 406: the resource exists, but our default 'Accept-' header may not
    # match what the server can provide.
    406,
)


class MatchTuple(NamedTuple):
    """One inline markdown link together with the file it was found in."""
    # Path of the markdown file that contains the link.
    source: str
    # Link text, i.e. the part captured between the square brackets.
    name: str
    # Link target, i.e. the part captured between the parentheses.
    link: str


def check_link(
    match_tuple: MatchTuple,
    http_session: requests.Session,
    logger: Optional[logging.Logger] = None
) -> Tuple[MatchTuple, bool, Optional[str]]:
    """Check one link and report the outcome.

    Links that start with ``http://`` or ``https://`` are checked over the
    network with ``check_url``; everything else is treated as a path and
    checked with ``check_path``.

    Args:
        match_tuple: The link to check.
        http_session: Session used for the HTTP request.
        logger: Where the per-link result line is written. When ``None``
            the line is printed to stdout instead.

    Returns:
        ``(match_tuple, ok, reason)``. ``reason`` is ``None`` for path
        links and the string returned by ``check_url`` for URLs.
    """
    reason: Optional[str] = None
    # Match the full scheme prefix: a bare ``startswith('http')`` would
    # send relative paths such as ``httpd.md`` to the HTTP client.
    if match_tuple.link.startswith(('http://', 'https://')):
        result_ok, reason = check_url(match_tuple, http_session)
    else:
        result_ok = check_path(match_tuple)

    status_line = f"  {'✓' if result_ok else '✗'} {match_tuple.link}"
    if logger is None:
        print(status_line)
    else:
        logger.info(status_line)
    return match_tuple, result_ok, reason


def check_url(match_tuple: MatchTuple,
              http_session: requests.Session) -> Tuple[bool, str]:
    """Check if a URL is reachable.

    A HEAD request is sent with a 5 second timeout, following redirects.

    Args:
        match_tuple: The link to check; only ``link`` is used.
        http_session: Session used to send the request.

    Returns:
        ``(ok, reason)``. ``ok`` is true when the response is successful
        or its status code is listed in ``OK_STATUS_CODES``. ``reason``
        carries the status code, or a short description of the failure
        when no response was obtained.
    """
    try:
        result = http_session.head(
            match_tuple.link, timeout=5, allow_redirects=True)
    except (requests.ConnectionError, requests.Timeout):
        return False, 'connection error'
    except requests.RequestException as err:
        # Any other request failure (too many redirects, malformed URL,
        # ...) marks this one link as broken instead of propagating out
        # of the worker and aborting the whole run.
        return False, f'request error: {type(err).__name__}'
    return (
        result.ok or result.status_code in OK_STATUS_CODES,
        f'status code = {result.status_code}',
    )


def check_path(match_tuple: MatchTuple) -> bool:
    """Check if a file in this repository exists.

    The link is resolved relative to the directory of the markdown file
    it came from, after dropping any ``#fragment``. A link that is only a
    fragment therefore resolves to that directory itself.

    Args:
        match_tuple: The link to check; ``source`` and ``link`` are used.

    Returns:
        True when the target exists, either under the name exactly as
        written or under its percent-decoded form (``my%20file.md`` ->
        ``my file.md``).
    """
    # Imported locally so the block does not depend on a file-level import.
    from urllib.parse import unquote

    relative_path = match_tuple.link.split('#')[0]
    base_dir = os.path.dirname(str(match_tuple.source))
    if os.path.exists(os.path.join(base_dir, relative_path)):
        return True
    # Markdown links percent-encode characters such as spaces; retry with
    # the decoded name before declaring the link broken.
    decoded_path = unquote(relative_path)
    return (decoded_path != relative_path
            and os.path.exists(os.path.join(base_dir, decoded_path)))


def _collect_links(project_root: pathlib.Path) -> set:
    """Return every inline markdown link found below ``project_root``."""
    url_regex = re.compile(r'\[([^!][^\]]+)\]\(([^)(]+)\)')
    all_matches = set()
    for markdown_file in project_root.glob('**/*.md'):
        # Read as UTF-8 explicitly so the result does not depend on the
        # locale of the machine running the check.
        with open(markdown_file, encoding='utf-8') as handle:
            for line in handle:
                for name, link in url_regex.findall(line):
                    # Links containing 'localhost' are skipped.
                    if 'localhost' not in link:
                        all_matches.add(
                            MatchTuple(
                                source=str(markdown_file),
                                name=name,
                                link=link))
    return all_matches


def main():
    """Check every inline link in the project's markdown files.

    The project root is the parent of the directory containing this
    script. Progress and the final report go to an ``MMLogger`` that also
    writes to the ``--out`` file. Exits with status 1 when at least one
    link is unreachable.
    """
    args = parse_args()

    # setup logger
    logger = MMLogger.get_instance(name='mmdet', log_file=args.out)

    # setup https_proxy
    if args.https_proxy:
        os.environ['https_proxy'] = args.https_proxy

    project_root = (pathlib.Path(__file__).parent / '..').resolve()
    logger.info(f'Finding all markdown files under {project_root}...')
    all_matches = _collect_links(project_root)

    logger.info(f'  {len(all_matches)} links found')
    logger.info('Checking to make sure we can retrieve each link...')

    # The session is closed once all checks are done.
    with requests.Session() as http_session:
        for resource_prefix in ('http://', 'https://'):
            http_session.mount(
                resource_prefix,
                requests.adapters.HTTPAdapter(
                    max_retries=5,
                    pool_connections=20,
                    pool_maxsize=args.num_threads),
            )

        # Sorting gives the report a stable order between runs.
        with Pool(processes=args.num_threads) as pool:
            results = pool.starmap(check_link,
                                   [(match, http_session, logger)
                                    for match in sorted(all_matches)])

    # collect unreachable results
    unreachable_results = [(match_tuple, reason)
                           for match_tuple, success, reason in results
                           if not success]

    if unreachable_results:
        logger.info('================================================')
        logger.info(f'Unreachable links ({len(unreachable_results)}):')
        for match_tuple, reason in unreachable_results:
            logger.info('  > Source: ' + match_tuple.source)
            logger.info('    Name: ' + match_tuple.name)
            logger.info('    Link: ' + match_tuple.link)
            if reason is not None:
                logger.info('    Reason: ' + reason)
        sys.exit(1)
    logger.info('No Unreachable link found.')


# Run the link check only when this file is executed as a script.
if __name__ == '__main__':
    main()