#!/usr/bin/python3

"""
Inspect a set of markdown files, and warn if there are:
- duplicate links
- malformed links
"""

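# Example invocation (the script filename here is hypothetical; the default
# argument values are defined in main() below):
#
#   python3 check_links.py --paths content:draft --num-recent 25 --num-warn 1
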
import argparse
import bs4
import logging
import markdown
import os
import re
import sys
import urllib.parse

LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)


class Warnings:
    """ A singleton object for gathering warnings to be printed later. """

    def __init__(self):
        self.warnings = []
        self.silent = False

    def silence(self, val):
        self.silent = val

    def warn(self, msg):
        if not self.silent:
            self.warnings.append(msg)

    def get(self):
        return self.warnings


# The singleton object that gathers warnings, for later reporting.
warnings = Warnings()

# A regex that matches filenames to inspect.
RE_FILENAME = re.compile(r'\d\d\d\d-\d\d-\d\d-this-week-in-rust.md')
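# e.g. "2021-01-01-this-week-in-rust.md" matches; "README.md" does not.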

# A block-list of tracking parameters
TRACKING_PARAMETERS = set([
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_content',
])

# A list of section titles that will trigger duplicate-link detection.
STRICT_TITLES = [
    'updates from rust community',
]


def is_strict_title(title):
    """ Return True if this title is one that needs strict checks. """
    title = str(title)
    # .lower() doesn't necessarily handle unicode in a robust way, but the
    # set of strings we care about is tiny and uses only ascii.
    return title.lower() in STRICT_TITLES


def extract_links(html):
    """ Return a list of links from this file.

    Links will only be returned if they are within a section deemed "strict".
    This allows us to ignore links that are deliberately repeated (to this
    github repo and twitter account, for example).

    Side-effects:
    - If links are malformed, warnings may be recorded. See `parse_url`
      for details.
    """
    strict_mode = False
    tags = ['a', 'h1', 'h2', 'h3', 'h4']
    urls = []

    # Remember the header level (h2, h3, etc) when we turned on
    # strict_mode.
    header_level = None

    for tag in bs4.BeautifulSoup(html, 'html.parser').find_all(tags):
        if tag.name == 'a':
            link = tag.get('href')
            LOG.debug(f'found link tag: {link}')
            if strict_mode:
                trimmed_url = parse_url(link)
                urls.append(trimmed_url)
        else:
            level = tag.name
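            # Note: header tag names compare lexically ('h2' < 'h3'), which
            # is sufficient for the single-digit h1-h4 levels searched here.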
            if header_level and level > header_level:
                LOG.debug(f'skipping {tag}, overridden at {header_level}')
                continue

            # This is the title of a section. If this title is "strict",
            # we will check for any duplicate links inside it.
            strict_mode = is_strict_title(tag.string)
            if strict_mode:
                header_level = level
            else:
                header_level = None
            LOG.debug(f'found heading tag: {tag} (strict={strict_mode})')

    return urls


def scrub_parameters(url, query):
    """ Strip tracking parameters from the URL """
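    # For example, a query of "utm_source=x&page=2" comes back as "page=2".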
    query_dict = urllib.parse.parse_qs(query)

    filtered_dict = {}
    found_tracking = []
    for k, v in query_dict.items():
        if k in TRACKING_PARAMETERS:
            found_tracking.append(k)
        else:
            filtered_dict[k] = v

    # Store a warning if we found any tracking parameters.
    if found_tracking:
        warnings.warn(f'found tracking parameters on {url}: {found_tracking}')

    # If there are no query parameters left, return the empty string.
    if not filtered_dict:
        return ''

    # Re-encode the remaining URL parameters.
    return urllib.parse.urlencode(filtered_dict, doseq=True)


def parse_url(link):
    """ Parse a URL and return it in a stripped-down form.

    This strips known tracking query parameters (see TRACKING_PARAMETERS)
    in an effort to better detect duplicate URLs. Other query parameters
    and the fragment are preserved, so that legitimate URLs keep working.

    Side-effects:
    - If a link does not have a recognized protocol, we will
      record a warning.
    """
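    # e.g. "https://example.com/post?utm_medium=feed&id=7#heading" would be
    # returned as "https://example.com/post?id=7#heading", with a warning
    # that the link can be simplified.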
    parsed_url = urllib.parse.urlsplit(link)
    if parsed_url.scheme not in ('mailto', 'http', 'https'):
        warnings.warn(f'possibly malformed link: {link}')

    # If there are query parameters present, give them a cleanup pass to
    # remove irrelevant ones.
    query = parsed_url.query
    if query:
        LOG.debug(f'{parsed_url.geturl()} found query parameters: {query}')
        query = scrub_parameters(link, query)
        if query:
            LOG.debug(
                f'{parsed_url.geturl()} keeping query parameters: {query}')

    # Re-constitute the URL with the reduced set of query parameters.
    (sch, loc, path, _, frag) = parsed_url
    reconstituted = urllib.parse.urlunsplit((sch, loc, path, query, frag))
    if reconstituted != link:
        LOG.debug(f'reconstituted: {reconstituted}')
        warnings.warn(f'link can be simplified: {link} -> {reconstituted}')
    return reconstituted


def inspect_file(filename):
    LOG.info(f'inspecting file {filename}')
    with open(filename) as f:
        md_text = f.read()
    html = markdown.markdown(md_text)
    links = extract_links(html)
    LOG.debug(f'examining {len(links)} links')
    return links


def get_recent_files(dirs, count):
    """ Return a list of the `count` most recent markdown files in `dirs`.

    We assume the files are named "YYYY-MM-DD-this-week-in-rust.md".
    """
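    # `dirs` is colon-separated, e.g. "content:draft" searches both the
    # content/ and draft/ directories (matching the default in main()).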
    LOG.debug(f'searching for {count} recent files in "{dirs}"')

    listing = []
    for dir in dirs.split(':'):
        files = os.listdir(path=dir)
        if not files:
            raise Exception(f'No files found in {dir}')
        files = list(filter(RE_FILENAME.match, files))
        if not files:
            raise Exception(f'No matching files found in {dir}')

        # create a tuple (file, file+path) so we can sort by filename
        file_tuples = [(f, os.path.join(dir, f)) for f in files]
        listing.extend(file_tuples)

    listing.sort()
    listing = listing[-count:]

    # return the file+path.
    listing = [tup[1] for tup in listing]

    LOG.info(f'recent files: {listing}')
    return listing


def inspect_files(file_list, num_warn):
    """ Inspect a set of files, storing warnings about duplicate links. """
    linkset = {}

    # If we inspect 5 files (enumerated 0-4) and want to warn about the
    # 2 newest, then warnings start at index 3 (length - num_warn).
    warn_index = len(file_list) - num_warn

    for index, file in enumerate(file_list):
        warnings.silence(index < warn_index)
        links = inspect_file(file)
        LOG.debug(f'found links: {links}')
        for link in links:
            collision = linkset.get(link)
            if collision:
                warnings.warn(
                    f"possible duplicate link {link} in file {file} "
                    f"(also found in {collision})")
            else:
                linkset[link] = file


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--paths', default='content:draft',
                        help="Directory paths to inspect (colon separated)")
    parser.add_argument('--num-recent', default=25, type=int,
                        help="Number of recent files to inspect")
    parser.add_argument('--num-warn', default=1, type=int,
                        help="Number of recent files to warn about")
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args()
    if args.debug:
        LOG.setLevel(logging.DEBUG)
    LOG.debug(f'command-line arguments: {args}')
    file_list = get_recent_files(args.paths, args.num_recent)
    inspect_files(file_list, args.num_warn)


def setup_logging():
    log_stderr = logging.StreamHandler()
    logging.getLogger('').addHandler(log_stderr)


if __name__ == "__main__":
    setup_logging()
    main()

    warns = warnings.get()
    if warns:
        print("warnings exist:")
        for w in warns:
            print(w)
        sys.exit(1)
    else:
        print("everything is ok!")
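
# Example output when problems are found (illustrative; the exact messages
# depend on the files being inspected):
#
#   warnings exist:
#   found tracking parameters on https://example.com/post?utm_source=x: ['utm_source']
#   link can be simplified: https://example.com/post?utm_source=x -> https://example.com/post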