
Commit 033542e

Merge pull request #2379 from ericseppanen/inspect_links
add a script to inspect links and report problems
2 parents 9c4893c + 73d371e commit 033542e

File tree

2 files changed: +266 -0

tools/inspect_links.py

Lines changed: 264 additions & 0 deletions
@@ -0,0 +1,264 @@
#!/usr/bin/python3

"""
Inspect a set of markdown files, and warn if there are:
- duplicate links
- malformed links
"""

import argparse
import bs4
import logging
import markdown
import os
import re
import sys
import urllib.parse

LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)


class Warnings:
    """ A singleton object for gathering warnings to be printed later. """

    def __init__(self):
        self.warnings = []
        self.silent = False

    def silence(self, val):
        self.silent = val

    def warn(self, msg):
        if not self.silent:
            self.warnings.append(msg)

    def get(self):
        return self.warnings


# The singleton object that gathers warnings, for later reporting.
warnings = Warnings()

# A regex that matches filenames to inspect.
RE_FILENAME = re.compile(r'\d\d\d\d-\d\d-\d\d-this-week-in-rust.md')

# A block-list of tracking parameters
TRACKING_PARAMETERS = set([
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_content',
])

# A list of section titles that will trigger duplicate-link detection.
STRICT_TITLES = [
    'updates from rust community',
]


def is_strict_title(title):
    """ Return True if this title is one that needs strict checks. """
    title = str(title)
    # .lower() doesn't necessarily handle unicode in a robust way,
    # but the set of strings we care about is tiny and uses only ascii.
    return title.lower() in STRICT_TITLES


def extract_links(html):
    """ Return a list of links from this file.

    Links will only be returned if they are within a section deemed "strict".
    This allows us to ignore links that are deliberately repeated (to this
    github repo and twitter account, for example).

    Side-effects:
    - If links are malformed, warnings may be recorded. See `parse_url`
      for details.

    """
    strict_mode = False
    tags = ['a', 'h1', 'h2', 'h3', 'h4']
    urls = []

    # Remember the header level (h2, h3, etc) when we turned on
    # strict_mode.
    header_level = None

    for tag in bs4.BeautifulSoup(html, 'html.parser').find_all(tags):
        if tag.name == 'a':
            link = tag.get('href')
            LOG.debug(f'found link tag: {link}')
            if strict_mode:
                trimmed_url = parse_url(link)
                urls.append(trimmed_url)
        else:
            level = tag.name
            if header_level and level > header_level:
                LOG.debug(f'skipping {tag}, overridden at {header_level}')
                continue

            # This is the title of a section. If this title is "strict",
            # we will check for any duplicate links inside it.

            strict_mode = is_strict_title(tag.string)
            if strict_mode:
                header_level = level
            else:
                header_level = None
            LOG.debug(f'found heading tag: {tag} (strict={strict_mode})')

    return urls


def scrub_parameters(url, query):
    """ Strip tracking parameters from the URL """
    query_dict = urllib.parse.parse_qs(query)

    filtered_dict = {}
    found_tracking = []
    for k, v in query_dict.items():
        if k in TRACKING_PARAMETERS:
            found_tracking.append(k)
        else:
            filtered_dict[k] = v

    # Store a warning if any tracking parameters were found.
    if found_tracking:
        warnings.warn(f'found tracking parameters on {url}: {found_tracking}')

    # If there are no query parameters left, return the empty string.
    if not filtered_dict:
        return ''

    # Re-encode remaining URL parameters
    return urllib.parse.urlencode(filtered_dict, doseq=True)


def parse_url(link):
    """ Parse a URL and return it in a stripped-down form.

    This will strip tracking query parameters (in an effort to better
    detect duplicate URLs). Parameters not on the TRACKING_PARAMETERS
    block-list are kept, as removing them would break some common URLs.

    Side-effects:
    - If a link does not have a recognized protocol, we will
      record a warning.
    """
    parsed_url = urllib.parse.urlsplit(link)
    if parsed_url.scheme not in ('mailto', 'http', 'https'):
        warnings.warn(f'possibly malformed link: {link}')

    # If there are query parameters present, give them a cleanup pass to remove irrelevant ones.
    query = parsed_url.query
    if query:
        LOG.debug(f'{parsed_url.geturl()} found query parameters: {query}')
        query = scrub_parameters(link, query)
        if query:
            LOG.debug(
                f'{parsed_url.geturl()} keeping query parameters: {query}')

    # Re-constitute the URL with the reduced set of query parameters.
    (sch, loc, path, _, frag) = parsed_url
    reconstituted = urllib.parse.urlunsplit((sch, loc, path, query, frag))
    if reconstituted != link:
        LOG.debug(f'reconstituted: {reconstituted}')
        warnings.warn(f'link can be simplified: {link} -> {reconstituted}')
    return reconstituted


def inspect_file(filename):
    """ Extract the links from one markdown file. """
    LOG.info(f'inspecting file {filename}')
    md_text = open(filename).read()
    html = markdown.markdown(md_text)
    links = extract_links(html)
    LOG.debug(f'examining {len(links)} links')
    return links


def get_recent_files(dirs, count):
    """ Return a list of the `count` most recent markdown files in `dirs`.

    We assume the files are named "YYYY-MM-DD-this-week-in-rust.md".
    """
    LOG.debug(f'searching for {count} recent files in "{dirs}"')

    listing = []
    for dir in dirs.split(':'):
        files = os.listdir(path=dir)
        if not files:
            raise Exception(f'No files found in {dir}')
        files = list(filter(RE_FILENAME.match, files))
        if not files:
            raise Exception(f'No matching files found in {dir}')

        # create a tuple (file, file+path) so we can sort by filename
        file_tuples = [(f, os.path.join(dir, f)) for f in files]
        listing.extend(file_tuples)

    listing.sort()
    listing = listing[-count:]

    # return the file+path.
    listing = [tup[1] for tup in listing]

    LOG.info(f'recent files: {listing}')
    return listing


def inspect_files(file_list, num_warn):
    """ Inspect a set of files, storing warnings about duplicate links. """
    linkset = {}

    # If we inspect 5 files (enumerated 0-4), and want to warn on 2,
    # then the warnings start at N=3 (length - 1 - num_warn).
    warn_index = len(file_list) - 1 - num_warn

    for index, file in enumerate(file_list):
        warnings.silence(index < warn_index)
        links = inspect_file(file)
        LOG.debug(f'found links: {links}')
        for link in links:
            collision = linkset.get(link)
            if collision:
                warnings.warn(
                    f"possible duplicate link {link} in file {file} (also found in {collision})")
            else:
                linkset[link] = file


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--paths', default='content:draft',
                        help="Directory paths to inspect (colon separated)")
    parser.add_argument('--num-recent', default=25, type=int,
                        help="Number of recent files to inspect")
    parser.add_argument('--num-warn', default=1, type=int,
                        help="Number of recent files to warn about")
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args()
    if args.debug:
        LOG.setLevel(logging.DEBUG)
    LOG.debug(f'command-line arguments: {args}')
    file_list = get_recent_files(args.paths, args.num_recent)
    inspect_files(file_list, args.num_warn)


def setup_logging():
    log_stderr = logging.StreamHandler()
    logging.getLogger('').addHandler(log_stderr)


if __name__ == "__main__":
    setup_logging()
    main()

    warns = warnings.get()
    if warns:
        print("warnings exist:")
        for w in warns:
            print(w)
        sys.exit(1)
    else:
        print("everything is ok!")

tools/requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
beautifulsoup4==4.10.0
Markdown==3.3.5
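
With those two dependencies installed, the checker could be invoked from the repository root roughly as sketched below. The flag values simply restate the script's argparse defaults; the wrapper itself is only an illustration, not part of the commit.

    # Hedged sketch of running the checker; a non-zero exit code means
    # the script printed warnings before exiting.
    import subprocess

    result = subprocess.run(
        ['python3', 'tools/inspect_links.py',
         '--paths', 'content:draft',   # colon-separated directories to scan
         '--num-recent', '25',         # how many recent issue files to parse
         '--num-warn', '1'])           # warn only about the newest file
    if result.returncode != 0:
        print('inspect_links reported problems')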

0 commit comments
