forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvalidate_documentation.py
executable file
·306 lines (257 loc) · 9.56 KB
/
validate_documentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#!/usr/bin/env python
"""
Analyze documentation to detect errors.
Requires the documentation to be built before usage.
If no argument is provided, it validates all pages in the source folder for
which a doctree has been generated.
If a page is provided like "index", "whatsnew/v1.0.0" a list of all errors
in that page is printed.
Usage::
$ ./validate_documentation.py
$ ./validate_documentation.py index
"""
import argparse
import docutils.nodes
from fnmatch import fnmatch
import json
import os
import pickle
import re
import sys
# Locations of the Sphinx build artefacts and rst sources, relative to this
# script. DOCUMENTATION_SOURCE deliberately ends with a path separator (the
# trailing '' component) so that it can be used as a prefix of walked paths.
BASE_DIR = os.path.join(os.path.dirname(__file__), '..', 'doc')
DOCUMENTATION_SOURCE = os.path.join(BASE_DIR, 'build', 'doctrees', '')
DOCTREE_PATH = os.path.join(DOCUMENTATION_SOURCE, '{}.doctree')
RST_PATH = os.path.join(BASE_DIR, 'source', '{}.rst')

# Human readable message for every error code that a check can report.
ERROR_MSGS = {
    'DT01': "Found bullet list within block quote. Use 0 spaces for top-level "
            "list and 2 spaces for sub-lists",
    'WS01': "Indentation uses tabulation",
    'WS02': "Trailing whitespaces",
    'WS03': "Whitespace in empty line",
}

# Leading indentation of a non-blank line. Only tabs and spaces are accepted
# (not ``\s``) so that the match can never start on a preceding blank line
# and spill over the newline, which would make the reported row wrong.
STARTING_WHITESPACE_RE = re.compile(r'^([\t ]+)\S', re.MULTILINE)
# Whitespace that follows real content at the end of a line. Requiring a
# non-whitespace character first keeps whitespace-only lines out of this
# check; those are matched by EMPTY_LINE_WHITESPACE_RE instead.
TRAILING_WHITESPACE_RE = re.compile(r'\S([\t ]+)$', re.MULTILINE)
# Lines that consist of nothing but tabs and/or spaces.
EMPTY_LINE_WHITESPACE_RE = re.compile(r'^([\t ]+)$', re.MULTILINE)
class DocumentChecker(object):
    """
    Checker to validate one page in the documentation.

    Attributes
    ----------
    page : str
        Path to page relative to documentation's source folder without file
        extension. (e.g. io)
    doctree : docutils.nodes.document
        Generated doctree for the associated page.
    raw_lines : list of str
        Lines from rst-file for associated page.
    raw_doc : str
        Joined lines from rst-file for associated page.
    errs : dict or None
        Mapping of error code to a list of ``(line, kwargs)`` tuples.
        ``None`` until `validate` (or `error`) has been called.

    Notes
    -----
    Add a method starting with `check` to add additional checks.
    """

    def __init__(self, page, raw_lines, doctree):
        self.page = page
        self.doctree = doctree
        self.raw_lines = raw_lines
        self.raw_doc = ''.join(raw_lines)
        self.errs = None

    def error(self, code, line=None, **kwargs):
        """
        Record one occurrence of an error.

        Parameters
        ----------
        code : str
            Error code.
        line : Tuple[int, int], optional
            Zero-based ``(row_start, row_end)`` as returned by `find_line`.
        **kwargs
            Values for the variables in the error messages.
        """
        if self.errs is None:
            # Allow checks to be called directly, without going through
            # ``validate`` first.
            self.errs = {}
        self.errs.setdefault(code, []).append((line, kwargs))

    def find_line(self, match):
        """
        Find rows in documentation that were matched.

        Parameters
        ----------
        match : typing.Match
            Match object obtained by searching `raw_doc`.

        Returns
        -------
        row_start : int
            Zero-based index of the row where the match starts.
        row_end : int
            Zero-based index one past the row where the match ends, so that
            ``raw_lines[row_start:row_end]`` yields the matched rows.
            ``None`` is returned instead of the tuple if `match` is falsy.
        """
        if not match:
            return None
        # The row index equals the number of newlines preceding the position.
        row_start = self.raw_doc[:match.start(0)].count('\n')
        row_end = self.raw_doc[:match.end(0)].count('\n')
        return row_start, row_end + 1

    def check_bullet_list_in_block_quote(self):
        """Report DT01 for block quotes that directly contain a bullet list."""
        for node in self.doctree.traverse(docutils.nodes.block_quote):
            match = node.first_child_matching_class(docutils.nodes.bullet_list)
            if match is not None:
                self.error('DT01')

    def check_tabulator_as_indentation(self):
        """Report WS01 for every indentation that contains a tab."""
        for match in STARTING_WHITESPACE_RE.finditer(self.raw_doc):
            if '\t' in match.group(1):
                self.error('WS01', line=self.find_line(match))

    def check_line_ends_with_whitespace(self):
        """Report WS02 for every match of TRAILING_WHITESPACE_RE."""
        for match in TRAILING_WHITESPACE_RE.finditer(self.raw_doc):
            self.error('WS02', line=self.find_line(match))

    def check_empty_line_contains_whitespace(self):
        """Report WS03 for every match of EMPTY_LINE_WHITESPACE_RE."""
        for match in EMPTY_LINE_WHITESPACE_RE.finditer(self.raw_doc):
            self.error('WS03', line=self.find_line(match))

    def validate(self):
        """
        Execute methods starting with 'check'.

        Returns
        -------
        dict
            The freshly populated `errs` mapping.
        """
        self.errs = {}
        for name in dir(self):
            if not name.startswith('check'):
                continue
            # Resolve through ``getattr`` rather than the class ``__dict__``
            # so that checks inherited from a parent class are found too.
            check = getattr(self, name)
            if callable(check):
                check()
        return self.errs

    def report(self, errors=None, output_format='default'):
        """
        Output errors to stdout.

        Parameters
        ----------
        errors : list of str, optional
            If provided, filter output by these error codes.
        output_format : str, optional
            One of 'default', 'json', 'azure'.

        Returns
        -------
        int
            An integer with the number of reported issues.

        Raises
        ------
        ValueError
            If `output_format` is not one of the supported formats.
        """
        found = self.errs or {}
        # The script would be faster if instead of filtering the
        # errors after validating them, it didn't validate them
        # initially. But that would complicate the code too much
        selected = {code: entries for code, entries in found.items()
                    if not errors or code in errors}
        n_errs = sum(len(entries) for entries in selected.values())

        if output_format == 'json':
            output = json.dumps(selected)
        else:
            if output_format == 'default':
                template = '{path}:{row}:: {code} {text}\n'
            elif output_format == 'azure':
                template = ('##vso[task.logissue type=error;'
                            'sourcepath={path};'
                            'linenumber={row};'
                            'code={code};'
                            ']{text}\n')
            else:
                raise ValueError('Unknown output_format "{}"'.format(
                    output_format))

            path = 'doc/source/{}.rst'.format(self.page)
            chunks = []
            for err_code, entries in selected.items():
                for line, kwargs in entries:
                    # Errors without position information get an empty
                    # source excerpt and '?' as the row.
                    row_start, row_end = line if line else (0, 0)
                    chunks.append(template.format(
                        name=self.page,
                        path=path,
                        row=row_start + 1 if line else '?',
                        code=err_code,
                        source=''.join(self.raw_lines[row_start:row_end]),
                        text=ERROR_MSGS[err_code].format(**kwargs)))
            output = ''.join(chunks)

        sys.stdout.write(output)
        return n_errs
def validate_one(page):
    """
    Load and validate a single documentation page.

    Parameters
    ----------
    page : str
        Path to page relative to documentation's source folder without file
        extension. (e.g. io)

    Returns
    -------
    DocumentChecker or None
        The checker for the page, on which ``validate`` has already been
        called, or None if the doctree or the rst source of the page does
        not exist.

    Notes
    -----
    The errors codes are defined as:
    - First two characters: Type of errors:
      * DT: Error with unwanted node constellations inside the doctree
      * WS: Issues with whitespace characters
    - Last two characters: Numeric error code
    """
    try:
        # Read-only binary mode: the doctree is never written back, so no
        # write permission should be required.
        # NOTE: unpickling can execute arbitrary code; only use this on
        # doctrees produced by a trusted local documentation build.
        with open(DOCTREE_PATH.format(page), 'rb') as doctree_file:
            doctree = pickle.load(doctree_file)
        # Decode explicitly instead of relying on the platform's locale.
        with open(RST_PATH.format(page), 'r', encoding='utf-8') as rst_file:
            raw_lines = rst_file.readlines()
    except FileNotFoundError:
        return None

    checker = DocumentChecker(page, raw_lines, doctree)
    checker.validate()
    return checker
def validate_all(exclude_patterns):
    """
    Execute the validation of all pages, and return a dict with the
    results.

    Parameters
    ----------
    exclude_patterns : List[str] or None
        If provided, the pages that match with one of these patterns
        will be ignored. If None, all pages will be validated.

    Returns
    -------
    dict
        A dictionary mapping every validated page to its checker. Pages
        for which ``validate_one`` returns None are left out.
    """
    checkers = {}
    prefix_len = len(DOCUMENTATION_SOURCE)
    for root, _dirs, files in os.walk(DOCUMENTATION_SOURCE):
        # ``root`` always starts with the walked top directory; what is left
        # after stripping it is the page's folder relative to the doctrees.
        base_dir = root[prefix_len:]
        for filename in files:
            docname, ext = os.path.splitext(filename)
            if ext != '.doctree':
                continue
            page = os.path.join(base_dir, docname)
            # Skip the page itself when any pattern matches. A ``continue``
            # inside a loop over the patterns would only skip the pattern.
            if exclude_patterns and any(fnmatch(page, pattern)
                                        for pattern in exclude_patterns):
                continue
            checker = validate_one(page)
            if checker is not None:
                checkers[page] = checker
    return checkers
def main(page, errors, output_format, exclude_patterns=None):
    """
    Validate one page or all pages and print the report.

    Parameters
    ----------
    page : str or None
        Page to validate. If falsy, all pages are validated.
    errors : List[str] or None
        If provided, only these error codes are reported.
    output_format : str
        Output format passed on to ``DocumentChecker.report``.
    exclude_patterns : List[str] or None
        Patterns of pages to skip; only used when validating all pages.

    Returns
    -------
    int
        Number of reported issues, or 1 if the requested page could not be
        loaded.
    """
    if page:
        checker = validate_one(page)
        if checker is None:
            # validate_one returns None when the doctree or the rst source
            # is missing; report that instead of failing on ``None.report``.
            sys.stderr.write(
                'No doctree or source file found for page "{}". Has the '
                'documentation been built?\n'.format(page))
            return 1
        checkers = {page: checker}
    else:
        checkers = validate_all(exclude_patterns=exclude_patterns)

    return sum(checker.report(errors=errors, output_format=output_format)
               for checker in checkers.values())
# Command line entry point: parse the arguments, run the validation and use
# the number of reported issues as the process exit status.
if __name__ == '__main__':
    format_opts = 'default', 'json', 'azure'
    argparser = argparse.ArgumentParser(
        description='validate pandas documentation')
    add = argparser.add_argument
    add('page', nargs='?', default=None,
        help='page to validate (e.g. io) '
             'if not provided, all pages are validated')
    add('--format', default='default', choices=format_opts,
        help='format of the output when validating '
             'multiple documents (ignored when validating one). '
             'It can be {}'.format(str(format_opts)[1:-1]))
    add('--errors', default=None,
        help='comma separated '
             'list of error codes to validate. By default it '
             'validates all errors (ignored when validating '
             'a single document)')
    add('--exclude', default=None,
        help='comma separated '
             'patterns of pages to exclude. Utilises '
             '`Unix filename pattern matching` '
             '(ignored when validating a single document)')
    args = argparser.parse_args()
    n_issues = main(args.page,
                    args.errors.split(',') if args.errors else None,
                    args.format,
                    args.exclude.split(',') if args.exclude else None)
    # Exit statuses are truncated to 8 bits on POSIX, so e.g. 256 issues
    # would be reported as success. Clamp to keep the status non-zero.
    sys.exit(min(n_issues, 255))