-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
DOC/TST: Validate documentation pages #24216
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
534eb70
7aa12ad
93974fb
5e7bca4
08e7546
3e0c233
907c587
1ffa59a
f21a465
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import pytest | ||
|
||
import docutils.nodes | ||
import docutils.utils | ||
|
||
import validate_documentation | ||
DocumentChecker = validate_documentation.DocumentChecker | ||
|
||
|
||
@pytest.fixture() | ||
def doctree(): | ||
return docutils.utils.new_document('test.rst') | ||
|
||
|
||
@pytest.mark.parametrize('raw_doc,errs', [ | ||
([' Indented Line\n'], {}), | ||
(['\n'], {}), | ||
(['\tIndented Line\n'], {'WS01': [((0, 1), {})]}), | ||
(['Line \n'], {'WS02': [((0, 1), {})]}), | ||
(['\t\n'], {'WS03': [((0, 1), {})]}), | ||
([' \n'], {'WS03': [((0, 1), {})]}), | ||
]) | ||
def test_line_checkers(doctree, raw_doc, errs): | ||
checker = DocumentChecker('', raw_doc, doctree) | ||
checker.validate() | ||
|
||
assert checker.errs == errs | ||
|
||
|
||
def test_doctree_with_list_in_block_quote(doctree): | ||
bullet_list = docutils.nodes.bullet_list() | ||
block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list) | ||
doctree.append(block_quote) | ||
|
||
checker = DocumentChecker('', [''], doctree) | ||
checker.validate() | ||
|
||
assert checker.errs == {'DT01': [(None, {})]} | ||
|
||
|
||
def test_doctree_with_block_quote_in_list(doctree): | ||
bullet_list = docutils.nodes.bullet_list() | ||
block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list) | ||
top_list = docutils.nodes.bullet_list('', block_quote) | ||
doctree.append(top_list) | ||
|
||
checker = DocumentChecker('', [''], doctree) | ||
result = checker.validate() | ||
|
||
assert result == {'DT01': [(None, {})]} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,306 @@ | ||
#!/usr/bin/env python | ||
""" | ||
Analyze documentation to detect errors. | ||
|
||
Requires the documentation to be build before usage. | ||
|
||
If no argument is provided, it validates all pages in the source folder for | ||
which a doctree has been generated. | ||
|
||
If a page is provided like "index", "whatsnew/v1.0.0" a list of all errors | ||
in that page is printed. | ||
|
||
Usage:: | ||
$ ./validate_documentation.py | ||
$ ./validate_documentation.py index | ||
""" | ||
|
||
import argparse | ||
import docutils.nodes | ||
from fnmatch import fnmatch | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. haven't used |
||
import json | ||
import os | ||
import pickle | ||
import re | ||
import sys | ||
|
||
BASE_DIR = os.path.join(os.path.dirname(__file__), '..', 'doc') | ||
DOCUMENTATION_SOURCE = os.path.join(BASE_DIR, 'build', 'doctrees', '') | ||
DOCTREE_PATH = os.path.join(DOCUMENTATION_SOURCE, '{}.doctree') | ||
RST_PATH = os.path.join(BASE_DIR, 'source', '{}.rst') | ||
|
||
ERROR_MSGS = { | ||
'DT01': "Found bullet list within block quote. Use 0 spaces for top-level " | ||
"list and 2 spaces for sub-lists", | ||
'WS01': "Indentation uses tabulation", | ||
'WS02': "Trailing whitespaces", | ||
'WS03': "Whitespace in empty line", | ||
} | ||
|
||
STARTING_WHITESPACE_RE = re.compile(r'^(\s+).', re.MULTILINE) | ||
TRAILING_WHITESPACE_RE = re.compile(r'.([\t ]+)$', re.MULTILINE) | ||
EMPTY_LINE_WHITESPACE_RE = re.compile(r'^([\t ]+)$', re.MULTILINE) | ||
|
||
|
||
class DocumentChecker(object): | ||
""" | ||
Checker to validate one page in the documentation | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. May be a short docstring for the class? If you explain what the script does at the beginning of the file there is not much to say here, but a comment on what does this class do, or how is it expected to be used, can be useful. |
||
Attributes | ||
---------- | ||
page : str | ||
Path to page relative to documentation's source folder without file | ||
extension. (e.g. io) | ||
doctree : docutils.nodes.document | ||
Generated doctree for associated the page | ||
raw_lines : list of str | ||
Lines from rst-file for associated page | ||
raw_doc : str | ||
Joined lines from rst-file for associated page | ||
|
||
Notes | ||
----- | ||
Add a method starting with `check` to add additional checks. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this blank line, for me, adds more noise than value |
||
""" | ||
|
||
def __init__(self, page, raw_lines, doctree): | ||
self.page = page | ||
self.doctree = doctree | ||
self.raw_lines = raw_lines | ||
self.raw_doc = ''.join(raw_lines) | ||
self.errs = None | ||
|
||
def error(self, code, line=None, **kwargs): | ||
""" | ||
Parameters | ||
---------- | ||
code : str | ||
Error code. | ||
line : Tuple[int, str] | ||
**kwargs | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
Values for the variables in the error messages | ||
""" | ||
errs = self.errs.setdefault(code, []) | ||
errs.append((line, kwargs)) | ||
|
||
def find_line(self, match): | ||
""" | ||
Find rows in documentation that were matched | ||
|
||
Parameters | ||
---------- | ||
match : typing.Match | ||
|
||
Returns | ||
------- | ||
row_start : int | ||
row_end : int | ||
""" | ||
if not match: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add docstrings to those too? It's not obvious what they do |
||
return None | ||
|
||
row_start = self.raw_doc[:match.start(0)].count('\n') | ||
row_end = self.raw_doc[:match.end(0)].count('\n') | ||
return row_start, row_end + 1 | ||
|
||
def check_bullet_list_in_block_quote(self): | ||
for node in self.doctree.traverse(docutils.nodes.block_quote): | ||
match = node.first_child_matching_class(docutils.nodes.bullet_list) | ||
if match is not None: | ||
self.error('DT01') | ||
|
||
def check_tabulator_as_indentation(self): | ||
matches = STARTING_WHITESPACE_RE.finditer(self.raw_doc) | ||
for match in matches: | ||
if '\t' in match.group(1): | ||
self.error('WS01', line=self.find_line(match)) | ||
|
||
def check_line_ends_with_whitespace(self): | ||
matches = TRAILING_WHITESPACE_RE.finditer(self.raw_doc) | ||
for match in matches: | ||
self.error('WS02', line=self.find_line(match)) | ||
|
||
def check_empty_line_contains_whitespace(self): | ||
matches = EMPTY_LINE_WHITESPACE_RE.finditer(self.raw_doc) | ||
for match in matches: | ||
self.error('WS03', line=self.find_line(match)) | ||
|
||
def validate(self): | ||
"""Execute methods starting with 'check'""" | ||
self.errs = {} | ||
for func in dir(self): | ||
if func.startswith('check'): | ||
self.__class__.__dict__[func](self) | ||
|
||
return self.errs | ||
|
||
def report(self, errors=None, output_format='default'): | ||
""" | ||
Output errors to stdout | ||
|
||
Parameters | ||
---------- | ||
errors : list of str, optional | ||
If provided, filter output by these error codes. | ||
output_format : str, optional | ||
One of 'default', 'json', 'azure' | ||
|
||
Returns | ||
------- | ||
int | ||
A integer with number of found issues | ||
""" | ||
n_errs = 0 | ||
if output_format == 'json': | ||
output = json.dumps(self.errs) | ||
else: | ||
if output_format == 'default': | ||
output_format = '{path}:{row}:: {code} {text}\n' | ||
elif output_format == 'azure': | ||
output_format = ('##vso[task.logissue type=error;' | ||
'sourcepath={path};' | ||
'linenumber={row};' | ||
'code={code};' | ||
']{text}\n') | ||
else: | ||
raise ValueError('Unknown output_format "{}"'.format( | ||
output_format)) | ||
|
||
output = '' | ||
|
||
for err_code, errs in self.errs.items(): | ||
# The script would be faster if instead of filtering the | ||
# errors after validating them, it didn't validate them | ||
# initially. But that would complicate the code too much | ||
if errors and err_code not in errors: | ||
continue | ||
for line, kwargs in errs: | ||
n_errs += 1 | ||
row_start, row_end = line if line else (0, 0) | ||
output += output_format.format( | ||
name=self.page, | ||
path='doc/source/{}.rst'.format(self.page), | ||
row=row_start + 1 if line else '?', | ||
code=err_code, | ||
source=''.join(self.raw_lines[row_start:row_end]), | ||
text=ERROR_MSGS[err_code].format(kwargs)) | ||
|
||
sys.stdout.write(output) | ||
return n_errs | ||
|
||
|
||
def validate_one(page): | ||
""" | ||
Validate the page for the given page | ||
|
||
Parameters | ||
---------- | ||
page : str | ||
Path to page relative to documentation's source folder without file | ||
extension. (e.g. io) | ||
|
||
Returns | ||
------- | ||
dict | ||
A dictionary containing all the information obtained from | ||
validating the page. | ||
|
||
Notes | ||
----- | ||
The errors codes are defined as: | ||
- First two characters: Type of errors: | ||
* DT: Error with unwanted node constellations inside the doctree | ||
* WS: Issues with whitespace characters | ||
- Last two characters: Numeric error code | ||
""" | ||
try: | ||
with open(DOCTREE_PATH.format(page), 'r+b') as file: | ||
doctree = pickle.load(file) | ||
|
||
with open(RST_PATH.format(page), 'r') as file: | ||
raw_doc = file.readlines() | ||
except FileNotFoundError: | ||
return None | ||
|
||
checker = DocumentChecker(page, raw_doc, doctree) | ||
checker.validate() | ||
return checker | ||
|
||
|
||
def validate_all(exclude_patterns): | ||
""" | ||
Execute the validation of all pages, and return a dict with the | ||
results. | ||
|
||
Parameters | ||
---------- | ||
exclude_patterns : List[str] or None | ||
If provided, the pages that match with one of these patterns | ||
will be ignored. If None, all pages will be validated. | ||
|
||
Returns | ||
------- | ||
dict | ||
A dictionary with an item for every page containing | ||
all the validation information. | ||
""" | ||
|
||
checkers = {} | ||
for root, dirs, files in os.walk(DOCUMENTATION_SOURCE): | ||
_, base_dir = root.split(DOCUMENTATION_SOURCE) | ||
for file in files: | ||
docname, ext = os.path.splitext(file) | ||
if not ext == '.doctree': | ||
continue | ||
page = os.path.join(base_dir, docname) | ||
if exclude_patterns: | ||
for pattern in exclude_patterns: | ||
if fnmatch(page, pattern): | ||
continue | ||
|
||
checker = validate_one(page) | ||
if checker: | ||
checkers[page] = checker | ||
return checkers | ||
|
||
|
||
def main(page, errors, output_format, exclude_patterns=None): | ||
if page: | ||
checkers = {page: validate_one(page)} | ||
else: | ||
checkers = validate_all(exclude_patterns=exclude_patterns) | ||
exit_code = 0 | ||
for page, checker in checkers.items(): | ||
exit_code += checker.report(errors=errors, output_format=output_format) | ||
return exit_code | ||
|
||
|
||
if __name__ == '__main__': | ||
format_opts = 'default', 'json', 'azure' | ||
argparser = argparse.ArgumentParser( | ||
description='validate pandas documentation') | ||
add = argparser.add_argument | ||
add('page', nargs='?', default=None, | ||
help='page to validate (e.g. io) ' | ||
'if not provided, all pages are validated') | ||
add('--format', default='default', choices=format_opts, | ||
help='format of the output when validating ' | ||
'multiple documents (ignored when validating one).' | ||
'It can be {}'.format(str(format_opts)[1:-1])) | ||
add('--errors', default=None, | ||
help='comma separated ' | ||
'list of error codes to validate. By default it ' | ||
'validates all errors (ignored when validating ' | ||
'a single document)') | ||
add('--exclude', default=None, | ||
help='comma separated ' | ||
'patterns of pages to exclude. Utilises ' | ||
'`Unix filename pattern matching`' | ||
'(ignored when validating a single document)') | ||
|
||
args = argparser.parse_args() | ||
sys.exit(main(args.page, | ||
args.errors.split(',') if args.errors else None, | ||
args.format, | ||
args.exclude.split(',') if args.exclude else None)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should have a comment at the beginning of the script explaining what it does and how to use it.