-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
DOC/TST: Validate documentation pages #24216
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
534eb70
7aa12ad
93974fb
5e7bca4
08e7546
3e0c233
907c587
1ffa59a
f21a465
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import pytest | ||
|
||
import docutils.nodes | ||
import docutils.utils | ||
|
||
import validate_documentation | ||
DocumentChecker = validate_documentation.DocumentChecker | ||
|
||
|
||
@pytest.fixture() | ||
def doctree(): | ||
return docutils.utils.new_document('test.rst') | ||
|
||
|
||
@pytest.mark.parametrize('raw_doc,errs', [ | ||
(' Indented Line\n', {}), | ||
('\n', {}), | ||
('\tIndented Line\n', {'WS01': [(1, '\tIndented Line\n')]}), | ||
('Line \n', {'WS02': [(1, 'Line \n')]}), | ||
('\t\n', {'WS01': [(1, '\t\n')], 'WS02': [(1, '\t\n')]}), | ||
(' \n', {'WS02': [(1, ' \n')]}), | ||
]) | ||
def test_line_checkers(doctree, raw_doc, errs): | ||
checker = DocumentChecker(raw_doc, doctree) | ||
result = checker.validate() | ||
|
||
assert result == errs | ||
|
||
|
||
def test_doctree_with_list_in_block_quote(doctree): | ||
bullet_list = docutils.nodes.bullet_list() | ||
block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list) | ||
doctree.append(block_quote) | ||
|
||
checker = DocumentChecker('', doctree) | ||
result = checker.validate() | ||
|
||
assert result == {'DT01': [None]} | ||
|
||
|
||
def test_doctree_with_block_quote_in_list(doctree): | ||
bullet_list = docutils.nodes.bullet_list() | ||
block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list) | ||
top_list = docutils.nodes.bullet_list('', block_quote) | ||
doctree.append(top_list) | ||
|
||
checker = DocumentChecker('', doctree) | ||
result = checker.validate() | ||
|
||
assert result == {'DT01': [None]} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,227 @@ | ||
import argparse | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should have a comment at the beginning of the script explaining what it does and how to use it. |
||
from fnmatch import fnmatch | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. haven't used |
||
import json | ||
import os | ||
import pickle | ||
import re | ||
import sys | ||
|
||
import docutils.nodes | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. docutils is part of the standard library, right? why the blank line if that's the case? |
||
|
||
DOCUMENTATION_SOURCE = os.path.join(os.curdir, '../doc/source') | ||
DOCTREE_PATH = '../doc/build/doctrees/{}.doctree' | ||
RST_PATH = '../doc/source/{}.rst' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you are assuming here that the current directory ( Also, better use |
||
|
||
ERROR_MSGS = { | ||
'DT01': "Found bullet list within block quote. Use 0 spaces for top-level " | ||
"list and 2 spaces for sub-lists", | ||
'WS01': "Indentation uses tabulation", | ||
'WS02': "Trailing whitespaces", | ||
} | ||
|
||
STARTING_WHITESPACE_RE = re.compile(r'^(\s+).*\n$', re.MULTILINE) | ||
TRAILING_WHITESPACE_RE = re.compile(r'^.*([\t ]+)\n$', re.MULTILINE) | ||
|
||
|
||
class DocumentChecker(object): | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. May be a short docstring for the class? If you explain what the script does at the beginning of the file there is not much to say here, but a comment on what does this class do, or how is it expected to be used, can be useful. |
||
def __init__(self, raw_doc, doctree): | ||
self.doctree = doctree | ||
self.raw_doc = raw_doc | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this blank line, for me, adds more noise than value |
||
self.issues = None | ||
|
||
def issue(self, code, match=None, **kwargs): | ||
""" | ||
Parameters | ||
---------- | ||
code : str | ||
Error code. | ||
**kwargs | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
Values for the variables in the error messages | ||
""" | ||
issue = self.issues.setdefault(code, []) | ||
issue.append((self.find_line(match), kwargs)) | ||
|
||
def find_line(self, match): | ||
if not match: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add docstrings to those too? It's not obvious what they do |
||
return None | ||
lines = self.raw_doc[:match.start(0)].splitlines() | ||
return len(lines) + 1, match.group(0) | ||
|
||
def check_bullet_list_in_block_quote(self): | ||
for node in self.doctree.traverse(docutils.nodes.block_quote): | ||
match = node.first_child_matching_class(docutils.nodes.bullet_list) | ||
if match is not None: | ||
self.issue('DT01') | ||
|
||
def check_tabulator_as_indentation(self): | ||
matches = STARTING_WHITESPACE_RE.finditer(self.raw_doc) | ||
for match in matches: | ||
if '\t' in match.group(1): | ||
self.issue('WS01', match) | ||
|
||
def check_line_ends_with_whitespace(self): | ||
matches = TRAILING_WHITESPACE_RE.finditer(self.raw_doc) | ||
for match in matches: | ||
self.issue('WS02', match) | ||
|
||
def validate(self): | ||
self.issues = {} | ||
for func in dir(self): | ||
if func.startswith('check'): | ||
self.__class__.__dict__[func](self) | ||
|
||
return self.issues | ||
|
||
|
||
def report(reports, output_format='default', errors=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't find |
||
exit_status = 0 | ||
if output_format == 'json': | ||
output = json.dumps(reports) | ||
else: | ||
if output_format == 'default': | ||
output_format = '{text}\n' | ||
elif output_format == 'azure': | ||
output_format = ('##vso[task.logissue type=error;' | ||
'sourcepath={path};' | ||
'linenumber={row};' | ||
'code={code};' | ||
']{text}\n') | ||
else: | ||
raise ValueError('Unknown output_format "{}"'.format( | ||
output_format)) | ||
|
||
output = '' | ||
for name, res in reports.items(): | ||
for err_code, issues in res.items(): | ||
# The script would be faster if instead of filtering the | ||
# errors after validating them, it didn't validate them | ||
# initially. But that would complicate the code too much | ||
if errors and err_code not in errors: | ||
continue | ||
for issue, kwargs in issues: | ||
exit_status += 1 | ||
row = issue[0] if issue else None | ||
output += output_format.format( | ||
name=name, | ||
path=RST_PATH.format(name), | ||
row=row, | ||
code=err_code, | ||
text='{}{}:: {}'.format(name, | ||
':' + row if row else '', | ||
ERROR_MSGS[err_code] | ||
.format(kwargs))) | ||
|
||
sys.stdout.write(output) | ||
return exit_status | ||
|
||
|
||
def validate_one(page): | ||
""" | ||
Validate the page for the given func_name | ||
|
||
Parameters | ||
---------- | ||
page : str | ||
Path to page relative to documentation's source folder without file | ||
extension. (e.g. io) | ||
|
||
Returns | ||
------- | ||
dict | ||
A dictionary containing all the information obtained from | ||
validating the page. | ||
|
||
Notes | ||
----- | ||
The errors codes are defined as: | ||
- First two characters: Type of errors: | ||
* DT: Error with unwanted node constellations the doctree | ||
* WS: Issues regarding whitespace characters | ||
- Last two characters: Numeric error code | ||
""" | ||
|
||
with open(DOCTREE_PATH.format(page), 'r+b') as file: | ||
doctree = pickle.load(file) | ||
|
||
with open(RST_PATH.format(page), 'r') as file: | ||
raw_doc = file.read() | ||
|
||
checker = DocumentChecker(raw_doc, doctree) | ||
return checker.validate() | ||
|
||
|
||
def validate_all(exclude_patterns): | ||
""" | ||
Execute the validation of all pages, and return a dict with the | ||
results. | ||
|
||
Parameters | ||
---------- | ||
exclude_patterns : List[str] or None | ||
If provided, the pages that match with one of these patterns | ||
will be ignored. If None, all pages will be validated. | ||
|
||
Returns | ||
------- | ||
dict | ||
A dictionary with an item for every page containing | ||
all the validation information. | ||
""" | ||
|
||
result = {} | ||
for root, dirs, files in os.walk(DOCUMENTATION_SOURCE): | ||
_, base_dir = root.split('../doc/source') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use |
||
for file in files: | ||
docname, ext = os.path.splitext(file) | ||
if not ext == '.rst': | ||
continue | ||
page = os.path.join(base_dir, docname) | ||
for pattern in exclude_patterns: | ||
if fnmatch(page, pattern): | ||
continue | ||
|
||
result[page] = validate_one(page) | ||
return result | ||
|
||
|
||
def main(page, errors, output_format, exclude_patterns=None): | ||
if page: | ||
reports = {page: validate_one(page)} | ||
else: | ||
reports = validate_all(exclude_patterns=exclude_patterns) | ||
return report(reports, output_format=output_format, errors=errors) | ||
|
||
|
||
if __name__ == '__main__': | ||
format_opts = 'default', 'json', 'azure' | ||
func_help = ('document to validate (e.g. io) ' | ||
'if not provided, all documents are validated') | ||
argparser = argparse.ArgumentParser( | ||
description='validate pandas documentation') | ||
add = argparser.add_argument | ||
add('document', | ||
nargs='?', | ||
default=None, | ||
help=func_help) | ||
add('--format', default='default', choices=format_opts, | ||
help='format of the output when validating ' | ||
'multiple documents (ignored when validating one).' | ||
'It can be {}'.format(str(format_opts)[1:-1])) | ||
add('--errors', default=None, | ||
help='comma separated ' | ||
'list of error codes to validate. By default it ' | ||
'validates all errors (ignored when validating ' | ||
'a single document)') | ||
add('--exclude', default=None, | ||
help='comma separated ' | ||
'patterns of pages to exclude. By default it ' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From the help message I don't understand how I should use this. |
||
'validates all errors (ignored when validating ' | ||
'a single document)') | ||
|
||
args = argparser.parse_args() | ||
sys.exit(main(args.document, | ||
args.errors.split(',') if args.errors else None, | ||
args.format, | ||
args.exclude.split(','))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm -1 on importing anything in
scripts/
from other places. When we make changes to the scripts, we don't need to worry about breaking other parts of the code at the moment. I like to keep it this way.I think this part is not essential, is it?