Skip to content

DOC/TST: Validate documentation pages #24216

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions ci/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,14 @@
set -e

if [ "$DOC" ]; then
echo "We are not running pytest as this is a doc-build"
exit 0
echo ###############################
echo # Lint documentation #
echo ###############################

MSG='Validate documentation' ; echo $MSG
$TRAVIS_BUILD_DIR/scripts/validate_documentation.py
RET=$? ; echo $MSG "DONE"
exit $RET
fi

# Workaround for pytest-xdist flaky collection order
Expand Down
7 changes: 7 additions & 0 deletions doc/make.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
import webbrowser
import jinja2

sys.path.extend([
# validation for documentation
os.path.join(os.path.dirname(__file__), '..', 'scripts')
])
import validate_documentation

DOC_PATH = os.path.dirname(os.path.abspath(__file__))
SOURCE_PATH = os.path.join(DOC_PATH, 'source')
Expand Down Expand Up @@ -263,6 +268,8 @@ def html(self):
os.remove(zip_fname)

if self.single_doc is not None:
checker = validate_documentation.validate_one(self.single_doc)
checker.report()
self._open_browser()
shutil.rmtree(os.path.join(SOURCE_PATH, 'generated_single'),
ignore_errors=True)
Expand Down
50 changes: 50 additions & 0 deletions scripts/tests/test_validate_documentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pytest

import docutils.nodes
import docutils.utils

import validate_documentation
DocumentChecker = validate_documentation.DocumentChecker


@pytest.fixture()
def doctree():
return docutils.utils.new_document('test.rst')


@pytest.mark.parametrize('raw_doc,errs', [
([' Indented Line\n'], {}),
(['\n'], {}),
(['\tIndented Line\n'], {'WS01': [((0, 1), {})]}),
(['Line \n'], {'WS02': [((0, 1), {})]}),
(['\t\n'], {'WS03': [((0, 1), {})]}),
([' \n'], {'WS03': [((0, 1), {})]}),
])
def test_line_checkers(doctree, raw_doc, errs):
checker = DocumentChecker('', raw_doc, doctree)
checker.validate()

assert checker.errs == errs


def test_doctree_with_list_in_block_quote(doctree):
bullet_list = docutils.nodes.bullet_list()
block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list)
doctree.append(block_quote)

checker = DocumentChecker('', [''], doctree)
checker.validate()

assert checker.errs == {'DT01': [(None, {})]}


def test_doctree_with_block_quote_in_list(doctree):
bullet_list = docutils.nodes.bullet_list()
block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list)
top_list = docutils.nodes.bullet_list('', block_quote)
doctree.append(top_list)

checker = DocumentChecker('', [''], doctree)
result = checker.validate()

assert result == {'DT01': [(None, {})]}
306 changes: 306 additions & 0 deletions scripts/validate_documentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,306 @@
#!/usr/bin/env python
"""
Analyze documentation to detect errors.

Requires the documentation to be build before usage.

If no argument is provided, it validates all pages in the source folder for
which a doctree has been generated.

If a page is provided like "index", "whatsnew/v1.0.0" a list of all errors
in that page is printed.

Usage::
$ ./validate_documentation.py
$ ./validate_documentation.py index
"""

import argparse
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should have a comment at the beginning of the script explaining what it does and how to use it.

import docutils.nodes
from fnmatch import fnmatch
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

haven't used fnmatch, but I prefer to import modules with submodules having the same name (like datetime) as import fnmatch. Then, in the code, when you see fnmatch.fnmatch there is no ambiguity whether it's the module or the submodule.

import json
import os
import pickle
import re
import sys

BASE_DIR = os.path.join(os.path.dirname(__file__), '..', 'doc')
DOCUMENTATION_SOURCE = os.path.join(BASE_DIR, 'build', 'doctrees', '')
DOCTREE_PATH = os.path.join(DOCUMENTATION_SOURCE, '{}.doctree')
RST_PATH = os.path.join(BASE_DIR, 'source', '{}.rst')

ERROR_MSGS = {
'DT01': "Found bullet list within block quote. Use 0 spaces for top-level "
"list and 2 spaces for sub-lists",
'WS01': "Indentation uses tabulation",
'WS02': "Trailing whitespaces",
'WS03': "Whitespace in empty line",
}

STARTING_WHITESPACE_RE = re.compile(r'^(\s+).', re.MULTILINE)
TRAILING_WHITESPACE_RE = re.compile(r'.([\t ]+)$', re.MULTILINE)
EMPTY_LINE_WHITESPACE_RE = re.compile(r'^([\t ]+)$', re.MULTILINE)


class DocumentChecker(object):
"""
Checker to validate one page in the documentation

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May be a short docstring for the class? If you explain what the script does at the beginning of the file there is not much to say here, but a comment on what does this class do, or how is it expected to be used, can be useful.

Attributes
----------
page : str
Path to page relative to documentation's source folder without file
extension. (e.g. io)
doctree : docutils.nodes.document
Generated doctree for associated the page
raw_lines : list of str
Lines from rst-file for associated page
raw_doc : str
Joined lines from rst-file for associated page

Notes
-----
Add a method starting with `check` to add additional checks.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this blank line, for me, adds more noise than value

"""

def __init__(self, page, raw_lines, doctree):
self.page = page
self.doctree = doctree
self.raw_lines = raw_lines
self.raw_doc = ''.join(raw_lines)
self.errs = None

def error(self, code, line=None, **kwargs):
"""
Parameters
----------
code : str
Error code.
line : Tuple[int, str]
**kwargs
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

match is not docuemted

Values for the variables in the error messages
"""
errs = self.errs.setdefault(code, [])
errs.append((line, kwargs))

def find_line(self, match):
"""
Find rows in documentation that were matched

Parameters
----------
match : typing.Match

Returns
-------
row_start : int
row_end : int
"""
if not match:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add docstrings to those too? It's not obvious what they do

return None

row_start = self.raw_doc[:match.start(0)].count('\n')
row_end = self.raw_doc[:match.end(0)].count('\n')
return row_start, row_end + 1

def check_bullet_list_in_block_quote(self):
for node in self.doctree.traverse(docutils.nodes.block_quote):
match = node.first_child_matching_class(docutils.nodes.bullet_list)
if match is not None:
self.error('DT01')

def check_tabulator_as_indentation(self):
matches = STARTING_WHITESPACE_RE.finditer(self.raw_doc)
for match in matches:
if '\t' in match.group(1):
self.error('WS01', line=self.find_line(match))

def check_line_ends_with_whitespace(self):
matches = TRAILING_WHITESPACE_RE.finditer(self.raw_doc)
for match in matches:
self.error('WS02', line=self.find_line(match))

def check_empty_line_contains_whitespace(self):
matches = EMPTY_LINE_WHITESPACE_RE.finditer(self.raw_doc)
for match in matches:
self.error('WS03', line=self.find_line(match))

def validate(self):
"""Execute methods starting with 'check'"""
self.errs = {}
for func in dir(self):
if func.startswith('check'):
self.__class__.__dict__[func](self)

return self.errs

def report(self, errors=None, output_format='default'):
"""
Output errors to stdout

Parameters
----------
errors : list of str, optional
If provided, filter output by these error codes.
output_format : str, optional
One of 'default', 'json', 'azure'

Returns
-------
int
A integer with number of found issues
"""
n_errs = 0
if output_format == 'json':
output = json.dumps(self.errs)
else:
if output_format == 'default':
output_format = '{path}:{row}:: {code} {text}\n'
elif output_format == 'azure':
output_format = ('##vso[task.logissue type=error;'
'sourcepath={path};'
'linenumber={row};'
'code={code};'
']{text}\n')
else:
raise ValueError('Unknown output_format "{}"'.format(
output_format))

output = ''

for err_code, errs in self.errs.items():
# The script would be faster if instead of filtering the
# errors after validating them, it didn't validate them
# initially. But that would complicate the code too much
if errors and err_code not in errors:
continue
for line, kwargs in errs:
n_errs += 1
row_start, row_end = line if line else (0, 0)
output += output_format.format(
name=self.page,
path='doc/source/{}.rst'.format(self.page),
row=row_start + 1 if line else '?',
code=err_code,
source=''.join(self.raw_lines[row_start:row_end]),
text=ERROR_MSGS[err_code].format(kwargs))

sys.stdout.write(output)
return n_errs


def validate_one(page):
"""
Validate the page for the given page

Parameters
----------
page : str
Path to page relative to documentation's source folder without file
extension. (e.g. io)

Returns
-------
dict
A dictionary containing all the information obtained from
validating the page.

Notes
-----
The errors codes are defined as:
- First two characters: Type of errors:
* DT: Error with unwanted node constellations inside the doctree
* WS: Issues with whitespace characters
- Last two characters: Numeric error code
"""
try:
with open(DOCTREE_PATH.format(page), 'r+b') as file:
doctree = pickle.load(file)

with open(RST_PATH.format(page), 'r') as file:
raw_doc = file.readlines()
except FileNotFoundError:
return None

checker = DocumentChecker(page, raw_doc, doctree)
checker.validate()
return checker


def validate_all(exclude_patterns):
"""
Execute the validation of all pages, and return a dict with the
results.

Parameters
----------
exclude_patterns : List[str] or None
If provided, the pages that match with one of these patterns
will be ignored. If None, all pages will be validated.

Returns
-------
dict
A dictionary with an item for every page containing
all the validation information.
"""

checkers = {}
for root, dirs, files in os.walk(DOCUMENTATION_SOURCE):
_, base_dir = root.split(DOCUMENTATION_SOURCE)
for file in files:
docname, ext = os.path.splitext(file)
if not ext == '.doctree':
continue
page = os.path.join(base_dir, docname)
if exclude_patterns:
for pattern in exclude_patterns:
if fnmatch(page, pattern):
continue

checker = validate_one(page)
if checker:
checkers[page] = checker
return checkers


def main(page, errors, output_format, exclude_patterns=None):
if page:
checkers = {page: validate_one(page)}
else:
checkers = validate_all(exclude_patterns=exclude_patterns)
exit_code = 0
for page, checker in checkers.items():
exit_code += checker.report(errors=errors, output_format=output_format)
return exit_code


if __name__ == '__main__':
format_opts = 'default', 'json', 'azure'
argparser = argparse.ArgumentParser(
description='validate pandas documentation')
add = argparser.add_argument
add('page', nargs='?', default=None,
help='page to validate (e.g. io) '
'if not provided, all pages are validated')
add('--format', default='default', choices=format_opts,
help='format of the output when validating '
'multiple documents (ignored when validating one).'
'It can be {}'.format(str(format_opts)[1:-1]))
add('--errors', default=None,
help='comma separated '
'list of error codes to validate. By default it '
'validates all errors (ignored when validating '
'a single document)')
add('--exclude', default=None,
help='comma separated '
'patterns of pages to exclude. Utilises '
'`Unix filename pattern matching`'
'(ignored when validating a single document)')

args = argparser.parse_args()
sys.exit(main(args.page,
args.errors.split(',') if args.errors else None,
args.format,
args.exclude.split(',') if args.exclude else None))