diff --git a/ci/run_tests.sh b/ci/run_tests.sh index ee46da9f52eab..27176a1c71789 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -3,8 +3,14 @@ set -e if [ "$DOC" ]; then - echo "We are not running pytest as this is a doc-build" - exit 0 + echo ############################### + echo # Lint documentation # + echo ############################### + + MSG='Validate documentation' ; echo $MSG + $TRAVIS_BUILD_DIR/scripts/validate_documentation.py + RET=$? ; echo $MSG "DONE" + exit $RET fi # Workaround for pytest-xdist flaky collection order diff --git a/doc/make.py b/doc/make.py index 0a3a7483fcc91..1ed9dc1c17417 100755 --- a/doc/make.py +++ b/doc/make.py @@ -21,6 +21,11 @@ import webbrowser import jinja2 +sys.path.extend([ + # validation for documentation + os.path.join(os.path.dirname(__file__), '..', 'scripts') +]) +import validate_documentation DOC_PATH = os.path.dirname(os.path.abspath(__file__)) SOURCE_PATH = os.path.join(DOC_PATH, 'source') @@ -263,6 +268,8 @@ def html(self): os.remove(zip_fname) if self.single_doc is not None: + checker = validate_documentation.validate_one(self.single_doc) + checker.report() self._open_browser() shutil.rmtree(os.path.join(SOURCE_PATH, 'generated_single'), ignore_errors=True) diff --git a/scripts/tests/test_validate_documentation.py b/scripts/tests/test_validate_documentation.py new file mode 100644 index 0000000000000..eb4ac493b33dd --- /dev/null +++ b/scripts/tests/test_validate_documentation.py @@ -0,0 +1,50 @@ +import pytest + +import docutils.nodes +import docutils.utils + +import validate_documentation +DocumentChecker = validate_documentation.DocumentChecker + + +@pytest.fixture() +def doctree(): + return docutils.utils.new_document('test.rst') + + +@pytest.mark.parametrize('raw_doc,errs', [ + ([' Indented Line\n'], {}), + (['\n'], {}), + (['\tIndented Line\n'], {'WS01': [((0, 1), {})]}), + (['Line \n'], {'WS02': [((0, 1), {})]}), + (['\t\n'], {'WS03': [((0, 1), {})]}), + ([' \n'], {'WS03': [((0, 1), {})]}), +]) +def test_line_checkers(doctree, raw_doc, errs): + checker = DocumentChecker('', raw_doc, doctree) + checker.validate() + + assert checker.errs == errs + + +def test_doctree_with_list_in_block_quote(doctree): + bullet_list = docutils.nodes.bullet_list() + block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list) + doctree.append(block_quote) + + checker = DocumentChecker('', [''], doctree) + checker.validate() + + assert checker.errs == {'DT01': [(None, {})]} + + +def test_doctree_with_block_quote_in_list(doctree): + bullet_list = docutils.nodes.bullet_list() + block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list) + top_list = docutils.nodes.bullet_list('', block_quote) + doctree.append(top_list) + + checker = DocumentChecker('', [''], doctree) + result = checker.validate() + + assert result == {'DT01': [(None, {})]} diff --git a/scripts/validate_documentation.py b/scripts/validate_documentation.py new file mode 100755 index 0000000000000..163f41de6036a --- /dev/null +++ b/scripts/validate_documentation.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python +""" +Analyze documentation to detect errors. + +Requires the documentation to be build before usage. + +If no argument is provided, it validates all pages in the source folder for +which a doctree has been generated. + +If a page is provided like "index", "whatsnew/v1.0.0" a list of all errors +in that page is printed. + +Usage:: + $ ./validate_documentation.py + $ ./validate_documentation.py index +""" + +import argparse +import docutils.nodes +from fnmatch import fnmatch +import json +import os +import pickle +import re +import sys + +BASE_DIR = os.path.join(os.path.dirname(__file__), '..', 'doc') +DOCUMENTATION_SOURCE = os.path.join(BASE_DIR, 'build', 'doctrees', '') +DOCTREE_PATH = os.path.join(DOCUMENTATION_SOURCE, '{}.doctree') +RST_PATH = os.path.join(BASE_DIR, 'source', '{}.rst') + +ERROR_MSGS = { + 'DT01': "Found bullet list within block quote. Use 0 spaces for top-level " + "list and 2 spaces for sub-lists", + 'WS01': "Indentation uses tabulation", + 'WS02': "Trailing whitespaces", + 'WS03': "Whitespace in empty line", +} + +STARTING_WHITESPACE_RE = re.compile(r'^(\s+).', re.MULTILINE) +TRAILING_WHITESPACE_RE = re.compile(r'.([\t ]+)$', re.MULTILINE) +EMPTY_LINE_WHITESPACE_RE = re.compile(r'^([\t ]+)$', re.MULTILINE) + + +class DocumentChecker(object): + """ + Checker to validate one page in the documentation + + Attributes + ---------- + page : str + Path to page relative to documentation's source folder without file + extension. (e.g. io) + doctree : docutils.nodes.document + Generated doctree for associated the page + raw_lines : list of str + Lines from rst-file for associated page + raw_doc : str + Joined lines from rst-file for associated page + + Notes + ----- + Add a method starting with `check` to add additional checks. + + """ + + def __init__(self, page, raw_lines, doctree): + self.page = page + self.doctree = doctree + self.raw_lines = raw_lines + self.raw_doc = ''.join(raw_lines) + self.errs = None + + def error(self, code, line=None, **kwargs): + """ + Parameters + ---------- + code : str + Error code. + line : Tuple[int, str] + **kwargs + Values for the variables in the error messages + """ + errs = self.errs.setdefault(code, []) + errs.append((line, kwargs)) + + def find_line(self, match): + """ + Find rows in documentation that were matched + + Parameters + ---------- + match : typing.Match + + Returns + ------- + row_start : int + row_end : int + """ + if not match: + return None + + row_start = self.raw_doc[:match.start(0)].count('\n') + row_end = self.raw_doc[:match.end(0)].count('\n') + return row_start, row_end + 1 + + def check_bullet_list_in_block_quote(self): + for node in self.doctree.traverse(docutils.nodes.block_quote): + match = node.first_child_matching_class(docutils.nodes.bullet_list) + if match is not None: + self.error('DT01') + + def check_tabulator_as_indentation(self): + matches = STARTING_WHITESPACE_RE.finditer(self.raw_doc) + for match in matches: + if '\t' in match.group(1): + self.error('WS01', line=self.find_line(match)) + + def check_line_ends_with_whitespace(self): + matches = TRAILING_WHITESPACE_RE.finditer(self.raw_doc) + for match in matches: + self.error('WS02', line=self.find_line(match)) + + def check_empty_line_contains_whitespace(self): + matches = EMPTY_LINE_WHITESPACE_RE.finditer(self.raw_doc) + for match in matches: + self.error('WS03', line=self.find_line(match)) + + def validate(self): + """Execute methods starting with 'check'""" + self.errs = {} + for func in dir(self): + if func.startswith('check'): + self.__class__.__dict__[func](self) + + return self.errs + + def report(self, errors=None, output_format='default'): + """ + Output errors to stdout + + Parameters + ---------- + errors : list of str, optional + If provided, filter output by these error codes. + output_format : str, optional + One of 'default', 'json', 'azure' + + Returns + ------- + int + A integer with number of found issues + """ + n_errs = 0 + if output_format == 'json': + output = json.dumps(self.errs) + else: + if output_format == 'default': + output_format = '{path}:{row}:: {code} {text}\n' + elif output_format == 'azure': + output_format = ('##vso[task.logissue type=error;' + 'sourcepath={path};' + 'linenumber={row};' + 'code={code};' + ']{text}\n') + else: + raise ValueError('Unknown output_format "{}"'.format( + output_format)) + + output = '' + + for err_code, errs in self.errs.items(): + # The script would be faster if instead of filtering the + # errors after validating them, it didn't validate them + # initially. But that would complicate the code too much + if errors and err_code not in errors: + continue + for line, kwargs in errs: + n_errs += 1 + row_start, row_end = line if line else (0, 0) + output += output_format.format( + name=self.page, + path='doc/source/{}.rst'.format(self.page), + row=row_start + 1 if line else '?', + code=err_code, + source=''.join(self.raw_lines[row_start:row_end]), + text=ERROR_MSGS[err_code].format(kwargs)) + + sys.stdout.write(output) + return n_errs + + +def validate_one(page): + """ + Validate the page for the given page + + Parameters + ---------- + page : str + Path to page relative to documentation's source folder without file + extension. (e.g. io) + + Returns + ------- + dict + A dictionary containing all the information obtained from + validating the page. + + Notes + ----- + The errors codes are defined as: + - First two characters: Type of errors: + * DT: Error with unwanted node constellations inside the doctree + * WS: Issues with whitespace characters + - Last two characters: Numeric error code + """ + try: + with open(DOCTREE_PATH.format(page), 'r+b') as file: + doctree = pickle.load(file) + + with open(RST_PATH.format(page), 'r') as file: + raw_doc = file.readlines() + except FileNotFoundError: + return None + + checker = DocumentChecker(page, raw_doc, doctree) + checker.validate() + return checker + + +def validate_all(exclude_patterns): + """ + Execute the validation of all pages, and return a dict with the + results. + + Parameters + ---------- + exclude_patterns : List[str] or None + If provided, the pages that match with one of these patterns + will be ignored. If None, all pages will be validated. + + Returns + ------- + dict + A dictionary with an item for every page containing + all the validation information. + """ + + checkers = {} + for root, dirs, files in os.walk(DOCUMENTATION_SOURCE): + _, base_dir = root.split(DOCUMENTATION_SOURCE) + for file in files: + docname, ext = os.path.splitext(file) + if not ext == '.doctree': + continue + page = os.path.join(base_dir, docname) + if exclude_patterns: + for pattern in exclude_patterns: + if fnmatch(page, pattern): + continue + + checker = validate_one(page) + if checker: + checkers[page] = checker + return checkers + + +def main(page, errors, output_format, exclude_patterns=None): + if page: + checkers = {page: validate_one(page)} + else: + checkers = validate_all(exclude_patterns=exclude_patterns) + exit_code = 0 + for page, checker in checkers.items(): + exit_code += checker.report(errors=errors, output_format=output_format) + return exit_code + + +if __name__ == '__main__': + format_opts = 'default', 'json', 'azure' + argparser = argparse.ArgumentParser( + description='validate pandas documentation') + add = argparser.add_argument + add('page', nargs='?', default=None, + help='page to validate (e.g. io) ' + 'if not provided, all pages are validated') + add('--format', default='default', choices=format_opts, + help='format of the output when validating ' + 'multiple documents (ignored when validating one).' + 'It can be {}'.format(str(format_opts)[1:-1])) + add('--errors', default=None, + help='comma separated ' + 'list of error codes to validate. By default it ' + 'validates all errors (ignored when validating ' + 'a single document)') + add('--exclude', default=None, + help='comma separated ' + 'patterns of pages to exclude. Utilises ' + '`Unix filename pattern matching`' + '(ignored when validating a single document)') + + args = argparser.parse_args() + sys.exit(main(args.page, + args.errors.split(',') if args.errors else None, + args.format, + args.exclude.split(',') if args.exclude else None))