pandas-dev · FHaase · Dec 10, 2018 · Dec 10, 2018 · Dec 10, 2018 · Dec 11, 2018
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
@@ -3,8 +3,14 @@
 set -e
 
 if [ "$DOC" ]; then
-    echo "We are not running pytest as this is a doc-build"
-    exit 0
+    echo ###############################
+    echo #     Lint documentation      #
+    echo ###############################
+
+    MSG='Validate documentation' ; echo $MSG
+    $TRAVIS_BUILD_DIR/scripts/validate_documentation.py
+    RET=$? ; echo $MSG "DONE"
+    exit $RET
 fi
 
 # Workaround for pytest-xdist flaky collection order

diff --git a/doc/make.py b/doc/make.py
@@ -21,6 +21,11 @@
 import webbrowser
 import jinja2
 
+sys.path.extend([
+    # validation for documentation
+    os.path.join(os.path.dirname(__file__), '..', 'scripts')
+])
+import validate_documentation
 
 DOC_PATH = os.path.dirname(os.path.abspath(__file__))
 SOURCE_PATH = os.path.join(DOC_PATH, 'source')
@@ -263,6 +268,8 @@ def html(self):
                 os.remove(zip_fname)
 
         if self.single_doc is not None:
+            checker = validate_documentation.validate_one(self.single_doc)
+            checker.report()
             self._open_browser()
             shutil.rmtree(os.path.join(SOURCE_PATH, 'generated_single'),
                           ignore_errors=True)

diff --git a/scripts/tests/test_validate_documentation.py b/scripts/tests/test_validate_documentation.py
@@ -0,0 +1,50 @@
+import pytest
+
+import docutils.nodes
+import docutils.utils
+
+import validate_documentation
+DocumentChecker = validate_documentation.DocumentChecker
+
+
+@pytest.fixture()
+def doctree():
+    return docutils.utils.new_document('test.rst')
+
+
+@pytest.mark.parametrize('raw_doc,errs', [
+    (['   Indented Line\n'], {}),
+    (['\n'], {}),
+    (['\tIndented Line\n'], {'WS01': [((0, 1), {})]}),
+    (['Line   \n'], {'WS02': [((0, 1), {})]}),
+    (['\t\n'], {'WS03': [((0, 1), {})]}),
+    ([' \n'], {'WS03': [((0, 1), {})]}),
+])
+def test_line_checkers(doctree, raw_doc, errs):
+    checker = DocumentChecker('', raw_doc, doctree)
+    checker.validate()
+
+    assert checker.errs == errs
+
+
+def test_doctree_with_list_in_block_quote(doctree):
+    bullet_list = docutils.nodes.bullet_list()
+    block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list)
+    doctree.append(block_quote)
+
+    checker = DocumentChecker('', [''], doctree)
+    checker.validate()
+
+    assert checker.errs == {'DT01': [(None, {})]}
+
+
+def test_doctree_with_block_quote_in_list(doctree):
+    bullet_list = docutils.nodes.bullet_list()
+    block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list)
+    top_list = docutils.nodes.bullet_list('', block_quote)
+    doctree.append(top_list)
+
+    checker = DocumentChecker('', [''], doctree)
+    result = checker.validate()
+
+    assert result == {'DT01': [(None, {})]}
diff --git a/scripts/validate_documentation.py b/scripts/validate_documentation.py
@@ -0,0 +1,306 @@
+#!/usr/bin/env python
+"""
+Analyze documentation to detect errors.
+
+Requires the documentation to be build before usage.
+
+If no argument is provided, it validates all pages in the source folder for
+which a doctree has been generated.
+
+If a page is provided like "index", "whatsnew/v1.0.0" a list of all errors
+in that page is printed.
+
+Usage::
+    $ ./validate_documentation.py
+    $ ./validate_documentation.py index
+"""
+
+import argparse
+import docutils.nodes
+from fnmatch import fnmatch
+import json
+import os
+import pickle
+import re
+import sys
+
+BASE_DIR = os.path.join(os.path.dirname(__file__), '..', 'doc')
+DOCUMENTATION_SOURCE = os.path.join(BASE_DIR, 'build', 'doctrees', '')
+DOCTREE_PATH = os.path.join(DOCUMENTATION_SOURCE, '{}.doctree')
+RST_PATH = os.path.join(BASE_DIR, 'source', '{}.rst')
+
+ERROR_MSGS = {
+    'DT01': "Found bullet list within block quote. Use 0 spaces for top-level "
+            "list and 2 spaces for sub-lists",
+    'WS01': "Indentation uses tabulation",
+    'WS02': "Trailing whitespaces",
+    'WS03': "Whitespace in empty line",
+}
+
+STARTING_WHITESPACE_RE = re.compile(r'^(\s+).', re.MULTILINE)
+TRAILING_WHITESPACE_RE = re.compile(r'.([\t ]+)$', re.MULTILINE)
+EMPTY_LINE_WHITESPACE_RE = re.compile(r'^([\t ]+)$', re.MULTILINE)
+
+
+class DocumentChecker(object):
+    """
+    Checker to validate one page in the documentation
+
+    Attributes
+    ----------
+    page : str
+        Path to page relative to documentation's source folder without file
+        extension. (e.g. io)
+    doctree : docutils.nodes.document
+        Generated doctree for associated the page
+    raw_lines : list of str
+        Lines from rst-file for associated page
+    raw_doc : str
+        Joined lines from rst-file for associated page
+
+    Notes
+    -----
+    Add a method starting with `check` to add additional checks.
+
+    """
+
+    def __init__(self, page, raw_lines, doctree):
+        self.page = page
+        self.doctree = doctree
+        self.raw_lines = raw_lines
+        self.raw_doc = ''.join(raw_lines)
+        self.errs = None
+
+    def error(self, code, line=None, **kwargs):
+        """
+        Parameters
+        ----------
+        code : str
+            Error code.
+        line : Tuple[int, str]
+        **kwargs
+            Values for the variables in the error messages
+        """
+        errs = self.errs.setdefault(code, [])
+        errs.append((line, kwargs))
+
+    def find_line(self, match):
+        """
+        Find rows in documentation that were matched
+
+        Parameters
+        ----------
+        match : typing.Match
+
+        Returns
+        -------
+        row_start : int
+        row_end : int
+        """
+        if not match:
+            return None
+
+        row_start = self.raw_doc[:match.start(0)].count('\n')
+        row_end = self.raw_doc[:match.end(0)].count('\n')
+        return row_start, row_end + 1
+
+    def check_bullet_list_in_block_quote(self):
+        for node in self.doctree.traverse(docutils.nodes.block_quote):
+            match = node.first_child_matching_class(docutils.nodes.bullet_list)
+            if match is not None:
+                self.error('DT01')
+
+    def check_tabulator_as_indentation(self):
+        matches = STARTING_WHITESPACE_RE.finditer(self.raw_doc)
+        for match in matches:
+            if '\t' in match.group(1):
+                self.error('WS01', line=self.find_line(match))
+
+    def check_line_ends_with_whitespace(self):
+        matches = TRAILING_WHITESPACE_RE.finditer(self.raw_doc)
+        for match in matches:
+            self.error('WS02', line=self.find_line(match))
+
+    def check_empty_line_contains_whitespace(self):
+        matches = EMPTY_LINE_WHITESPACE_RE.finditer(self.raw_doc)
+        for match in matches:
+            self.error('WS03', line=self.find_line(match))
+
+    def validate(self):
+        """Execute methods starting with 'check'"""
+        self.errs = {}
+        for func in dir(self):
+            if func.startswith('check'):
+                self.__class__.__dict__[func](self)
+
+        return self.errs
+
+    def report(self, errors=None, output_format='default'):
+        """
+        Output errors to stdout
+
+        Parameters
+        ----------
+        errors : list of str, optional
+            If provided, filter output by these error codes.
+        output_format : str, optional
+            One of 'default', 'json', 'azure'
+
+        Returns
+        -------
+        int
+            A integer with number of found issues
+        """
+        n_errs = 0
+        if output_format == 'json':
+            output = json.dumps(self.errs)
+        else:
+            if output_format == 'default':
+                output_format = '{path}:{row}:: {code} {text}\n'
+            elif output_format == 'azure':
+                output_format = ('##vso[task.logissue type=error;'
+                                 'sourcepath={path};'
+                                 'linenumber={row};'
+                                 'code={code};'
+                                 ']{text}\n')
+            else:
+                raise ValueError('Unknown output_format "{}"'.format(
+                    output_format))
+
+            output = ''
+
+            for err_code, errs in self.errs.items():
+                # The script would be faster if instead of filtering the
+                # errors after validating them, it didn't validate them
+                # initially. But that would complicate the code too much
+                if errors and err_code not in errors:
+                    continue
+                for line, kwargs in errs:
+                    n_errs += 1
+                    row_start, row_end = line if line else (0, 0)
+                    output += output_format.format(
+                        name=self.page,
+                        path='doc/source/{}.rst'.format(self.page),
+                        row=row_start + 1 if line else '?',
+                        code=err_code,
+                        source=''.join(self.raw_lines[row_start:row_end]),
+                        text=ERROR_MSGS[err_code].format(kwargs))
+
+        sys.stdout.write(output)
+        return n_errs
+
+
+def validate_one(page):
+    """
+    Validate the page for the given page
+
+    Parameters
+    ----------
+    page : str
+        Path to page relative to documentation's source folder without file
+        extension. (e.g. io)
+
+    Returns
+    -------
+    dict
+        A dictionary containing all the information obtained from
+        validating the page.
+
+    Notes
+    -----
+    The errors codes are defined as:
+    - First two characters: Type of errors:
+       * DT: Error with unwanted node constellations inside the doctree
+       * WS: Issues with whitespace characters
+    - Last two characters: Numeric error code
+    """
+    try:
+        with open(DOCTREE_PATH.format(page), 'r+b') as file:
+            doctree = pickle.load(file)
+
+        with open(RST_PATH.format(page), 'r') as file:
+            raw_doc = file.readlines()
+    except FileNotFoundError:
+        return None
+
+    checker = DocumentChecker(page, raw_doc, doctree)
+    checker.validate()
+    return checker
+
+
+def validate_all(exclude_patterns):
+    """
+    Execute the validation of all pages, and return a dict with the
+    results.
+
+    Parameters
+    ----------
+    exclude_patterns : List[str] or None
+        If provided, the pages that match with one of these patterns
+        will be ignored. If None, all pages will be validated.
+
+    Returns
+    -------
+    dict
+        A dictionary with an item for every page containing
+        all the validation information.
+    """
+
+    checkers = {}
+    for root, dirs, files in os.walk(DOCUMENTATION_SOURCE):
+        _, base_dir = root.split(DOCUMENTATION_SOURCE)
+        for file in files:
+            docname, ext = os.path.splitext(file)
+            if not ext == '.doctree':
+                continue
+            page = os.path.join(base_dir, docname)
+            if exclude_patterns:
+                for pattern in exclude_patterns:
+                    if fnmatch(page, pattern):
+                        continue
+
+            checker = validate_one(page)
+            if checker:
+                checkers[page] = checker
+    return checkers
+
+
+def main(page, errors, output_format, exclude_patterns=None):
+    if page:
+        checkers = {page: validate_one(page)}
+    else:
+        checkers = validate_all(exclude_patterns=exclude_patterns)
+    exit_code = 0
+    for page, checker in checkers.items():
+        exit_code += checker.report(errors=errors, output_format=output_format)
+    return exit_code
+
+
+if __name__ == '__main__':
+    format_opts = 'default', 'json', 'azure'
+    argparser = argparse.ArgumentParser(
+        description='validate pandas documentation')
+    add = argparser.add_argument
+    add('page', nargs='?', default=None,
+        help='page to validate (e.g. io) '
+             'if not provided, all pages are validated')
+    add('--format', default='default', choices=format_opts,
+        help='format of the output when validating '
+             'multiple documents (ignored when validating one).'
+             'It can be {}'.format(str(format_opts)[1:-1]))
+    add('--errors', default=None,
+        help='comma separated '
+             'list of error codes to validate. By default it '
+             'validates all errors (ignored when validating '
+             'a single document)')
+    add('--exclude', default=None,
+        help='comma separated '
+             'patterns of pages to exclude. Utilises '
+             '`Unix filename pattern matching`'
+             '(ignored when validating a single document)')
+
+    args = argparser.parse_args()
+    sys.exit(main(args.page,
+                  args.errors.split(',') if args.errors else None,
+                  args.format,
+                  args.exclude.split(',') if args.exclude else None))