pandas-dev · FHaase · Dec 10, 2018 · Dec 10, 2018 · Dec 10, 2018 · Dec 11, 2018
diff --git a/doc/make.py b/doc/make.py
@@ -21,6 +21,14 @@
 import webbrowser
 import jinja2
 
+sys.path.insert(0, os.path.abspath('../scripts'))
+sys.path.extend([
+
+    # numpy standard doc extensions
+    os.path.join(os.path.dirname(__file__), '..', 'scripts')
+
+])
+import validate_documentation
 
 DOC_PATH = os.path.dirname(os.path.abspath(__file__))
 SOURCE_PATH = os.path.join(DOC_PATH, 'source')
@@ -263,6 +271,8 @@ def html(self):
                 os.remove(zip_fname)
 
         if self.single_doc is not None:
+            report = validate_documentation.validate_one(self.single_doc)
+            validate_documentation.report({self.single_doc: report})
             self._open_browser()
             shutil.rmtree(os.path.join(SOURCE_PATH, 'generated_single'),
                           ignore_errors=True)

diff --git a/scripts/tests/test_validate_documentation.py b/scripts/tests/test_validate_documentation.py
@@ -0,0 +1,50 @@
+import pytest
+
+import docutils.nodes
+import docutils.utils
+
+import validate_documentation
+DocumentChecker = validate_documentation.DocumentChecker
+
+
+@pytest.fixture()
+def doctree():
+    return docutils.utils.new_document('test.rst')
+
+
+@pytest.mark.parametrize('raw_doc,errs', [
+    ('   Indented Line\n', {}),
+    ('\n', {}),
+    ('\tIndented Line\n', {'WS01': [(1, '\tIndented Line\n')]}),
+    ('Line   \n', {'WS02': [(1, 'Line   \n')]}),
+    ('\t\n', {'WS01': [(1, '\t\n')], 'WS02': [(1, '\t\n')]}),
+    (' \n', {'WS02': [(1, ' \n')]}),
+])
+def test_line_checkers(doctree, raw_doc, errs):
+    checker = DocumentChecker(raw_doc, doctree)
+    result = checker.validate()
+
+    assert result == errs
+
+
+def test_doctree_with_list_in_block_quote(doctree):
+    bullet_list = docutils.nodes.bullet_list()
+    block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list)
+    doctree.append(block_quote)
+
+    checker = DocumentChecker('', doctree)
+    result = checker.validate()
+
+    assert result == {'DT01': [None]}
+
+
+def test_doctree_with_block_quote_in_list(doctree):
+    bullet_list = docutils.nodes.bullet_list()
+    block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list)
+    top_list = docutils.nodes.bullet_list('', block_quote)
+    doctree.append(top_list)
+
+    checker = DocumentChecker('', doctree)
+    result = checker.validate()
+
+    assert result == {'DT01': [None]}
diff --git a/scripts/validate_documentation.py b/scripts/validate_documentation.py
@@ -0,0 +1,227 @@
+import argparse
+from fnmatch import fnmatch
+import json
+import os
+import pickle
+import re
+import sys
+
+import docutils.nodes
+
+DOCUMENTATION_SOURCE = os.path.join(os.curdir, '../doc/source')
+DOCTREE_PATH = '../doc/build/doctrees/{}.doctree'
+RST_PATH = '../doc/source/{}.rst'
+
+ERROR_MSGS = {
+    'DT01': "Found bullet list within block quote. Use 0 spaces for top-level "
+            "list and 2 spaces for sub-lists",
+    'WS01': "Indentation uses tabulation",
+    'WS02': "Trailing whitespaces",
+}
+
+STARTING_WHITESPACE_RE = re.compile(r'^(\s+).*\n$', re.MULTILINE)
+TRAILING_WHITESPACE_RE = re.compile(r'^.*([\t ]+)\n$', re.MULTILINE)
+
+
+class DocumentChecker(object):
+
+    def __init__(self, raw_doc, doctree):
+        self.doctree = doctree
+        self.raw_doc = raw_doc
+
+        self.issues = None
+
+    def issue(self, code, match=None, **kwargs):
+        """
+        Parameters
+        ----------
+        code : str
+            Error code.
+        **kwargs
+            Values for the variables in the error messages
+        """
+        issue = self.issues.setdefault(code, [])
+        issue.append((self.find_line(match), kwargs))
+
+    def find_line(self, match):
+        if not match:
+            return None
+        lines = self.raw_doc[:match.start(0)].splitlines()
+        return len(lines) + 1, match.group(0)
+
+    def check_bullet_list_in_block_quote(self):
+        for node in self.doctree.traverse(docutils.nodes.block_quote):
+            match = node.first_child_matching_class(docutils.nodes.bullet_list)
+            if match is not None:
+                self.issue('DT01')
+
+    def check_tabulator_as_indentation(self):
+        matches = STARTING_WHITESPACE_RE.finditer(self.raw_doc)
+        for match in matches:
+            if '\t' in match.group(1):
+                self.issue('WS01', match)
+
+    def check_line_ends_with_whitespace(self):
+        matches = TRAILING_WHITESPACE_RE.finditer(self.raw_doc)
+        for match in matches:
+            self.issue('WS02', match)
+
+    def validate(self):
+        self.issues = {}
+        for func in dir(self):
+            if func.startswith('check'):
+                self.__class__.__dict__[func](self)
+
+        return self.issues
+
+
+def report(reports, output_format='default', errors=None):
+    exit_status = 0
+    if output_format == 'json':
+        output = json.dumps(reports)
+    else:
+        if output_format == 'default':
+            output_format = '{text}\n'
+        elif output_format == 'azure':
+            output_format = ('##vso[task.logissue type=error;'
+                             'sourcepath={path};'
+                             'linenumber={row};'
+                             'code={code};'
+                             ']{text}\n')
+        else:
+            raise ValueError('Unknown output_format "{}"'.format(
+                output_format))
+
+        output = ''
+        for name, res in reports.items():
+            for err_code, issues in res.items():
+                # The script would be faster if instead of filtering the
+                # errors after validating them, it didn't validate them
+                # initially. But that would complicate the code too much
+                if errors and err_code not in errors:
+                    continue
+                for issue, kwargs in issues:
+                    exit_status += 1
+                    row = issue[0] if issue else None
+                    output += output_format.format(
+                        name=name,
+                        path=RST_PATH.format(name),
+                        row=row,
+                        code=err_code,
+                        text='{}{}:: {}'.format(name,
+                                                ':' + row if row else '',
+                                                ERROR_MSGS[err_code]
+                                                .format(kwargs)))
+
+    sys.stdout.write(output)
+    return exit_status
+
+
+def validate_one(page):
+    """
+        Validate the page for the given func_name
+
+        Parameters
+        ----------
+        page : str
+            Path to page relative to documentation's source folder without file
+            extension. (e.g. io)
+
+        Returns
+        -------
+        dict
+            A dictionary containing all the information obtained from
+            validating the page.
+
+        Notes
+        -----
+        The errors codes are defined as:
+        - First two characters: Type of errors:
+           * DT: Error with unwanted node constellations the doctree
+           * WS: Issues regarding whitespace characters
+        - Last two characters: Numeric error code
+        """
+
+    with open(DOCTREE_PATH.format(page), 'r+b') as file:
+        doctree = pickle.load(file)
+
+    with open(RST_PATH.format(page), 'r') as file:
+        raw_doc = file.read()
+
+    checker = DocumentChecker(raw_doc, doctree)
+    return checker.validate()
+
+
+def validate_all(exclude_patterns):
+    """
+    Execute the validation of all pages, and return a dict with the
+    results.
+
+    Parameters
+    ----------
+    exclude_patterns : List[str] or None
+        If provided, the pages that match with one of these patterns
+        will be ignored. If None, all pages will be validated.
+
+    Returns
+    -------
+    dict
+        A dictionary with an item for every page containing
+        all the validation information.
+    """
+
+    result = {}
+    for root, dirs, files in os.walk(DOCUMENTATION_SOURCE):
+        _, base_dir = root.split('../doc/source')
+        for file in files:
+            docname, ext = os.path.splitext(file)
+            if not ext == '.rst':
+                continue
+            page = os.path.join(base_dir, docname)
+            for pattern in exclude_patterns:
+                if fnmatch(page, pattern):
+                    continue
+
+            result[page] = validate_one(page)
+    return result
+
+
+def main(page, errors, output_format, exclude_patterns=None):
+    if page:
+        reports = {page: validate_one(page)}
+    else:
+        reports = validate_all(exclude_patterns=exclude_patterns)
+    return report(reports, output_format=output_format, errors=errors)
+
+
+if __name__ == '__main__':
+    format_opts = 'default', 'json', 'azure'
+    func_help = ('document to validate (e.g. io) '
+                 'if not provided, all documents are validated')
+    argparser = argparse.ArgumentParser(
+        description='validate pandas documentation')
+    add = argparser.add_argument
+    add('document',
+        nargs='?',
+        default=None,
+        help=func_help)
+    add('--format', default='default', choices=format_opts,
+        help='format of the output when validating '
+             'multiple documents (ignored when validating one).'
+             'It can be {}'.format(str(format_opts)[1:-1]))
+    add('--errors', default=None,
+        help='comma separated '
+             'list of error codes to validate. By default it '
+             'validates all errors (ignored when validating '
+             'a single document)')
+    add('--exclude', default=None,
+        help='comma separated '
+             'patterns of pages to exclude. By default it '
+             'validates all errors (ignored when validating '
+             'a single document)')
+
+    args = argparser.parse_args()
+    sys.exit(main(args.document,
+                  args.errors.split(',') if args.errors else None,
+                  args.format,
+                  args.exclude.split(',')))