From 534eb70c429391344a3692fb22750d1946119db6 Mon Sep 17 00:00:00 2001
From: Fabian Haase <haase.fabian@gmail.com>
Date: Mon, 10 Dec 2018 18:45:59 +0100
Subject: [PATCH 1/9] Validate documentation

Signed-off-by: Fabian Haase <haase.fabian@gmail.com>
---
 doc/make.py                                  |   9 +
 scripts/tests/test_validate_documentation.py |  49 ++++++
 scripts/validate_documentation.py            | 174 +++++++++++++++++++
 3 files changed, 232 insertions(+)
 create mode 100644 scripts/tests/test_validate_documentation.py
 create mode 100644 scripts/validate_documentation.py

diff --git a/doc/make.py b/doc/make.py
index 0a3a7483fcc91..33c3f16c1d140 100755
--- a/doc/make.py
+++ b/doc/make.py
@@ -21,6 +21,14 @@
 import webbrowser
 import jinja2
 
+sys.path.insert(0, os.path.abspath('../scripts'))
+sys.path.extend([
+
+    # numpy standard doc extensions
+    os.path.join(os.path.dirname(__file__), '..', 'scripts')
+
+])
+import validate_documentation
 
 DOC_PATH = os.path.dirname(os.path.abspath(__file__))
 SOURCE_PATH = os.path.join(DOC_PATH, 'source')
@@ -263,6 +271,7 @@ def html(self):
                 os.remove(zip_fname)
 
         if self.single_doc is not None:
+            validate_documentation.main(self.single_doc, None, 'default')
             self._open_browser()
             shutil.rmtree(os.path.join(SOURCE_PATH, 'generated_single'),
                           ignore_errors=True)
diff --git a/scripts/tests/test_validate_documentation.py b/scripts/tests/test_validate_documentation.py
new file mode 100644
index 0000000000000..930f3cbc74f66
--- /dev/null
+++ b/scripts/tests/test_validate_documentation.py
@@ -0,0 +1,49 @@
+import pytest
+
+import docutils.nodes
+import docutils.utils
+
+import validate_documentation
+DocumentChecker = validate_documentation.DocumentChecker
+
+
+@pytest.fixture()
+def doctree():
+    return docutils.utils.new_document('test.rst')
+
+@pytest.mark.parametrize('raw_doc,errs', [
+    ('   Indented Line\n', {}),
+    ('\n', {}),
+    ('\tIndented Line\n', {'WS01': [(1, '\tIndented Line\n')]}),
+    ('Line   \n', {'WS02': [(1, 'Line   \n')]}),
+    ('\t\n', {'WS01': [(1, '\t\n')], 'WS02': [(1, '\t\n')]}),
+    (' \n', {'WS02': [(1, ' \n')]}),
+])
+def test_line_checkers(doctree, raw_doc, errs):
+    checker = DocumentChecker(raw_doc, doctree)
+    result = checker.validate()
+
+    assert result == errs
+
+
+def test_doctree_with_list_in_block_quote(doctree):
+    bullet_list = docutils.nodes.bullet_list()
+    block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list)
+    doctree.append(block_quote)
+
+    checker = DocumentChecker('', doctree)
+    result = checker.validate()
+
+    assert result == {'DT01': [None]}
+
+
+def test_doctree_with_block_quote_in_list(doctree):
+    bullet_list = docutils.nodes.bullet_list()
+    block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list)
+    top_list = docutils.nodes.bullet_list('', block_quote)
+    doctree.append(top_list)
+
+    checker = DocumentChecker('', doctree)
+    result = checker.validate()
+
+    assert result == {'DT01': [None]}
diff --git a/scripts/validate_documentation.py b/scripts/validate_documentation.py
new file mode 100644
index 0000000000000..a2e893ac86144
--- /dev/null
+++ b/scripts/validate_documentation.py
@@ -0,0 +1,174 @@
+import argparse
+import json
+import os
+import pickle
+import re
+import sys
+
+import docutils.nodes
+
+DOCTREE_PATH = '../doc/build/doctrees/{}.doctree'
+RST_PATH = '../doc/source/{}.rst'
+
+ERROR_MSGS = {
+    'DT01': "Found bullet list within block quote",
+    'WS01': "Indentation uses tabulation",
+    'WS02': "Trailing whitespaces",
+    'WS03': "Whitespaces in empty line",
+}
+
+STARTING_WHITESPACE_RE = re.compile(r'^(\s+).*\n$', re.MULTILINE)
+TRAILING_WHITESPACE_RE = re.compile(r'^.*([\t ]+)\n$', re.MULTILINE)
+
+
+class DocumentChecker(object):
+
+    def __init__(self, raw_doc, doctree):
+        self.doctree = doctree
+        self.raw_doc = raw_doc
+
+        self.issues = None
+
+    def issue(self, code, match=None, **kwargs):
+        """
+        Parameters
+        ----------
+        code : str
+            Error code.
+        **kwargs
+            Values for the variables in the error messages
+        """
+        issue = self.issues.setdefault(code, [])
+        issue.append((self.find_line(match), kwargs))
+
+    def find_line(self, match):
+        if not match:
+            return None
+        lines = self.raw_doc[:match.start(0)].splitlines()
+        return len(lines) + 1, match.group(0)
+
+    def check_bullet_list_in_block_quote(self):
+        for node in self.doctree.traverse(docutils.nodes.block_quote):
+            match = node.first_child_matching_class(docutils.nodes.bullet_list)
+            if match is not None:
+                self.issue('DT01')
+
+    def check_tabulator_as_indentation(self):
+        matches = STARTING_WHITESPACE_RE.finditer(self.raw_doc)
+        for match in matches:
+            if '\t' in match.group(1):
+                self.issue('WS01', match)
+
+    def check_line_ends_with_whitespace(self):
+        matches = TRAILING_WHITESPACE_RE.finditer(self.raw_doc)
+        for match in matches:
+            self.issue('WS02', match)
+
+    def validate(self):
+        self.issues = {}
+        for func in dir(self):
+            if func.startswith('check'):
+                self.__class__.__dict__[func](self)
+
+        return self.issues
+
+
+def report(reports, output_format='default', errors=None):
+    exit_status = 0
+    if output_format == 'json':
+        output = json.dumps(reports)
+    else:
+        if output_format == 'default':
+            output_format = '{text}\n'
+        elif output_format == 'azure':
+            output_format = ('##vso[task.logissue type=error;'
+                             'sourcepath={path};'
+                             'linenumber={row};'
+                             'code={code};'
+                             ']{text}\n')
+        else:
+            raise ValueError('Unknown output_format "{}"'.format(
+                output_format))
+
+        output = ''
+        for name, res in reports.items():
+            for err_code, issues in res.items():
+                # The script would be faster if instead of filtering the
+                # errors after validating them, it didn't validate them
+                # initially. But that would complicate the code too much
+                if errors and err_code not in errors:
+                    continue
+                for issue, kwargs in issues:
+                    exit_status += 1
+                    row = issue[0] if issue else None
+                    output += output_format.format(
+                        name=name,
+                        path=RST_PATH.format(name),
+                        row=row,
+                        code=err_code,
+                        text='{}{}:: {}'.format(name,
+                                                ':' + row if row else '',
+                                                ERROR_MSGS[err_code]
+                                                .format(kwargs)))
+
+    sys.stdout.write(output)
+    return exit_status
+
+
+def validate(*docnames):
+    result = {}
+    for docname in docnames:
+        with open(DOCTREE_PATH.format(docname), 'r+b') as file:
+            doctree = pickle.load(file)
+
+        with open(RST_PATH.format(docname), 'r') as file:
+            raw_doc = file.read()
+
+        checker = DocumentChecker(raw_doc, doctree)
+        result[docname] = checker.validate()
+
+    return result
+
+
+def main(document, errors, output_format):
+    if document:
+        reports = validate(document)
+    else:
+        documentation_source = os.path.join(os.curdir, '../doc/source')
+        docnames = []
+        for root, dirs, files in os.walk(documentation_source):
+            _, base_dir = root.split('../doc/source')
+            for file in files:
+                docname, ext = os.path.splitext(file)
+                if not ext == '.rst':
+                    continue
+                docnames.append(os.path.join(base_dir, docname))
+        reports = validate(*docnames)
+    return report(reports, output_format=output_format, errors=errors)
+
+
+if __name__ == '__main__':
+    format_opts = 'default', 'json', 'azure'
+    func_help = ('document to validate (e.g. io) '
+                 'if not provided, all documents are validated')
+    argparser = argparse.ArgumentParser(
+        description='validate pandas documentation')
+    add = argparser.add_argument
+    add('document',
+        nargs='?',
+        default=None,
+        help=func_help)
+    add('--format', default='default', choices=format_opts,
+        help='format of the output when validating '
+             'multiple documents (ignored when validating one).'
+             'It can be {}'.format(str(format_opts)[1:-1]))
+    add('--errors', default=None,
+        help='comma separated '
+             'list of error codes to validate. By default it '
+             'validates all errors (ignored when validating '
+             'a single document)')
+
+    args = argparser.parse_args()
+    sys.exit(main(args.document,
+                  args.errors.split(',') if args.errors else None,
+                  args.format, ))

From 7aa12ad5feaf148e55f1ea5cad9c3708069b884d Mon Sep 17 00:00:00 2001
From: Fabian Haase <haase.fabian@gmail.com>
Date: Tue, 11 Dec 2018 00:51:21 +0100
Subject: [PATCH 2/9] Match style of validate_docstrings

Signed-off-by: Fabian Haase <haase.fabian@gmail.com>
---
 doc/make.py                                  |   3 +-
 scripts/tests/test_validate_documentation.py |   1 +
 scripts/validate_documentation.py            | 103 ++++++++++++++-----
 3 files changed, 81 insertions(+), 26 deletions(-)

diff --git a/doc/make.py b/doc/make.py
index 33c3f16c1d140..3f6bd55db9169 100755
--- a/doc/make.py
+++ b/doc/make.py
@@ -271,7 +271,8 @@ def html(self):
                 os.remove(zip_fname)
 
         if self.single_doc is not None:
-            validate_documentation.main(self.single_doc, None, 'default')
+            report = validate_documentation.validate_one(self.single_doc)
+            validate_documentation.report({self.single_doc: report})
             self._open_browser()
             shutil.rmtree(os.path.join(SOURCE_PATH, 'generated_single'),
                           ignore_errors=True)
diff --git a/scripts/tests/test_validate_documentation.py b/scripts/tests/test_validate_documentation.py
index 930f3cbc74f66..e5edc4f566f1b 100644
--- a/scripts/tests/test_validate_documentation.py
+++ b/scripts/tests/test_validate_documentation.py
@@ -11,6 +11,7 @@
 def doctree():
     return docutils.utils.new_document('test.rst')
 
+
 @pytest.mark.parametrize('raw_doc,errs', [
     ('   Indented Line\n', {}),
     ('\n', {}),
diff --git a/scripts/validate_documentation.py b/scripts/validate_documentation.py
index a2e893ac86144..1f253861df06f 100644
--- a/scripts/validate_documentation.py
+++ b/scripts/validate_documentation.py
@@ -1,4 +1,5 @@
 import argparse
+from fnmatch import fnmatch
 import json
 import os
 import pickle
@@ -7,14 +8,15 @@
 
 import docutils.nodes
 
+DOCUMENTATION_SOURCE = os.path.join(os.curdir, '../doc/source')
 DOCTREE_PATH = '../doc/build/doctrees/{}.doctree'
 RST_PATH = '../doc/source/{}.rst'
 
 ERROR_MSGS = {
-    'DT01': "Found bullet list within block quote",
+    'DT01': "Found bullet list within block quote. Use 0 spaces for top-level "
+            "list and 2 spaces for sub-lists",
     'WS01': "Indentation uses tabulation",
     'WS02': "Trailing whitespaces",
-    'WS03': "Whitespaces in empty line",
 }
 
 STARTING_WHITESPACE_RE = re.compile(r'^(\s+).*\n$', re.MULTILINE)
@@ -115,35 +117,80 @@ def report(reports, output_format='default', errors=None):
     return exit_status
 
 
-def validate(*docnames):
-    result = {}
-    for docname in docnames:
-        with open(DOCTREE_PATH.format(docname), 'r+b') as file:
-            doctree = pickle.load(file)
+def validate_one(page):
+    """
+        Validate the page for the given func_name
+
+        Parameters
+        ----------
+        page : str
+            Path to page relative to documentation's source folder without file
+            extension. (e.g. io)
+
+        Returns
+        -------
+        dict
+            A dictionary containing all the information obtained from validating
+            the page.
+
+        Notes
+        -----
+        The errors codes are defined as:
+        - First two characters: Type of errors:
+           * DT: Error with unwanted node constellations the doctree
+           * WS: Issues regarding whitespace characters
+        - Last two characters: Numeric error code
+        """
+
+    with open(DOCTREE_PATH.format(page), 'r+b') as file:
+        doctree = pickle.load(file)
+
+    with open(RST_PATH.format(page), 'r') as file:
+        raw_doc = file.read()
+
+    checker = DocumentChecker(raw_doc, doctree)
+    return checker.validate()
+
 
-        with open(RST_PATH.format(docname), 'r') as file:
-            raw_doc = file.read()
+def validate_all(exclude_patterns):
+    """
+    Execute the validation of all pages, and return a dict with the
+    results.
 
-        checker = DocumentChecker(raw_doc, doctree)
-        result[docname] = checker.validate()
+    Parameters
+    ----------
+    exclude_patterns : List[str] or None
+        If provided, the pages that match with one of these patterns
+        will be ignored. If None, all pages will be validated.
 
+    Returns
+    -------
+    dict
+        A dictionary with an item for every page containing
+        all the validation information.
+    """
+
+    result = {}
+    for root, dirs, files in os.walk(DOCUMENTATION_SOURCE):
+        _, base_dir = root.split('../doc/source')
+        for file in files:
+            docname, ext = os.path.splitext(file)
+            if not ext == '.rst':
+                continue
+            page = os.path.join(base_dir, docname)
+            for pattern in exclude_patterns:
+                if fnmatch(page, pattern):
+                    continue
+
+            result[page] = validate_one(page)
     return result
 
 
-def main(document, errors, output_format):
-    if document:
-        reports = validate(document)
+def main(page, errors, output_format, exclude_patterns=None):
+    if page:
+        reports = {page: validate_one(page)}
     else:
-        documentation_source = os.path.join(os.curdir, '../doc/source')
-        docnames = []
-        for root, dirs, files in os.walk(documentation_source):
-            _, base_dir = root.split('../doc/source')
-            for file in files:
-                docname, ext = os.path.splitext(file)
-                if not ext == '.rst':
-                    continue
-                docnames.append(os.path.join(base_dir, docname))
-        reports = validate(*docnames)
+        reports = validate_all(exclude_patterns=exclude_patterns)
     return report(reports, output_format=output_format, errors=errors)
 
 
@@ -167,8 +214,14 @@ def main(document, errors, output_format):
              'list of error codes to validate. By default it '
              'validates all errors (ignored when validating '
              'a single document)')
+    add('--exclude', default=None,
+        help='comma separated '
+             'patterns of pages to exclude. By default it '
+             'validates all errors (ignored when validating '
+             'a single document)')
 
     args = argparser.parse_args()
     sys.exit(main(args.document,
                   args.errors.split(',') if args.errors else None,
-                  args.format, ))
+                  args.format,
+                  args.exclude.split(',')))

From 93974fb7dce50a9bdc2233d544e17d00f45ed43d Mon Sep 17 00:00:00 2001
From: Fabian Haase <haase.fabian@gmail.com>
Date: Tue, 11 Dec 2018 00:55:23 +0100
Subject: [PATCH 3/9] Fix flake8 issues

Signed-off-by: Fabian Haase <haase.fabian@gmail.com>
---
 scripts/validate_documentation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/validate_documentation.py b/scripts/validate_documentation.py
index 1f253861df06f..cdb56cc8a2c4a 100644
--- a/scripts/validate_documentation.py
+++ b/scripts/validate_documentation.py
@@ -130,8 +130,8 @@ def validate_one(page):
         Returns
         -------
         dict
-            A dictionary containing all the information obtained from validating
-            the page.
+            A dictionary containing all the information obtained from
+            validating the page.
 
         Notes
         -----

From 5e7bca463ea7a5a4d0e9689747a5cf70689af15b Mon Sep 17 00:00:00 2001
From: Fabian Haase <haase.fabian@gmail.com>
Date: Tue, 11 Dec 2018 03:03:47 +0100
Subject: [PATCH 4/9] Review of @datapythonista

Signed-off-by: Fabian Haase <haase.fabian@gmail.com>
---
 ci/build_docs.sh                             |  12 +-
 doc/make.py                                  |   9 +-
 scripts/tests/test_validate_documentation.py |  28 +-
 scripts/validate_documentation.py            | 258 ++++++++++++-------
 4 files changed, 195 insertions(+), 112 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index f89c4369dff4a..4ea4a0a9194aa 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -8,6 +8,8 @@ fi
 cd "$TRAVIS_BUILD_DIR"/doc
 echo "inside $0"
 
+RET=0
+
 if [ "$DOC" ]; then
 
     echo "Will build docs"
@@ -19,6 +21,14 @@ if [ "$DOC" ]; then
     echo ./make.py
     ./make.py
 
+    echo ###############################
+    echo #     Lint documentation      #
+    echo ###############################
+
+    MSG='Validate documentation' ; echo $MSG
+    $TRAVIS_BUILD_DIR/scripts/validate_docstrings.py io
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
     echo ########################
     echo # Create and send docs #
     echo ########################
@@ -47,4 +57,4 @@ if [ "$DOC" ]; then
     git push origin gh-pages -f
 fi
 
-exit 0
+exit $RET
diff --git a/doc/make.py b/doc/make.py
index 3f6bd55db9169..1ed9dc1c17417 100755
--- a/doc/make.py
+++ b/doc/make.py
@@ -21,12 +21,9 @@
 import webbrowser
 import jinja2
 
-sys.path.insert(0, os.path.abspath('../scripts'))
 sys.path.extend([
-
-    # numpy standard doc extensions
+    # validation for documentation
     os.path.join(os.path.dirname(__file__), '..', 'scripts')
-
 ])
 import validate_documentation
 
@@ -271,8 +268,8 @@ def html(self):
                 os.remove(zip_fname)
 
         if self.single_doc is not None:
-            report = validate_documentation.validate_one(self.single_doc)
-            validate_documentation.report({self.single_doc: report})
+            checker = validate_documentation.validate_one(self.single_doc)
+            checker.report()
             self._open_browser()
             shutil.rmtree(os.path.join(SOURCE_PATH, 'generated_single'),
                           ignore_errors=True)
diff --git a/scripts/tests/test_validate_documentation.py b/scripts/tests/test_validate_documentation.py
index e5edc4f566f1b..eb4ac493b33dd 100644
--- a/scripts/tests/test_validate_documentation.py
+++ b/scripts/tests/test_validate_documentation.py
@@ -13,18 +13,18 @@ def doctree():
 
 
 @pytest.mark.parametrize('raw_doc,errs', [
-    ('   Indented Line\n', {}),
-    ('\n', {}),
-    ('\tIndented Line\n', {'WS01': [(1, '\tIndented Line\n')]}),
-    ('Line   \n', {'WS02': [(1, 'Line   \n')]}),
-    ('\t\n', {'WS01': [(1, '\t\n')], 'WS02': [(1, '\t\n')]}),
-    (' \n', {'WS02': [(1, ' \n')]}),
+    (['   Indented Line\n'], {}),
+    (['\n'], {}),
+    (['\tIndented Line\n'], {'WS01': [((0, 1), {})]}),
+    (['Line   \n'], {'WS02': [((0, 1), {})]}),
+    (['\t\n'], {'WS03': [((0, 1), {})]}),
+    ([' \n'], {'WS03': [((0, 1), {})]}),
 ])
 def test_line_checkers(doctree, raw_doc, errs):
-    checker = DocumentChecker(raw_doc, doctree)
-    result = checker.validate()
+    checker = DocumentChecker('', raw_doc, doctree)
+    checker.validate()
 
-    assert result == errs
+    assert checker.errs == errs
 
 
 def test_doctree_with_list_in_block_quote(doctree):
@@ -32,10 +32,10 @@ def test_doctree_with_list_in_block_quote(doctree):
     block_quote = docutils.nodes.block_quote('BlockQuote', bullet_list)
     doctree.append(block_quote)
 
-    checker = DocumentChecker('', doctree)
-    result = checker.validate()
+    checker = DocumentChecker('', [''], doctree)
+    checker.validate()
 
-    assert result == {'DT01': [None]}
+    assert checker.errs == {'DT01': [(None, {})]}
 
 
 def test_doctree_with_block_quote_in_list(doctree):
@@ -44,7 +44,7 @@ def test_doctree_with_block_quote_in_list(doctree):
     top_list = docutils.nodes.bullet_list('', block_quote)
     doctree.append(top_list)
 
-    checker = DocumentChecker('', doctree)
+    checker = DocumentChecker('', [''], doctree)
     result = checker.validate()
 
-    assert result == {'DT01': [None]}
+    assert result == {'DT01': [(None, {})]}
diff --git a/scripts/validate_documentation.py b/scripts/validate_documentation.py
index cdb56cc8a2c4a..aa821a622ab75 100644
--- a/scripts/validate_documentation.py
+++ b/scripts/validate_documentation.py
@@ -1,3 +1,19 @@
+"""
+Analyze documentation to detect errors.
+
+Requires the documentation to be build before usage.
+
+If no argument is provided, it validates all pages in the source folder for
+which a doctree has been generated.
+
+If a page is provided like "index", "whatsnew/v1.0.0" a list of all errors
+in that page is printed.
+
+Usage::
+    $ ./validate_documentation.py
+    $ ./validate_documentation.py index
+"""
+
 import argparse
 from fnmatch import fnmatch
 import json
@@ -8,148 +24,207 @@
 
 import docutils.nodes
 
-DOCUMENTATION_SOURCE = os.path.join(os.curdir, '../doc/source')
-DOCTREE_PATH = '../doc/build/doctrees/{}.doctree'
-RST_PATH = '../doc/source/{}.rst'
+BASE_DIR = os.path.join(os.path.dirname(__file__), '..', 'doc')
+DOCUMENTATION_SOURCE = os.path.join(BASE_DIR, 'build', 'doctrees', '')
+DOCTREE_PATH = os.path.join(DOCUMENTATION_SOURCE, '{}.doctree')
+RST_PATH = os.path.join(BASE_DIR, 'source', '{}.rst')
 
 ERROR_MSGS = {
     'DT01': "Found bullet list within block quote. Use 0 spaces for top-level "
             "list and 2 spaces for sub-lists",
     'WS01': "Indentation uses tabulation",
     'WS02': "Trailing whitespaces",
+    'WS03': "Whitespace in empty line",
 }
 
-STARTING_WHITESPACE_RE = re.compile(r'^(\s+).*\n$', re.MULTILINE)
-TRAILING_WHITESPACE_RE = re.compile(r'^.*([\t ]+)\n$', re.MULTILINE)
+STARTING_WHITESPACE_RE = re.compile(r'^(\s+).', re.MULTILINE)
+TRAILING_WHITESPACE_RE = re.compile(r'.([\t ]+)$', re.MULTILINE)
+EMPTY_LINE_WHITESPACE_RE = re.compile(r'^([\t ]+)$', re.MULTILINE)
 
 
 class DocumentChecker(object):
+    """
+    Checker to validate one page in the documentation
 
-    def __init__(self, raw_doc, doctree):
+    Attributes
+    ----------
+    page : str
+        Path to page relative to documentation's source folder without file
+        extension. (e.g. io)
+    doctree : docutils.nodes.document
+        Generated doctree for associated the page
+    raw_lines : list of str
+        Lines from rst-file for associated page
+    raw_doc : str
+        Joined lines from rst-file for associated page
+
+    Notes
+    -----
+    Add a method starting with `check` to add additional checks.
+
+    """
+
+    def __init__(self, page, raw_lines, doctree):
+        self.page = page
         self.doctree = doctree
-        self.raw_doc = raw_doc
+        self.raw_lines = raw_lines
+        self.raw_doc = ''.join(raw_lines)
 
-        self.issues = None
+        self.errs = None
 
-    def issue(self, code, match=None, **kwargs):
+    def error(self, code, line=None, **kwargs):
         """
         Parameters
         ----------
         code : str
             Error code.
+        line : Tuple[int, str]
         **kwargs
             Values for the variables in the error messages
         """
-        issue = self.issues.setdefault(code, [])
-        issue.append((self.find_line(match), kwargs))
+        errs = self.errs.setdefault(code, [])
+        errs.append((line, kwargs))
 
     def find_line(self, match):
+        """
+        Find rows in documentation that were matched
+
+        Parameters
+        ----------
+        match : typing.Match
+
+        Returns
+        -------
+        row_start : int
+        row_end : int
+        """
         if not match:
             return None
-        lines = self.raw_doc[:match.start(0)].splitlines()
-        return len(lines) + 1, match.group(0)
+
+        row_start = self.raw_doc[:match.start(0)].count('\n')
+        row_end = self.raw_doc[:match.end(0)].count('\n')
+        return row_start, row_end + 1
 
     def check_bullet_list_in_block_quote(self):
         for node in self.doctree.traverse(docutils.nodes.block_quote):
             match = node.first_child_matching_class(docutils.nodes.bullet_list)
             if match is not None:
-                self.issue('DT01')
+                self.error('DT01')
 
     def check_tabulator_as_indentation(self):
         matches = STARTING_WHITESPACE_RE.finditer(self.raw_doc)
         for match in matches:
             if '\t' in match.group(1):
-                self.issue('WS01', match)
+                self.error('WS01', line=self.find_line(match))
 
     def check_line_ends_with_whitespace(self):
         matches = TRAILING_WHITESPACE_RE.finditer(self.raw_doc)
         for match in matches:
-            self.issue('WS02', match)
+            self.error('WS02', line=self.find_line(match))
+
+    def check_empty_line_contains_whitespace(self):
+        matches = EMPTY_LINE_WHITESPACE_RE.finditer(self.raw_doc)
+        for match in matches:
+            self.error('WS03', line=self.find_line(match))
 
     def validate(self):
-        self.issues = {}
+        """Execute methods starting with 'check'"""
+        self.errs = {}
         for func in dir(self):
             if func.startswith('check'):
                 self.__class__.__dict__[func](self)
 
-        return self.issues
+        return self.errs
+
+    def report(self, errors=None, output_format='default'):
+        """
+        Output errors to stdout
 
+        Parameters
+        ----------
+        errors : list of str, optional
+            If provided, filter output by these error codes.
+        output_format : str, optional
+            One of 'default', 'json', 'azure'
 
-def report(reports, output_format='default', errors=None):
-    exit_status = 0
-    if output_format == 'json':
-        output = json.dumps(reports)
-    else:
-        if output_format == 'default':
-            output_format = '{text}\n'
-        elif output_format == 'azure':
-            output_format = ('##vso[task.logissue type=error;'
-                             'sourcepath={path};'
-                             'linenumber={row};'
-                             'code={code};'
-                             ']{text}\n')
+        Returns
+        -------
+        int
+            A integer with number of found issues
+        """
+        n_errs = 0
+        if output_format == 'json':
+            output = json.dumps(self.errs)
         else:
-            raise ValueError('Unknown output_format "{}"'.format(
-                output_format))
-
-        output = ''
-        for name, res in reports.items():
-            for err_code, issues in res.items():
+            if output_format == 'default':
+                output_format = '{path}:{row}:: {code} {text}\n{source}\n'
+            elif output_format == 'azure':
+                output_format = ('##vso[task.logissue type=error;'
+                                 'sourcepath={path};'
+                                 'linenumber={row};'
+                                 'code={code};'
+                                 ']{text}\n')
+            else:
+                raise ValueError('Unknown output_format "{}"'.format(
+                    output_format))
+
+            output = ''
+
+            for err_code, errs in self.errs.items():
                 # The script would be faster if instead of filtering the
                 # errors after validating them, it didn't validate them
                 # initially. But that would complicate the code too much
                 if errors and err_code not in errors:
                     continue
-                for issue, kwargs in issues:
-                    exit_status += 1
-                    row = issue[0] if issue else None
+                for line, kwargs in errs:
+                    n_errs += 1
+                    row_start, row_end = line if line else (0, 0)
                     output += output_format.format(
-                        name=name,
-                        path=RST_PATH.format(name),
-                        row=row,
+                        name=self.page,
+                        path='doc/source/{}.rst'.format(self.page),
+                        row=row_start + 1,
                         code=err_code,
-                        text='{}{}:: {}'.format(name,
-                                                ':' + row if row else '',
-                                                ERROR_MSGS[err_code]
-                                                .format(kwargs)))
+                        source=''.join(self.raw_lines[row_start:row_end]),
+                        text=ERROR_MSGS[err_code].format(kwargs))
 
-    sys.stdout.write(output)
-    return exit_status
+        sys.stdout.write(output)
+        return n_errs
 
 
 def validate_one(page):
     """
-        Validate the page for the given func_name
+    Validate the page for the given page
 
-        Parameters
-        ----------
-        page : str
-            Path to page relative to documentation's source folder without file
-            extension. (e.g. io)
+    Parameters
+    ----------
+    page : str
+        Path to page relative to documentation's source folder without file
+        extension. (e.g. io)
 
-        Returns
-        -------
-        dict
-            A dictionary containing all the information obtained from
-            validating the page.
-
-        Notes
-        -----
-        The errors codes are defined as:
-        - First two characters: Type of errors:
-           * DT: Error with unwanted node constellations the doctree
-           * WS: Issues regarding whitespace characters
-        - Last two characters: Numeric error code
-        """
+    Returns
+    -------
+    dict
+        A dictionary containing all the information obtained from
+        validating the page.
+
+    Notes
+    -----
+    The errors codes are defined as:
+    - First two characters: Type of errors:
+       * DT: Error with unwanted node constellations inside the doctree
+       * WS: Issues with whitespace characters
+    - Last two characters: Numeric error code
+    """
 
     with open(DOCTREE_PATH.format(page), 'r+b') as file:
         doctree = pickle.load(file)
 
     with open(RST_PATH.format(page), 'r') as file:
-        raw_doc = file.read()
+        raw_doc = file.readlines()
 
-    checker = DocumentChecker(raw_doc, doctree)
-    return checker.validate()
+    checker = DocumentChecker(page, raw_doc, doctree)
+    checker.validate()
+    return checker
 
 
 def validate_all(exclude_patterns):
@@ -172,15 +247,16 @@ def validate_all(exclude_patterns):
 
     result = {}
     for root, dirs, files in os.walk(DOCUMENTATION_SOURCE):
-        _, base_dir = root.split('../doc/source')
+        _, base_dir = root.split(DOCUMENTATION_SOURCE)
         for file in files:
             docname, ext = os.path.splitext(file)
-            if not ext == '.rst':
+            if not ext == '.doctree':
                 continue
             page = os.path.join(base_dir, docname)
-            for pattern in exclude_patterns:
-                if fnmatch(page, pattern):
-                    continue
+            if exclude_patterns:
+                for pattern in exclude_patterns:
+                    if fnmatch(page, pattern):
+                        continue
 
             result[page] = validate_one(page)
     return result
@@ -188,23 +264,23 @@ def validate_all(exclude_patterns):
 
 def main(page, errors, output_format, exclude_patterns=None):
     if page:
-        reports = {page: validate_one(page)}
+        checkers = {page: validate_one(page)}
     else:
-        reports = validate_all(exclude_patterns=exclude_patterns)
-    return report(reports, output_format=output_format, errors=errors)
+        checkers = validate_all(exclude_patterns=exclude_patterns)
+    exit_code = 0
+    for page, checker in checkers.items():
+        exit_code += checker.report(errors=errors, output_format=output_format)
+    return exit_code
 
 
 if __name__ == '__main__':
     format_opts = 'default', 'json', 'azure'
-    func_help = ('document to validate (e.g. io) '
-                 'if not provided, all documents are validated')
     argparser = argparse.ArgumentParser(
         description='validate pandas documentation')
     add = argparser.add_argument
-    add('document',
-        nargs='?',
-        default=None,
-        help=func_help)
+    add('page', nargs='?', default=None,
+        help='page to validate (e.g. io) '
+             'if not provided, all pages are validated')
     add('--format', default='default', choices=format_opts,
         help='format of the output when validating '
              'multiple documents (ignored when validating one).'
@@ -216,12 +292,12 @@ def main(page, errors, output_format, exclude_patterns=None):
              'a single document)')
     add('--exclude', default=None,
         help='comma separated '
-             'patterns of pages to exclude. By default it '
-             'validates all errors (ignored when validating '
-             'a single document)')
+             'patterns of pages to exclude. Utilises '
+             '`Unix filename pattern matching`'
+             '(ignored when validating a single document)')
 
     args = argparser.parse_args()
-    sys.exit(main(args.document,
+    sys.exit(main(args.page,
                   args.errors.split(',') if args.errors else None,
                   args.format,
-                  args.exclude.split(',')))
+                  args.exclude.split(',') if args.exclude else None))

From 08e754608eb92151745fdd309bf16f86ae60abd9 Mon Sep 17 00:00:00 2001
From: Fabian Haase <haase.fabian@gmail.com>
Date: Tue, 11 Dec 2018 19:02:34 +0100
Subject: [PATCH 5/9] Review of @datapythonista

Signed-off-by: Fabian Haase <haase.fabian@gmail.com>
---
 scripts/validate_documentation.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/scripts/validate_documentation.py b/scripts/validate_documentation.py
index aa821a622ab75..c55c61d8e06f2 100644
--- a/scripts/validate_documentation.py
+++ b/scripts/validate_documentation.py
@@ -15,6 +15,7 @@
 """
 
 import argparse
+import docutils.nodes
 from fnmatch import fnmatch
 import json
 import os
@@ -22,8 +23,6 @@
 import re
 import sys
 
-import docutils.nodes
-
 BASE_DIR = os.path.join(os.path.dirname(__file__), '..', 'doc')
 DOCUMENTATION_SOURCE = os.path.join(BASE_DIR, 'build', 'doctrees', '')
 DOCTREE_PATH = os.path.join(DOCUMENTATION_SOURCE, '{}.doctree')
@@ -69,7 +68,6 @@ def __init__(self, page, raw_lines, doctree):
         self.doctree = doctree
         self.raw_lines = raw_lines
         self.raw_doc = ''.join(raw_lines)
-
         self.errs = None
 
     def error(self, code, line=None, **kwargs):

From 3e0c233265215c17948779a0f310678c83a3000d Mon Sep 17 00:00:00 2001
From: Fabian Haase <haase.fabian@gmail.com>
Date: Tue, 11 Dec 2018 21:10:20 +0100
Subject: [PATCH 6/9] Fix travis build

Signed-off-by: Fabian Haase <haase.fabian@gmail.com>
---
 ci/build_docs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 4ea4a0a9194aa..55c87d95cc02c 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -26,7 +26,7 @@ if [ "$DOC" ]; then
     echo ###############################
 
     MSG='Validate documentation' ; echo $MSG
-    $TRAVIS_BUILD_DIR/scripts/validate_docstrings.py io
+    $TRAVIS_BUILD_DIR/scripts/validate_documentation.py
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
     echo ########################

From 907c587575c10d28712d1c5fb0622a829d57f092 Mon Sep 17 00:00:00 2001
From: Fabian Haase <haase.fabian@gmail.com>
Date: Tue, 11 Dec 2018 23:52:11 +0100
Subject: [PATCH 7/9] Make validate_documentation executable

Signed-off-by: Fabian Haase <haase.fabian@gmail.com>
---
 scripts/validate_documentation.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 scripts/validate_documentation.py

diff --git a/scripts/validate_documentation.py b/scripts/validate_documentation.py
old mode 100644
new mode 100755
index c55c61d8e06f2..b5fc7c0dbef6e
--- a/scripts/validate_documentation.py
+++ b/scripts/validate_documentation.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 Analyze documentation to detect errors.
 
@@ -155,7 +156,7 @@ def report(self, errors=None, output_format='default'):
             output = json.dumps(self.errs)
         else:
             if output_format == 'default':
-                output_format = '{path}:{row}:: {code} {text}\n{source}\n'
+                output_format = '{path}:{row}:: {code} {text}\n'
             elif output_format == 'azure':
                 output_format = ('##vso[task.logissue type=error;'
                                  'sourcepath={path};'
@@ -180,7 +181,7 @@ def report(self, errors=None, output_format='default'):
                     output += output_format.format(
                         name=self.page,
                         path='doc/source/{}.rst'.format(self.page),
-                        row=row_start + 1,
+                        row=row_start + 1 if line else '?',
                         code=err_code,
                         source=''.join(self.raw_lines[row_start:row_end]),
                         text=ERROR_MSGS[err_code].format(kwargs))

From 1ffa59a10dc19bca0271bc331e3bdd69d52d888c Mon Sep 17 00:00:00 2001
From: Fabian Haase <haase.fabian@gmail.com>
Date: Wed, 12 Dec 2018 01:05:47 +0100
Subject: [PATCH 8/9] Skip pages missing corresponding rst-file

Signed-off-by: Fabian Haase <haase.fabian@gmail.com>
---
 scripts/validate_documentation.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/scripts/validate_documentation.py b/scripts/validate_documentation.py
index b5fc7c0dbef6e..163f41de6036a 100755
--- a/scripts/validate_documentation.py
+++ b/scripts/validate_documentation.py
@@ -214,12 +214,14 @@ def validate_one(page):
        * WS: Issues with whitespace characters
     - Last two characters: Numeric error code
     """
+    try:
+        with open(DOCTREE_PATH.format(page), 'r+b') as file:
+            doctree = pickle.load(file)
 
-    with open(DOCTREE_PATH.format(page), 'r+b') as file:
-        doctree = pickle.load(file)
-
-    with open(RST_PATH.format(page), 'r') as file:
-        raw_doc = file.readlines()
+        with open(RST_PATH.format(page), 'r') as file:
+            raw_doc = file.readlines()
+    except FileNotFoundError:
+        return None
 
     checker = DocumentChecker(page, raw_doc, doctree)
     checker.validate()
@@ -244,7 +246,7 @@ def validate_all(exclude_patterns):
         all the validation information.
     """
 
-    result = {}
+    checkers = {}
     for root, dirs, files in os.walk(DOCUMENTATION_SOURCE):
         _, base_dir = root.split(DOCUMENTATION_SOURCE)
         for file in files:
@@ -257,8 +259,10 @@ def validate_all(exclude_patterns):
                     if fnmatch(page, pattern):
                         continue
 
-            result[page] = validate_one(page)
-    return result
+            checker = validate_one(page)
+            if checker:
+                checkers[page] = checker
+    return checkers
 
 
 def main(page, errors, output_format, exclude_patterns=None):

From f21a4655bf1bb03c9ce91fe810108b0e3f05784e Mon Sep 17 00:00:00 2001
From: pandas-docs-bot <pandas-docs-bot@localhost.foo>
Date: Wed, 12 Dec 2018 02:36:10 +0100
Subject: [PATCH 9/9] Move linting execution to `run_tests.sh`

Signed-off-by: pandas-docs-bot <pandas-docs-bot@localhost.foo>
---
 ci/build_docs.sh | 12 +-----------
 ci/run_tests.sh  | 10 ++++++++--
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 55c87d95cc02c..f89c4369dff4a 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -8,8 +8,6 @@ fi
 cd "$TRAVIS_BUILD_DIR"/doc
 echo "inside $0"
 
-RET=0
-
 if [ "$DOC" ]; then
 
     echo "Will build docs"
@@ -21,14 +19,6 @@ if [ "$DOC" ]; then
     echo ./make.py
     ./make.py
 
-    echo ###############################
-    echo #     Lint documentation      #
-    echo ###############################
-
-    MSG='Validate documentation' ; echo $MSG
-    $TRAVIS_BUILD_DIR/scripts/validate_documentation.py
-    RET=$(($RET + $?)) ; echo $MSG "DONE"
-
     echo ########################
     echo # Create and send docs #
     echo ########################
@@ -57,4 +47,4 @@ if [ "$DOC" ]; then
     git push origin gh-pages -f
 fi
 
-exit $RET
+exit 0
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
index ee46da9f52eab..27176a1c71789 100755
--- a/ci/run_tests.sh
+++ b/ci/run_tests.sh
@@ -3,8 +3,14 @@
 set -e
 
 if [ "$DOC" ]; then
-    echo "We are not running pytest as this is a doc-build"
-    exit 0
+    echo ###############################
+    echo #     Lint documentation      #
+    echo ###############################
+
+    MSG='Validate documentation' ; echo $MSG
+    $TRAVIS_BUILD_DIR/scripts/validate_documentation.py
+    RET=$? ; echo $MSG "DONE"
+    exit $RET
 fi
 
 # Workaround for pytest-xdist flaky collection order