diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 00496f771570b..27c63e3ba3a79 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -1,5 +1,6 @@ import string import random +import io import pytest import numpy as np @@ -531,28 +532,36 @@ def _import_path(self, klass=None, func=None): @capture_stderr def test_good_class(self): - assert validate_one(self._import_path( - klass='GoodDocStrings')) == 0 + errors = validate_one(self._import_path( + klass='GoodDocStrings'))['errors'] + assert isinstance(errors, list) + assert not errors @capture_stderr @pytest.mark.parametrize("func", [ 'plot', 'sample', 'random_letters', 'sample_values', 'head', 'head1', 'contains', 'mode']) def test_good_functions(self, func): - assert validate_one(self._import_path( - klass='GoodDocStrings', func=func)) == 0 + errors = validate_one(self._import_path( + klass='GoodDocStrings', func=func))['errors'] + assert isinstance(errors, list) + assert not errors @capture_stderr def test_bad_class(self): - assert validate_one(self._import_path( - klass='BadGenericDocStrings')) > 0 + errors = validate_one(self._import_path( + klass='BadGenericDocStrings'))['errors'] + assert isinstance(errors, list) + assert errors @capture_stderr @pytest.mark.parametrize("func", [ 'func', 'astype', 'astype1', 'astype2', 'astype3', 'plot', 'method']) def test_bad_generic_functions(self, func): - assert validate_one(self._import_path( # noqa:F821 - klass='BadGenericDocStrings', func=func)) > 0 + errors = validate_one(self._import_path( # noqa:F821 + klass='BadGenericDocStrings', func=func))['errors'] + assert isinstance(errors, list) + assert errors @pytest.mark.parametrize("klass,func,msgs", [ # Summary tests @@ -594,7 +603,82 @@ def test_bad_generic_functions(self, func): marks=pytest.mark.xfail) ]) def test_bad_examples(self, capsys, klass, func, msgs): - validate_one(self._import_path(klass=klass, func=func)) # noqa:F821 - err = capsys.readouterr().err + result = validate_one(self._import_path(klass=klass, func=func)) # noqa:F821 for msg in msgs: - assert msg in err + assert msg in ' '.join(result['errors']) + + +class ApiItems(object): + @property + def api_doc(self): + return io.StringIO(''' +.. currentmodule:: itertools + +Itertools +--------- + +Infinite +~~~~~~~~ + +.. autosummary:: + + cycle + count + +Finite +~~~~~~ + +.. autosummary:: + + chain + +.. currentmodule:: random + +Random +------ + +All +~~~ + +.. autosummary:: + + seed + randint +''') + + @pytest.mark.parametrize('idx,name', [(0, 'itertools.cycle'), + (1, 'itertools.count'), + (2, 'itertools.chain'), + (3, 'random.seed'), + (4, 'random.randint')]) + def test_item_name(self, idx, name): + result = list(validate_docstrings.get_api_items(self.api_doc)) + assert result[idx][0] == name + + @pytest.mark.parametrize('idx,func', [(0, 'cycle'), + (1, 'count'), + (2, 'chain'), + (3, 'seed'), + (4, 'randint')]) + def test_item_function(self, idx, func): + result = list(validate_docstrings.get_api_items(self.api_doc)) + assert callable(result[idx][1]) + assert result[idx][1].__name__ == func + + @pytest.mark.parametrize('idx,section', [(0, 'Itertools'), + (1, 'Itertools'), + (2, 'Itertools'), + (3, 'Random'), + (4, 'Random')]) + def test_item_section(self, idx, section): + result = list(validate_docstrings.get_api_items(self.api_doc)) + assert result[idx][2] == section + + @pytest.mark.parametrize('idx,subsection', [(0, 'Infinite'), + (1, 'Infinite'), + (2, 'Finite'), + (3, 'All'), + (4, 'All')]) + def test_item_subsection(self, idx, subsection): + result = list(validate_docstrings.get_api_items(self.api_doc)) + assert result[idx][3] == subsection diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 790a62b53845b..6588522331433 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -15,7 +15,7 @@ """ import os import sys -import csv +import json import re import functools import collections @@ -45,86 +45,188 @@ DIRECTIVES = ['versionadded', 'versionchanged', 'deprecated'] -def _load_obj(obj_name): - for maxsplit in range(1, obj_name.count('.') + 1): - # TODO when py3 only replace by: module, *func_parts = ... - func_name_split = obj_name.rsplit('.', maxsplit) - module = func_name_split[0] - func_parts = func_name_split[1:] - try: - obj = importlib.import_module(module) - except ImportError: - pass - else: - continue +def get_api_items(api_doc_fd): + """ + Yield information about all public API items. - if 'module' not in locals(): - raise ImportError('No module can be imported ' - 'from "{}"'.format(obj_name)) + Parse api.rst file from the documentation, and extract all the functions, + methods, classes, attributes... This should include all pandas public API. + + Parameters + ---------- + api_doc_fd : file descriptor + A file descriptor of the API documentation page, containing the table + of contents with all the public API. + + Yields + ------ + name : str + The name of the object (e.g. 'pandas.Series.str.upper). + func : function + The object itself. In most cases this will be a function or method, + but it can also be classes, properties, cython objects... + section : str + The name of the section in the API page where the object item is + located. + subsection : str + The name of the subsection in the API page where the object item is + located. + """ + previous_line = current_section = current_subsection = '' + position = None + for line in api_doc_fd: + line = line.strip() + if len(line) == len(previous_line): + if set(line) == set('-'): + current_section = previous_line + continue + if set(line) == set('~'): + current_subsection = previous_line + continue - for part in func_parts: - obj = getattr(obj, part) - return obj + if line.startswith('.. currentmodule::'): + current_module = line.replace('.. currentmodule::', '').strip() + continue + if line == '.. autosummary::': + position = 'autosummary' + continue -def _to_original_callable(obj): - while True: - if inspect.isfunction(obj) or inspect.isclass(obj): - f = inspect.getfile(obj) - if f.startswith('<') and f.endswith('>'): - return None - return obj - if inspect.ismethod(obj): - obj = obj.__func__ - elif isinstance(obj, functools.partial): - obj = obj.func - elif isinstance(obj, property): - obj = obj.fget - else: - return None + if position == 'autosummary': + if line == '': + position = 'items' + continue + if position == 'items': + if line == '': + position = None + continue + item = line.strip() + func = importlib.import_module(current_module) + for part in item.split('.'): + func = getattr(func, part) -def _output_header(title, width=80, char='#'): - full_line = char * width - side_len = (width - len(title) - 2) // 2 - adj = '' if len(title) % 2 == 0 else ' ' - title_line = '{side} {title}{adj} {side}'.format(side=char * side_len, - title=title, - adj=adj) + yield ('.'.join([current_module, item]), func, + current_section, current_subsection) - return '\n{full_line}\n{title_line}\n{full_line}\n\n'.format( - full_line=full_line, title_line=title_line) + previous_line = line class Docstring(object): - def __init__(self, method_name, method_obj): - self.method_name = method_name - self.method_obj = method_obj - self.raw_doc = method_obj.__doc__ or '' - self.clean_doc = pydoc.getdoc(self.method_obj) + def __init__(self, name): + self.name = name + obj = self._load_obj(name) + self.obj = obj + self.code_obj = self._to_original_callable(obj) + self.raw_doc = obj.__doc__ or '' + self.clean_doc = pydoc.getdoc(obj) self.doc = NumpyDocString(self.clean_doc) def __len__(self): return len(self.raw_doc) + @staticmethod + def _load_obj(name): + """ + Import Python object from its name as string. + + Parameters + ---------- + name : str + Object name to import (e.g. pandas.Series.str.upper) + + Returns + ------- + object + Python object that can be a class, method, function... + + Examples + -------- + >>> Docstring._load_obj('pandas.Series') + + """ + for maxsplit in range(1, name.count('.') + 1): + # TODO when py3 only replace by: module, *func_parts = ... + func_name_split = name.rsplit('.', maxsplit) + module = func_name_split[0] + func_parts = func_name_split[1:] + try: + obj = importlib.import_module(module) + except ImportError: + pass + else: + continue + + if 'module' not in locals(): + raise ImportError('No module can be imported ' + 'from "{}"'.format(name)) + + for part in func_parts: + obj = getattr(obj, part) + return obj + + @staticmethod + def _to_original_callable(obj): + """ + Find the Python object that contains the source code ot the object. + + This is useful to find the place in the source code (file and line + number) where a docstring is defined. It does not currently work for + all cases, but it should help find some (properties...). + """ + while True: + if inspect.isfunction(obj) or inspect.isclass(obj): + f = inspect.getfile(obj) + if f.startswith('<') and f.endswith('>'): + return None + return obj + if inspect.ismethod(obj): + obj = obj.__func__ + elif isinstance(obj, functools.partial): + obj = obj.func + elif isinstance(obj, property): + obj = obj.fget + else: + return None + + @property + def type(self): + return type(self.obj).__name__ + @property def is_function_or_method(self): # TODO(py27): remove ismethod - return (inspect.isfunction(self.method_obj) - or inspect.ismethod(self.method_obj)) + return (inspect.isfunction(self.obj) + or inspect.ismethod(self.obj)) @property def source_file_name(self): - fname = inspect.getsourcefile(self.method_obj) - if fname: - fname = os.path.relpath(fname, BASE_PATH) - return fname + """ + File name where the object is implemented (e.g. pandas/core/frame.py). + """ + try: + fname = inspect.getsourcefile(self.code_obj) + except TypeError: + # In some cases the object is something complex like a cython + # object that can't be easily introspected. An it's better to + # return the source code file of the object as None, than crash + pass + else: + if fname: + fname = os.path.relpath(fname, BASE_PATH) + return fname @property def source_file_def_line(self): + """ + Number of line where the object is defined in its file. + """ try: - return inspect.getsourcelines(self.method_obj)[-1] - except OSError: + return inspect.getsourcelines(self.code_obj)[-1] + except (OSError, TypeError): + # In some cases the object is something complex like a cython + # object that can't be easily introspected. An it's better to + # return the line number as None, than crash pass @property @@ -187,14 +289,14 @@ def doc_parameters(self): @property def signature_parameters(self): - if inspect.isclass(self.method_obj): - if hasattr(self.method_obj, '_accessors') and ( - self.method_name.split('.')[-1] in - self.method_obj._accessors): + if inspect.isclass(self.obj): + if hasattr(self.obj, '_accessors') and ( + self.name.split('.')[-1] in + self.obj._accessors): # accessor classes have a signature but don't want to show this return tuple() try: - sig = signature(self.method_obj) + sig = signature(self.obj) except (TypeError, ValueError): # Some objects, mainly in C extensions do not support introspection # of the signature @@ -266,7 +368,10 @@ def yields(self): @property def method_source(self): - return inspect.getsource(self.method_obj) + try: + return inspect.getsource(self.obj) + except TypeError: + return '' @property def first_line_ends_in_dot(self): @@ -276,9 +381,9 @@ def first_line_ends_in_dot(self): @property def deprecated(self): pattern = re.compile('.. deprecated:: ') - return (self.method_name.startswith('pandas.Panel') or - bool(pattern.search(self.summary)) or - bool(pattern.search(self.extended_summary))) + return (self.name.startswith('pandas.Panel') + or bool(pattern.search(self.summary)) + or bool(pattern.search(self.extended_summary))) @property def mentioned_private_classes(self): @@ -291,121 +396,13 @@ def examples_errors(self): runner = doctest.DocTestRunner(optionflags=flags) context = {'np': numpy, 'pd': pandas} error_msgs = '' - for test in finder.find(self.raw_doc, self.method_name, globs=context): + for test in finder.find(self.raw_doc, self.name, globs=context): f = StringIO() runner.run(test, out=f.write) error_msgs += f.getvalue() return error_msgs -def get_api_items(): - api_fname = os.path.join(BASE_PATH, 'doc', 'source', 'api.rst') - - previous_line = current_section = current_subsection = '' - position = None - with open(api_fname) as f: - for line in f: - line = line.strip() - if len(line) == len(previous_line): - if set(line) == set('-'): - current_section = previous_line - continue - if set(line) == set('~'): - current_subsection = previous_line - continue - - if line.startswith('.. currentmodule::'): - current_module = line.replace('.. currentmodule::', '').strip() - continue - - if line == '.. autosummary::': - position = 'autosummary' - continue - - if position == 'autosummary': - if line == '': - position = 'items' - continue - - if position == 'items': - if line == '': - position = None - continue - item = line.strip() - func = importlib.import_module(current_module) - for part in item.split('.'): - func = getattr(func, part) - - yield ('.'.join([current_module, item]), func, - current_section, current_subsection) - - previous_line = line - - -def _csv_row(func_name, func_obj, section, subsection, in_api, seen={}): - obj_type = type(func_obj).__name__ - original_callable = _to_original_callable(func_obj) - if original_callable is None: - return [func_name, obj_type] + [''] * 12, '' - else: - doc = Docstring(func_name, original_callable) - key = doc.source_file_name, doc.source_file_def_line - shared_code = seen.get(key, '') - return [func_name, - obj_type, - in_api, - int(doc.deprecated), - section, - subsection, - doc.source_file_name, - doc.source_file_def_line, - doc.github_url, - int(bool(doc.summary)), - int(bool(doc.extended_summary)), - int(doc.correct_parameters), - int(bool(doc.examples)), - shared_code], key - - -def validate_all(): - writer = csv.writer(sys.stdout) - cols = ('Function or method', - 'Type', - 'In API doc', - 'Is deprecated', - 'Section', - 'Subsection', - 'File', - 'Code line', - 'GitHub link', - 'Has summary', - 'Has extended summary', - 'Parameters ok', - 'Has examples', - 'Shared code with') - writer.writerow(cols) - seen = {} - api_items = list(get_api_items()) - for func_name, func, section, subsection in api_items: - row, key = _csv_row(func_name, func, section, subsection, - in_api=1, seen=seen) - seen[key] = func_name - writer.writerow(row) - - api_item_names = set(list(zip(*api_items))[0]) - for class_ in (pandas.Series, pandas.DataFrame, pandas.Panel): - for member in inspect.getmembers(class_): - func_name = 'pandas.{}.{}'.format(class_.__name__, member[0]) - if (not member[0].startswith('_') and - func_name not in api_item_names): - func = _load_obj(func_name) - row, key = _csv_row(func_name, func, section='', subsection='', - in_api=0) - writer.writerow(row) - - return 0 - - def validate_one(func_name): """ Validate the docstring for the given func_name @@ -413,18 +410,15 @@ def validate_one(func_name): Parameters ---------- func_name : function - Function whose docstring will be evaluated + Function whose docstring will be evaluated (e.g. pandas.read_csv). Returns ------- - int - The number of errors found in the `func_name` docstring + dict + A dictionary containing all the information obtained from validating + the docstring. """ - func_obj = _load_obj(func_name) - doc = Docstring(func_name, func_obj) - - sys.stderr.write(_output_header('Docstring ({})'.format(func_name))) - sys.stderr.write('{}\n'.format(doc.clean_doc)) + doc = Docstring(func_name) errs = [] wrns = [] @@ -449,8 +443,10 @@ def validate_one(func_name): errs.append('Summary does not start with a capital letter') if doc.summary[-1] != '.': errs.append('Summary does not end with a period') - if (doc.is_function_or_method and - doc.summary.split(' ')[0][-1] == 's'): + if doc.summary != doc.summary.lstrip(): + errs.append('Summary contains heading whitespaces.') + elif (doc.is_function_or_method + and doc.summary.split(' ')[0][-1] == 's'): errs.append('Summary must start with infinitive verb, ' 'not third person (e.g. use "Generate" instead of ' '"Generates")') @@ -517,42 +513,109 @@ def validate_one(func_name): if examples_errs: errs.append('Examples do not pass tests') - sys.stderr.write(_output_header('Validation')) - if errs: - sys.stderr.write('Errors found:\n') - for err in errs: - sys.stderr.write('\t{}\n'.format(err)) - if wrns: - sys.stderr.write('Warnings found:\n') - for wrn in wrns: - sys.stderr.write('\t{}\n'.format(wrn)) + return {'type': doc.type, + 'docstring': doc.clean_doc, + 'deprecated': doc.deprecated, + 'file': doc.source_file_name, + 'file_line': doc.source_file_def_line, + 'github_link': doc.github_url, + 'errors': errs, + 'warnings': wrns, + 'examples_errors': examples_errs} - if not errs: - sys.stderr.write('Docstring for "{}" correct. :)\n'.format(func_name)) - if examples_errs: - sys.stderr.write(_output_header('Doctests')) - sys.stderr.write(examples_errs) +def validate_all(): + """ + Execute the validation of all docstrings, and return a dict with the + results. + + Returns + ------- + dict + A dictionary with an item for every function/method... containing + all the validation information. + """ + result = {} + seen = {} + + # functions from the API docs + api_doc_fname = os.path.join(BASE_PATH, 'doc', 'source', 'api.rst') + with open(api_doc_fname) as f: + api_items = list(get_api_items(f)) + for func_name, func_obj, section, subsection in api_items: + doc_info = validate_one(func_name) + result[func_name] = doc_info - return len(errs) + shared_code_key = doc_info['file'], doc_info['file_line'] + shared_code = seen.get(shared_code_key, '') + result[func_name].update({'in_api': True, + 'section': section, + 'subsection': subsection, + 'shared_code_with': shared_code}) + seen[shared_code_key] = func_name -def main(function): - if function is None: - return validate_all() + # functions from introspecting Series, DataFrame and Panel + api_item_names = set(list(zip(*api_items))[0]) + for class_ in (pandas.Series, pandas.DataFrame, pandas.Panel): + for member in inspect.getmembers(class_): + func_name = 'pandas.{}.{}'.format(class_.__name__, member[0]) + if (not member[0].startswith('_') + and func_name not in api_item_names): + doc_info = validate_one(func_name) + result[func_name] = doc_info + result[func_name]['in_api'] = False + + return result + + +def main(func_name, fd): + def header(title, width=80, char='#'): + full_line = char * width + side_len = (width - len(title) - 2) // 2 + adj = '' if len(title) % 2 == 0 else ' ' + title_line = '{side} {title}{adj} {side}'.format(side=char * side_len, + title=title, + adj=adj) + + return '\n{full_line}\n{title_line}\n{full_line}\n\n'.format( + full_line=full_line, title_line=title_line) + + if func_name is None: + json_doc = validate_all() + fd.write(json.dumps(json_doc)) else: - return validate_one(function) + doc_info = validate_one(func_name) + + fd.write(header('Docstring ({})'.format(func_name))) + fd.write('{}\n'.format(doc_info['docstring'])) + fd.write(header('Validation')) + if doc_info['errors']: + fd.write('Errors found:\n') + for err in doc_info['errors']: + fd.write('\t{}\n'.format(err)) + if doc_info['warnings']: + fd.write('Warnings found:\n') + for wrn in doc_info['warnings']: + fd.write('\t{}\n'.format(wrn)) + + if not doc_info['errors']: + fd.write('Docstring for "{}" correct. :)\n'.format(func_name)) + + if doc_info['examples_errors']: + fd.write(header('Doctests')) + fd.write(doc_info['examples_errors']) if __name__ == '__main__': + func_help = ('function or method to validate (e.g. pandas.DataFrame.head) ' + 'if not provided, all docstrings are validated and returned ' + 'as JSON') argparser = argparse.ArgumentParser( description='validate pandas docstrings') argparser.add_argument('function', nargs='?', default=None, - help=('function or method to validate ' - '(e.g. pandas.DataFrame.head) ' - 'if not provided, all docstrings ' - 'are validated')) + help=func_help) args = argparser.parse_args() - sys.exit(main(args.function)) + sys.exit(main(args.function, sys.stdout))