Skip to content

Commit d615f86

Browse files
datapythonista authored and jorisvandenbossche committed
DOC: Adding script to validate docstrings, and generate list of all functions/methods with state (pandas-dev#19898)
1 parent 5f271eb commit d615f86

File tree

1 file changed

+355
-0
lines changed

1 file changed

+355
-0
lines changed

scripts/validate_docstrings.py

+355
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,355 @@
1+
#!/usr/bin/env python
2+
"""
3+
Analyze docstrings to detect errors.
4+
5+
If no argument is provided, it does a quick check of docstrings and returns
6+
a csv with all API functions and results of basic checks.
7+
8+
If a function or method is provided in the form "pandas.function",
9+
"pandas.module.class.method", etc. a list of all errors in the docstring for
10+
the specified function or method is printed.
11+
12+
Usage::
13+
$ ./validate_docstrings.py
14+
$ ./validate_docstrings.py pandas.DataFrame.head
15+
"""
16+
import os
17+
import sys
18+
import csv
19+
import re
20+
import functools
21+
import argparse
22+
import contextlib
23+
import inspect
24+
import importlib
25+
import doctest
26+
import textwrap
27+
try:
28+
from io import StringIO
29+
except ImportError:
30+
from cStringIO import StringIO
31+
import numpy
32+
33+
BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
34+
35+
sys.path.insert(0, os.path.join(BASE_PATH))
36+
import pandas
37+
38+
sys.path.insert(1, os.path.join(BASE_PATH, 'doc', 'sphinxext'))
39+
from numpydoc.docscrape import NumpyDocString
40+
41+
42+
def _to_original_callable(obj):
    """
    Find the function or class that holds the source code of ``obj``.

    Wrappers are peeled off one layer at a time: bound methods are replaced
    by their ``__func__``, ``functools.partial`` objects by their ``func``
    and properties by their ``fget``.

    Parameters
    ----------
    obj : object
        Function, class, method, partial, property or any other object.

    Returns
    -------
    function, class or None
        The underlying function or class. ``None`` is returned when ``obj``
        is none of the supported kinds, or when no real source file can be
        determined for it.
    """
    while True:
        if inspect.isfunction(obj) or inspect.isclass(obj):
            try:
                f = inspect.getfile(obj)
            except (TypeError, OSError):
                # ``inspect.getfile`` raises for built-in classes (and for
                # classes whose module has no file). Report them as
                # "no source" instead of crashing the whole run.
                return None
            if f.startswith('<') and f.endswith('>'):
                # Pseudo file names such as '<stdin>' or '<string>'.
                return None
            return obj
        if inspect.ismethod(obj):
            obj = obj.__func__
        elif isinstance(obj, functools.partial):
            obj = obj.func
        elif isinstance(obj, property):
            obj = obj.fget
        else:
            return None
57+
58+
59+
def _output_header(title, width=80, char='#'):
    """
    Build a three-line banner with ``title`` centered between filler chars.

    Parameters
    ----------
    title : str
        Text shown in the middle line of the banner.
    width : int, default 80
        Total width of every banner line.
    char : str, default '#'
        Character used for the full lines and for both sides of the title.

    Returns
    -------
    str
        A blank line, the banner (full line, title line, full line) and a
        trailing blank line.
    """
    full_line = char * width
    side_len = (width - len(title) - 2) // 2
    # The floor division above loses one column whenever the space left for
    # the sides is odd; compensate with one extra blank after the title.
    # Using the parity of ``width - len(title)`` (not of the title alone)
    # keeps the title line exactly ``width`` wide for odd widths too.
    adj = ' ' if (width - len(title)) % 2 else ''
    title_line = '{side} {title}{adj} {side}'.format(side=char * side_len,
                                                     title=title,
                                                     adj=adj)

    return '\n{full_line}\n{title_line}\n{full_line}\n\n'.format(
        full_line=full_line, title_line=title_line)
69+
70+
71+
class Docstring:
    """
    Parsed docstring of an object together with the checks run on it.

    Parameters
    ----------
    method_name : str
        Dotted name under which the object is reported
        (e.g. ``pandas.DataFrame.head``).
    method_obj : object
        Object whose ``__doc__`` is analyzed.
    """

    def __init__(self, method_name, method_obj):
        self.method_name = method_name
        self.method_obj = method_obj
        # A missing docstring is treated as an empty one.
        self.raw_doc = method_obj.__doc__ or ''
        self.raw_doc = textwrap.dedent(self.raw_doc)
        self.doc = NumpyDocString(self.raw_doc)

    def __len__(self):
        """Return the number of characters of the dedented docstring."""
        return len(self.raw_doc)

    @property
    def source_file_name(self):
        """
        Source file of the object, relative to ``BASE_PATH``.

        ``None`` (or whatever falsy value ``inspect.getsourcefile`` gives)
        is returned when the file cannot be determined.
        """
        try:
            fname = inspect.getsourcefile(self.method_obj)
        except TypeError:
            # Built-in objects cannot be introspected; having no file name
            # is better than aborting the whole validation.
            return None
        if fname:
            fname = os.path.relpath(fname, BASE_PATH)
        return fname

    @property
    def source_file_def_line(self):
        """
        Line number where the object's source starts, or ``None``.

        ``None`` is returned when the source cannot be retrieved.
        """
        try:
            return inspect.getsourcelines(self.method_obj)[-1]
        except (OSError, TypeError):
            # OSError: source not available; TypeError: built-in object.
            return None

    @property
    def github_url(self):
        """URL built from the source file name and definition line."""
        url = 'https://github.com/pandas-dev/pandas/blob/master/'
        url += '{}#L{}'.format(self.source_file_name,
                               self.source_file_def_line)
        return url

    @property
    def first_line_blank(self):
        """
        Whether the first line of the docstring is empty or whitespace.

        ``None`` is returned when there is no docstring.
        """
        if self.raw_doc:
            return not bool(self.raw_doc.split('\n')[0].strip())

    @property
    def summary(self):
        """
        Summary as a single string.

        An empty string is returned when there is no extended summary and
        the summary section has more than one entry: in that case the text
        is reported by ``extended_summary`` instead.
        """
        if not self.doc['Extended Summary'] and len(self.doc['Summary']) > 1:
            return ''
        return ' '.join(self.doc['Summary'])

    @property
    def extended_summary(self):
        """
        Extended summary as a single string.

        When the section is empty but the summary has more than one entry,
        the joined summary is returned here.
        """
        if not self.doc['Extended Summary'] and len(self.doc['Summary']) > 1:
            return ' '.join(self.doc['Summary'])
        return ' '.join(self.doc['Extended Summary'])

    @property
    def needs_summary(self):
        """Whether the summary or the extended summary is missing."""
        return not (bool(self.summary) and bool(self.extended_summary))

    @property
    def doc_parameters(self):
        """Content of the 'Parameters' section of the parsed docstring."""
        return self.doc['Parameters']

    @property
    def signature_parameters(self):
        """
        Tuple with the parameter names of the object's signature.

        A leading ``self`` or ``cls`` is dropped. An empty tuple is returned
        for anything that is not a plain Python function.
        """
        if not inspect.isfunction(self.method_obj):
            return tuple()
        params = tuple(inspect.signature(self.method_obj).parameters.keys())
        if params and params[0] in ('self', 'cls'):
            return params[1:]
        return params

    @property
    def parameter_mismatches(self):
        """
        List of messages describing how signature and docstring disagree.

        Reported problems are undocumented parameters, documented
        parameters that are not in the signature, and (only when both sets
        match) a different order. An empty list means no mismatch.
        """
        errs = []
        signature_params = self.signature_parameters
        # The first element of each documented entry is the parameter name.
        # Always build a tuple so the order comparison below is tuple vs
        # tuple; comparing ``()`` with ``[]`` is always unequal and would
        # flag every object without parameters.
        doc_params = tuple(param[0] for param in self.doc_parameters)

        missing = set(signature_params) - set(doc_params)
        if missing:
            errs.append('Parameters {!r} not documented'.format(missing))
        extra = set(doc_params) - set(signature_params)
        if extra:
            errs.append('Unknown parameters {!r}'.format(extra))
        if not missing and not extra and signature_params != doc_params:
            errs.append('Wrong parameters order. ' +
                        'Actual: {!r}. '.format(signature_params) +
                        'Documented: {!r}'.format(doc_params))

        return errs

    @property
    def correct_parameters(self):
        """Whether ``parameter_mismatches`` found no problem."""
        return not bool(self.parameter_mismatches)

    @property
    def see_also(self):
        """Content of the 'See Also' section of the parsed docstring."""
        return self.doc['See Also']

    @property
    def examples(self):
        """Content of the 'Examples' section of the parsed docstring."""
        return self.doc['Examples']

    @property
    def first_line_ends_in_dot(self):
        """
        Whether the first line of the raw docstring ends with a period.

        ``None`` is returned when there is no docstring. A blank first line
        gives ``False``.
        """
        # Work on the raw text: the parsed ``self.doc`` object is not a
        # string. ``endswith`` also copes with an empty first line.
        if self.raw_doc:
            return self.raw_doc.split('\n')[0].endswith('.')

    @property
    def deprecated(self):
        """
        Whether the object is considered deprecated.

        True for names starting with ``pandas.Panel`` and for docstrings
        whose summary or extended summary contains a ``.. deprecated::``
        directive.
        """
        # Plain substring test: the dots of the directive are literal.
        marker = '.. deprecated:: '
        return (self.method_name.startswith('pandas.Panel') or
                marker in self.summary or
                marker in self.extended_summary)

    @property
    def examples_errors(self):
        """
        Run the doctests found in the docstring and return their output.

        Returns
        -------
        str
            Everything the doctest runner printed; an empty string when
            nothing was printed.
        """
        flags = doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL
        finder = doctest.DocTestFinder()
        runner = doctest.DocTestRunner(optionflags=flags)
        # Names the examples may use without importing them.
        context = {'np': numpy, 'pd': pandas}
        error_msgs = []
        for test in finder.find(self.raw_doc, self.method_name, globs=context):
            f = StringIO()
            # The runner reports to stdout; capture it per test.
            with contextlib.redirect_stdout(f):
                runner.run(test)
            error_msgs.append(f.getvalue())
        return ''.join(error_msgs)
196+
197+
198+
def get_api_items():
    """
    Yield the objects listed in the autosummary blocks of ``api.rst``.

    The file ``doc/source/api.rst`` under ``BASE_PATH`` is read line by line.
    A ``.. currentmodule::`` line sets the module used for the following
    items. After a ``.. autosummary::`` line, everything up to the first
    empty line is skipped, and every line from there until the next empty
    line is taken as an item name.

    Yields
    ------
    tuple of (str, object)
        The item name prefixed with its module, and the object obtained by
        importing the module and following the dotted name with ``getattr``.
    """
    rst_path = os.path.join(BASE_PATH, 'doc', 'source', 'api.rst')

    state = None
    with open(rst_path) as rst_file:
        for raw_line in rst_file:
            if raw_line.startswith('.. currentmodule::'):
                current_module = raw_line.replace('.. currentmodule::',
                                                  '').strip()
            elif raw_line == '.. autosummary::\n':
                state = 'autosummary'
            elif state == 'autosummary':
                # Lines right after the directive are ignored; the item
                # list starts after the first empty line.
                if raw_line == '\n':
                    state = 'items'
            elif state == 'items':
                if raw_line == '\n':
                    # An empty line closes the item list.
                    state = None
                else:
                    name = raw_line.strip()
                    target = functools.reduce(
                        getattr,
                        name.split('.'),
                        importlib.import_module(current_module))
                    yield '.'.join([current_module, name]), target
227+
228+
229+
def validate_all():
    """
    Write a CSV report to stdout with one row per item of the API listing.

    Items for which no original callable is found get a row with only the
    name and type filled in. For the rest, the columns hold the source
    location, the GitHub link and 0/1 flags for the basic checks. The last
    column names the previously processed item with the same source file
    and line, if any.

    Returns
    -------
    int
        Always 0.
    """
    columns = ['Function or method',
               'Type',
               'File',
               'Code line',
               'GitHub link',
               'Is deprecated',
               'Has summary',
               'Has extended summary',
               'Parameters ok',
               'Has examples',
               'Shared code with']
    report = csv.writer(sys.stdout)
    report.writerow(columns)

    # Maps (file, line) to the name of the latest item defined there.
    last_at_location = {}
    for name, obj in get_api_items():
        type_name = type(obj).__name__
        target = _to_original_callable(obj)
        if target is None:
            report.writerow([name, type_name] + [''] * 9)
            continue

        doc = Docstring(name, target)
        location = doc.source_file_name, doc.source_file_def_line
        shared_with = last_at_location.get(location, '')
        last_at_location[location] = name
        source_file, def_line = location
        row = [name, type_name, source_file, def_line, doc.github_url]
        row.extend(int(flag) for flag in (doc.deprecated,
                                          bool(doc.summary),
                                          bool(doc.extended_summary),
                                          doc.correct_parameters,
                                          bool(doc.examples)))
        row.append(shared_with)
        report.writerow(row)

    return 0
266+
267+
268+
def validate_one(func_name):
    """
    Validate the docstring of a single object and report to stderr.

    Parameters
    ----------
    func_name : str
        Dotted name of the object to validate, starting with an importable
        module (e.g. ``pandas.DataFrame.head``).

    Returns
    -------
    int
        Number of errors found (0 means the docstring passed every check).

    Raises
    ------
    ImportError
        If no prefix of ``func_name`` can be imported as a module.
    AttributeError
        If the part after the module cannot be resolved with ``getattr``.
    """
    # Try the longest module path first and stop at the first prefix that
    # can be imported; whatever is left is resolved with ``getattr``.
    for maxsplit in range(1, func_name.count('.') + 1):
        # TODO when py3 only replace by: module, *func_parts = ...
        func_name_split = func_name.rsplit('.', maxsplit)
        module = func_name_split[0]
        func_parts = func_name_split[1:]
        try:
            func_obj = importlib.import_module(module)
        except ImportError:
            pass
        else:
            break
    else:
        # Reached when the loop never hit ``break``: nothing was importable
        # (or ``func_name`` contains no dot at all).
        raise ImportError('No module can be imported '
                          'from "{}"'.format(func_name))

    for part in func_parts:
        func_obj = getattr(func_obj, part)

    doc = Docstring(func_name, func_obj)

    sys.stderr.write(_output_header('Docstring ({})'.format(func_name)))
    sys.stderr.write('{}\n'.format(doc.raw_doc))

    errs = []
    if not doc.summary:
        errs.append('No summary found')
    else:
        if not doc.summary[0].isupper():
            errs.append('Summary does not start with capital')
        if doc.summary[-1] != '.':
            errs.append('Summary does not end with dot')
        # Heuristic: a first word ending in "s" is taken as a third person
        # verb. ``endswith`` is safe even if the first token is empty.
        if doc.summary.split(' ')[0].endswith('s'):
            errs.append('Summary must start with infinitive verb, '
                        'not third person (e.g. use "Generate" instead of '
                        '"Generates")')
    if not doc.extended_summary:
        errs.append('No extended summary found')

    param_errs = doc.parameter_mismatches
    if param_errs:
        errs.append('Errors in parameters section')
        for param_err in param_errs:
            errs.append('\t{}'.format(param_err))

    examples_errs = ''
    if not doc.examples:
        errs.append('No examples section found')
    else:
        # Doctests are only run when there is an examples section.
        examples_errs = doc.examples_errors
        if examples_errs:
            errs.append('Examples do not pass tests')

    sys.stderr.write(_output_header('Validation'))
    if errs:
        sys.stderr.write('Errors found:\n')
        for err in errs:
            sys.stderr.write('\t{}\n'.format(err))
    else:
        sys.stderr.write('Docstring for "{}" correct. :)\n'.format(func_name))

    if examples_errs:
        sys.stderr.write(_output_header('Doctests'))
        sys.stderr.write(examples_errs)

    return len(errs)
335+
336+
337+
def main(function):
    """
    Dispatch to the single-object or the whole-API validation.

    Parameters
    ----------
    function : str or None
        Dotted name of the object to validate; ``None`` validates all.

    Returns
    -------
    int
        Result of ``validate_one`` or ``validate_all``, used as exit status.
    """
    if function is not None:
        return validate_one(function)
    return validate_all()
342+
343+
344+
if __name__ == '__main__':
    # Command line entry point: one optional positional argument naming the
    # object to validate; the result of ``main`` becomes the exit status.
    help_text = ('function or method to validate '
                 '(e.g. pandas.DataFrame.head) '
                 'if not provided, all docstrings '
                 'are validated')
    parser = argparse.ArgumentParser(description='validate pandas docstrings')
    parser.add_argument('function', nargs='?', default=None, help=help_text)
    cli_args = parser.parse_args()
    sys.exit(main(cli_args.function))

0 commit comments

Comments
 (0)