def _print_validate_all_base(
    monkeypatch,
    prefix=None,
    ignore_deprecated=False,
    ignore_errors=None,
    overwrite_api_items=True,
) -> tuple[int, int]:
    """
    Run ``print_validate_all_results`` twice against a mocked validator.

    ``validate_docstrings.pandas_validate`` is replaced by a fake that reports
    the same three errors (ER01, ER02, ER03) for every requested name. The
    first run uses the options passed to this helper, the second run is the
    reference with no prefix, nothing ignored.

    Parameters
    ----------
    monkeypatch : pytest.MonkeyPatch
        Fixture used to patch ``validate_docstrings``.
    prefix : str, optional
        Forwarded to the first run.
    ignore_deprecated : bool, default False
        Forwarded to the first run.
    ignore_errors : dict[str, list[str]], optional
        Forwarded to the first run.
    overwrite_api_items : bool, default True
        If True, ``get_all_api_items`` is patched to return two fake items
        (``pandas.DataFrame.align`` and ``pandas.Index.all``).

    Returns
    -------
    tuple[int, int]
        ``(reference_exit_status, exit_status_with_given_options)``.
    """
    fake_docinfo = {
        "docstring": "docstring1",
        "errors": [(code, "err desc") for code in ("ER01", "ER02", "ER03")],
        "warnings": [],
        "deprecated": True,
        "file": "file1",
        "file_line": "file_line1",
    }

    def fake_pandas_validate(func_names):
        # every name maps to the very same docinfo object
        return dict.fromkeys(func_names, fake_docinfo)

    def fake_get_all_api_items():
        return [
            (name, "func", "current_section", "current_subsection")
            for name in ("pandas.DataFrame.align", "pandas.Index.all")
        ]

    monkeypatch.setattr(validate_docstrings, "pandas_validate", fake_pandas_validate)
    if overwrite_api_items:
        monkeypatch.setattr(
            validate_docstrings, "get_all_api_items", fake_get_all_api_items
        )

    def run(**options) -> int:
        # only ER01 and ER02 are requested, ER03 is reported but not selected
        return validate_docstrings.print_validate_all_results(
            output_format="default", errors=["ER01", "ER02"], **options
        )

    # the run with the caller's options comes first, then the reference run
    status_with_options = run(
        prefix=prefix,
        ignore_deprecated=ignore_deprecated,
        ignore_errors=ignore_errors,
    )
    status_reference = run(prefix=None, ignore_deprecated=False, ignore_errors=None)

    return status_reference, status_with_options
len(results.keys()) == 1 + for docinfo in results.values(): + errors = docinfo["errors"] + assert errors + assert isinstance(errors, list) @pytest.mark.parametrize( "klass,func,msgs", @@ -193,92 +259,41 @@ def test_bad_class(self, capsys) -> None: ], ) def test_bad_docstrings(self, capsys, klass, func, msgs) -> None: - result = validate_docstrings.pandas_validate( + results = validate_docstrings.pandas_validate( self._import_path(klass=klass, func=func) ) - for msg in msgs: - assert msg in " ".join([err[1] for err in result["errors"]]) - - def test_validate_all_ignore_deprecated(self, monkeypatch) -> None: - monkeypatch.setattr( - validate_docstrings, - "pandas_validate", - lambda func_name: { - "docstring": "docstring1", - "errors": [ - ("ER01", "err desc"), - ("ER02", "err desc"), - ("ER03", "err desc"), - ], - "warnings": [], - "examples_errors": "", - "deprecated": True, - }, - ) - result = validate_docstrings.validate_all(prefix=None, ignore_deprecated=True) - assert len(result) == 0 + assert len(results.keys()) == 1 + for result in results.values(): + for msg in msgs: + assert msg in " ".join([err[1] for err in result["errors"]]) + + def test_print_validate_all_ignore_deprecated(self, monkeypatch) -> None: + status, status_ignore_depr = _print_validate_all_base(monkeypatch, + ignore_deprecated=True, + overwrite_api_items=False) + assert status_ignore_depr == 0 + assert status > 100 + assert status % 2 == 0 + + def test_validate_all_prefix(self, monkeypatch): + status, status_prefix = _print_validate_all_base(monkeypatch, + prefix="pandas.DataFrame") + # the two errors of pandas.Index shall not be counted + assert status_prefix == status - 2 def test_validate_all_ignore_errors(self, monkeypatch): - monkeypatch.setattr( - validate_docstrings, - "pandas_validate", - lambda func_name: { - "docstring": "docstring1", - "errors": [ - ("ER01", "err desc"), - ("ER02", "err desc"), - ("ER03", "err desc") - ], - "warnings": [], - "examples_errors": "", - "deprecated": 
True, - "file": "file1", - "file_line": "file_line1" - }, - ) - monkeypatch.setattr( - validate_docstrings, - "get_all_api_items", - lambda: [ - ( - "pandas.DataFrame.align", - "func", - "current_section", - "current_subsection", - ), - ( - "pandas.Index.all", - "func", - "current_section", - "current_subsection", - ), - ], - ) - - exit_status_ignore_func = validate_docstrings.print_validate_all_results( - output_format="default", - prefix=None, - errors=["ER01", "ER02"], - ignore_deprecated=False, - ignore_errors={ - "pandas.DataFrame.align": ["ER01"], - # ignoring an error that is not requested should be of no effect - "pandas.Index.all": ["ER03"] - } - ) - exit_status = validate_docstrings.print_validate_all_results( - output_format="default", - prefix=None, - errors=["ER01", "ER02"], - ignore_deprecated=False, - ignore_errors=None - ) + ignore_errs = { + "pandas.DataFrame.align": ["ER01"], + # ignoring an error that is not requested should be of no effect + "pandas.Index.all": ["ER03"] + } + status, status_ignore_func = _print_validate_all_base(monkeypatch, + ignore_errors=ignore_errs) # we have 2 error codes activated out of the 3 available in the validate results # one run has a function to ignore, the other does not - assert exit_status == 2*2 - assert exit_status_ignore_func == exit_status - 1 - + assert status == 2*2 + assert status_ignore_func == status - 1 class TestApiItems: @@ -375,7 +390,8 @@ class TestPandasDocstringClass: ) def test_encode_content_write_to_file(self, name) -> None: # GH25466 - docstr = validate_docstrings.PandasDocstring(name).validate_pep8() + docstr = validate_docstrings.PandasDocstring(name) + docstr = validate_docstrings.validate_pep8_for_examples(docstr)[docstr] # the list of pep8 errors should be empty assert not list(docstr) @@ -392,7 +408,6 @@ def test_exit_status_for_main(self, monkeypatch) -> None: ("ER02", "err desc"), ("ER03", "err desc"), ], - "examples_errs": "", }, ) exit_status = validate_docstrings.main( diff --git 
def validate_pep8_for_examples(
    docs: list[PandasDocstring] | PandasDocstring,
) -> dict[PandasDocstring, list[tuple]]:
    """
    Run flake8 once over the examples of the given docstrings.

    The examples of every docstring are written to their own temporary file
    and all files are checked by a single flake8 process. Each reported error
    is attributed to the docstring whose file it was found in.

    Parameters
    ----------
    docs : list[PandasDocstring] or PandasDocstring
        Docstring(s) whose examples are validated.

    Returns
    -------
    dict[PandasDocstring, list[tuple]]
        One entry per docstring passed in. The value is the list of pep8
        errors found in the examples of that docstring, each of the form
        (error_code, message, line_number, col_number). Docstrings without
        examples map to an empty list.

    Raises
    ------
    ValueError
        If flake8 writes to stderr or emits a line that can not be parsed.
    """
    # local import, only needed to match the paths reported by flake8
    import os

    if isinstance(docs, PandasDocstring):
        docs = [docs]

    all_docs_error_messages: dict[PandasDocstring, list[tuple]] = {
        doc: [] for doc in docs
    }

    with tempfile.TemporaryDirectory() as temp_dir:
        basename_to_doc = {}
        temp_file_names = []
        # iterating the dict visits every distinct docstring exactly once
        for doc in all_docs_error_messages:
            if not doc.examples:
                continue

            # F401 is needed to not generate flake8 errors in examples
            # that do not use numpy or pandas. Two spaces before the inline
            # comment keep flake8's own E261 check quiet.
            content = "".join(
                (
                    "import numpy as np  # noqa: F401\n",
                    "import pandas as pd  # noqa: F401\n",
                    *doc.examples_source_code,
                )
            )

            # Close the file before flake8 reads it and before the directory
            # is removed; delete=False keeps it on disk until then.
            with tempfile.NamedTemporaryFile(
                mode="w", dir=temp_dir, encoding="utf-8", delete=False
            ) as temp_file:
                temp_file.write(content)
            temp_file_names.append(temp_file.name)
            basename_to_doc[os.path.basename(temp_file.name)] = doc

        # No docs with examples to process
        if not temp_file_names:
            return all_docs_error_messages

        cmd = [
            sys.executable,
            "-m",
            "flake8",
            # the path comes first so that each error can be mapped to its file
            "--format=%(path)s\t%(row)d\t%(col)d\t%(code)s\t%(text)s",
            "--max-line-length=88",
            "--ignore=E203,E3,W503,W504,E402,E731,E128,E124,E704",
            *temp_file_names,
        ]
        response = subprocess.run(cmd, capture_output=True, check=False, text=True)

    # Output on stderr can not be attributed to a docstring. Fail with the
    # actual message instead of an unpacking error further down.
    if response.stderr.strip():
        raise ValueError(
            f"flake8 failed while checking docstring examples:\n{response.stderr}"
        )

    for raw_error_message in response.stdout.splitlines():
        if not raw_error_message.strip():
            continue
        try:
            path, line_num, col_num, err_code, msg = raw_error_message.split(
                "\t", maxsplit=4
            )
            doc = basename_to_doc[os.path.basename(path)]
        except (ValueError, KeyError) as err:
            raise ValueError(
                f"Unexpected flake8 output line: {raw_error_message!r}"
            ) from err

        # Note: we subtract 2 from the line number because
        # 'import numpy as np\nimport pandas as pd\n'
        # is prepended to the docstrings.
        all_docs_error_messages[doc].append(
            (err_code, msg, int(line_num) - 2, int(col_num))
        )

    return all_docs_error_messages
def pandas_validate(func_names: str | list[str]) -> dict[str, dict]:
    """
    Call the numpydoc validation, and add the errors specific to pandas.

    All objects are validated with numpydoc first, then the pandas specific
    checks are added per docstring, and finally the examples of all
    docstrings are checked for pep8 errors in one batch.

    Parameters
    ----------
    func_names : str or list[str]
        The name(s) of the objects of the docstrings to validate.

    Returns
    -------
    dict[str, dict]
        For each function name, information about the docstring and the
        errors found.
    """
    names = [func_names] if isinstance(func_names, str) else func_names

    results_by_doc = {}
    for name in names:
        obj = Validator._load_obj(name)
        # Some objects are instances, e.g. IndexSlice, which numpydoc can't validate
        doc_obj = get_doc_object(obj, doc=obj.__doc__)
        results_by_doc[PandasDocstring(name, doc_obj)] = validate(doc_obj)

    # errors that do not depend on the examples
    for doc, result in results_by_doc.items():
        private_classes = doc.mentioned_private_classes
        if private_classes:
            result["errors"].append(
                pandas_error(
                    "GL04", mentioned_private_classes=", ".join(private_classes)
                )
            )

        if doc.see_also:
            for rel_name in doc.see_also:
                if not rel_name.startswith("pandas."):
                    continue
                result["errors"].append(
                    pandas_error(
                        "SA05",
                        reference_name=rel_name,
                        right_reference=rel_name[len("pandas."):],
                    )
                )

        if doc.non_hyphenated_array_like():
            result["errors"].append(pandas_error("PD01"))

    # errors found in the examples, validated for all docstrings at once
    pep8_results = validate_pep8_for_examples(list(results_by_doc))

    for doc, pep8_errors in pep8_results.items():
        result = results_by_doc[doc]
        for err_code, err_msg, line_number, col_number in pep8_errors:
            result["errors"].append(
                pandas_error(
                    "EX03",
                    error_code=err_code,
                    error_message=err_msg,
                    line_number=line_number,
                    col_number=col_number,
                )
            )

        joined_examples = "".join(doc.examples_source_code)
        for wrong_import in ("numpy", "pandas"):
            if f"import {wrong_import}" in joined_examples:
                result["errors"].append(
                    pandas_error("EX04", imported_library=wrong_import)
                )

    plt.close("all")

    validation_results = {}
    for doc, result in results_by_doc.items():
        validation_results[doc.func_name] = result
    return validation_results