STYLE improve validate-docstrings ergonomics (pandas-dev#52482)

MarcoGorelli · topper-123 · commit 82b793aea205 · 2023-04-06T18:17:36.000+01:00
improve `validate-docstrings` ergonomics

Co-authored-by: MarcoGorelli <>
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -1005,11 +1005,11 @@ class Window(BaseWindow):
     Rolling sum with a window span of 2 seconds.
 
     >>> df_time = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]},
-    ...                        index = [pd.Timestamp('20130101 09:00:00'),
-    ...                                 pd.Timestamp('20130101 09:00:02'),
-    ...                                 pd.Timestamp('20130101 09:00:03'),
-    ...                                 pd.Timestamp('20130101 09:00:05'),
-    ...                                 pd.Timestamp('20130101 09:00:06')])
+    ...                        index=[pd.Timestamp('20130101 09:00:00'),
+    ...                               pd.Timestamp('20130101 09:00:02'),
+    ...                               pd.Timestamp('20130101 09:00:03'),
+    ...                               pd.Timestamp('20130101 09:00:05'),
+    ...                               pd.Timestamp('20130101 09:00:06')])
 
     >>> df_time
                            B
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
@@ -1770,16 +1770,16 @@ def apply(
         Using ``subset`` to restrict application to a single column or multiple columns
 
         >>> df.style.apply(highlight_max, color='red', subset="A")
-        ...  # doctest: +SKIP
+        ... # doctest: +SKIP
         >>> df.style.apply(highlight_max, color='red', subset=["A", "B"])
-        ...  # doctest: +SKIP
+        ... # doctest: +SKIP
 
         Using a 2d input to ``subset`` to select rows in addition to columns
 
-        >>> df.style.apply(highlight_max, color='red', subset=([0,1,2], slice(None)))
-        ...  # doctest: +SKIP
-        >>> df.style.apply(highlight_max, color='red', subset=(slice(0,5,2), "A"))
-        ...  # doctest: +SKIP
+        >>> df.style.apply(highlight_max, color='red', subset=([0, 1, 2], slice(None)))
+        ... # doctest: +SKIP
+        >>> df.style.apply(highlight_max, color='red', subset=(slice(0, 5, 2), "A"))
+        ... # doctest: +SKIP
 
         Using a function which returns a Series / DataFrame of unequal length but
         containing valid index labels
diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py
@@ -157,13 +157,16 @@ def test_bad_class(self, capsys):
             (
                 "BadDocstrings",
                 "unused_import",
-                ("flake8 error: F401 'pandas as pdf' imported but unused",),
+                (
+                    "flake8 error: line 1, col 1: F401 'pandas as pdf' "
+                    "imported but unused",
+                ),
             ),
             (
                 "BadDocstrings",
                 "missing_whitespace_around_arithmetic_operator",
                 (
-                    "flake8 error: "
+                    "flake8 error: line 1, col 2: "
                     "E226 missing whitespace around arithmetic operator",
                 ),
             ),
@@ -172,12 +175,15 @@ def test_bad_class(self, capsys):
                 "indentation_is_not_a_multiple_of_four",
                 # with flake8 3.9.0, the message ends with four spaces,
                 #  whereas in earlier versions, it ended with "four"
-                ("flake8 error: E111 indentation is not a multiple of 4",),
+                (
+                    "flake8 error: line 2, col 3: E111 indentation is not a "
+                    "multiple of 4",
+                ),
             ),
             (
                 "BadDocstrings",
                 "missing_whitespace_after_comma",
-                ("flake8 error: E231 missing whitespace after ',' (3 times)",),
+                ("flake8 error: line 1, col 33: E231 missing whitespace after ','",),
             ),
             (
                 "BadDocstrings",
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
@@ -58,7 +58,8 @@
     "SA05": "{reference_name} in `See Also` section does not need `pandas` "
     "prefix, use {right_reference} instead.",
     "EX02": "Examples do not pass tests:\n{doctest_log}",
-    "EX03": "flake8 error: {error_code} {error_message}{times_happening}",
+    "EX03": "flake8 error: line {line_number}, col {col_number}: {error_code} "
+    "{error_message}",
     "EX04": "Do not import {imported_library}, as it is imported "
     "automatically for the examples (numpy as np, pandas as pd)",
 }
@@ -212,20 +213,31 @@ def validate_pep8(self):
         try:
             file.write(content)
             file.flush()
-            cmd = ["python", "-m", "flake8", "--quiet", "--statistics", file.name]
+            cmd = [
+                "python",
+                "-m",
+                "flake8",
+                "--format=%(row)d\t%(col)d\t%(code)s\t%(text)s",
+                file.name,
+            ]
             response = subprocess.run(cmd, capture_output=True, check=False, text=True)
             stdout = response.stdout
             stdout = stdout.replace(file.name, "")
-            messages = stdout.strip("\n")
+            messages = stdout.strip("\n").splitlines()
             if messages:
-                error_messages.append(messages)
+                error_messages.extend(messages)
         finally:
             file.close()
             os.unlink(file.name)
 
         for error_message in error_messages:
-            error_count, error_code, message = error_message.split(maxsplit=2)
-            yield error_code, message, int(error_count)
+            line_number, col_number, error_code, message = error_message.split(
+                "\t", maxsplit=3
+            )
+            # Note: we subtract 2 from the line number because
+            # 'import numpy as np\nimport pandas as pd\n'
+            # is prepended to the docstrings.
+            yield error_code, message, int(line_number) - 2, int(col_number)
 
     def non_hyphenated_array_like(self):
         return "array_like" in self.raw_doc
@@ -276,14 +288,14 @@ def pandas_validate(func_name: str):
                 pandas_error("EX02", doctest_log=result["examples_errs"])
             )
 
-        for error_code, error_message, error_count in doc.validate_pep8():
-            times_happening = f" ({error_count} times)" if error_count > 1 else ""
+        for error_code, error_message, line_number, col_number in doc.validate_pep8():
             result["errors"].append(
                 pandas_error(
                     "EX03",
                     error_code=error_code,
                     error_message=error_message,
-                    times_happening=times_happening,
+                    line_number=line_number,
+                    col_number=col_number,
                 )
             )
         examples_source_code = "".join(doc.examples_source_code)
@@ -407,7 +419,7 @@ def header(title, width=80, char="#"):
 
     sys.stderr.write(header("Validation"))
     if result["errors"]:
-        sys.stderr.write(f'{len(result["errors"])} Errors found:\n')
+        sys.stderr.write(f'{len(result["errors"])} Errors found for `{func_name}`:\n')
         for err_code, err_desc in result["errors"]:
             if err_code == "EX02":  # Failing examples are printed at the end
                 sys.stderr.write("\tExamples do not pass tests\n")
diff --git a/setup.cfg b/setup.cfg
@@ -5,6 +5,8 @@ max-line-length = 88
 ignore =
     # space before : (needed for how black formats slicing)
     E203,
+    # expected n blank lines
+    E3,
     # line break before binary operator
     W503,
     # line break after binary operator