From 04b9be81b5619d3b73dd48dbcc8ccbb3b0a23e39 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Mon, 29 Nov 2021 18:21:04 -0800
Subject: [PATCH 1/3] DOC: Ensure no files are left over after doctests

---
 pandas/core/generic.py         | 46 ++++++++++++++--------------------
 pandas/errors/__init__.py      | 19 ++++++--------
 pandas/io/excel/_base.py       | 34 ++++++++++++-------------
 pandas/io/pickle.py            | 30 +++++++++-------------
 pandas/io/stata.py             | 20 ++++++---------
 scripts/validate_docstrings.py |  8 ++++++
 6 files changed, 72 insertions(+), 85 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 4aff7acc4c6fb..263c3520afcbb 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2709,32 +2709,27 @@ def to_hdf(
         Examples
         --------
         >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
-        ...                   index=['a', 'b', 'c'])
-        >>> df.to_hdf('data.h5', key='df', mode='w')
+        ...                   index=['a', 'b', 'c'])  # doctest: +SKIP
+        >>> df.to_hdf('data.h5', key='df', mode='w')  # doctest: +SKIP
 
         We can add another object to the same file:
 
-        >>> s = pd.Series([1, 2, 3, 4])
-        >>> s.to_hdf('data.h5', key='s')
+        >>> s = pd.Series([1, 2, 3, 4])  # doctest: +SKIP
+        >>> s.to_hdf('data.h5', key='s')  # doctest: +SKIP
 
         Reading from HDF file:
 
-        >>> pd.read_hdf('data.h5', 'df')
+        >>> pd.read_hdf('data.h5', 'df')  # doctest: +SKIP
         A  B
         a  1  4
         b  2  5
         c  3  6
-        >>> pd.read_hdf('data.h5', 's')
+        >>> pd.read_hdf('data.h5', 's')  # doctest: +SKIP
         0    1
         1    2
         2    3
         3    4
         dtype: int64
-
-        Deleting file with data:
-
-        >>> import os
-        >>> os.remove('data.h5')
         """
         from pandas.io import pytables
 
@@ -2970,28 +2965,25 @@ def to_pickle(
 
         Examples
         --------
-        >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}})
-        >>> original_df
+        >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}})  # doctest: +SKIP
+        >>> original_df  # doctest: +SKIP
            foo  bar
         0    0    5
         1    1    6
         2    2    7
         3    3    8
         4    4    9
-        >>> original_df.to_pickle("./dummy.pkl")
+        >>> original_df.to_pickle("./dummy.pkl")  # doctest: +SKIP
 
-        >>> unpickled_df = pd.read_pickle("./dummy.pkl")
-        >>> unpickled_df
+        >>> unpickled_df = pd.read_pickle("./dummy.pkl")  # doctest: +SKIP
+        >>> unpickled_df  # doctest: +SKIP
            foo  bar
         0    0    5
         1    1    6
         2    2    7
         3    3    8
         4    4    9
-
-        >>> import os
-        >>> os.remove("./dummy.pkl")
-        """
+        """  # noqa: E501
         from pandas.io.pickle import to_pickle
 
         to_pickle(
@@ -3509,14 +3501,14 @@ def to_csv(
         To write a csv file to a new folder or nested folder you will first
         need to create it using either Pathlib or os:
 
-        >>> from pathlib import Path
-        >>> filepath = Path('folder/subfolder/out.csv')
-        >>> filepath.parent.mkdir(parents=True, exist_ok=True)
-        >>> df.to_csv(filepath)
+        >>> from pathlib import Path  # doctest: +SKIP
+        >>> filepath = Path('folder/subfolder/out.csv')  # doctest: +SKIP
+        >>> filepath.parent.mkdir(parents=True, exist_ok=True)  # doctest: +SKIP
+        >>> df.to_csv(filepath)  # doctest: +SKIP
 
-        >>> import os
-        >>> os.makedirs('folder/subfolder', exist_ok=True)
-        >>> df.to_csv('folder/subfolder/out.csv')
+        >>> import os  # doctest: +SKIP
+        >>> os.makedirs('folder/subfolder', exist_ok=True)  # doctest: +SKIP
+        >>> df.to_csv('folder/subfolder/out.csv')  # doctest: +SKIP
         """
         df = self if isinstance(self, ABCDataFrame) else self.to_frame()
 
diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
index 56eda37c8122e..8fae2d1d1179d 100644
--- a/pandas/errors/__init__.py
+++ b/pandas/errors/__init__.py
@@ -95,32 +95,29 @@ class DtypeWarning(Warning):
 
     >>> df = pd.DataFrame({'a': (['1'] * 100000 + ['X'] * 100000 +
     ...                          ['1'] * 100000),
-    ...                    'b': ['b'] * 300000})
-    >>> df.to_csv('test.csv', index=False)
-    >>> df2 = pd.read_csv('test.csv')
+    ...                    'b': ['b'] * 300000})  # doctest: +SKIP
+    >>> df.to_csv('test.csv', index=False)  # doctest: +SKIP
+    >>> df2 = pd.read_csv('test.csv')  # doctest: +SKIP
     ... # DtypeWarning: Columns (0) have mixed types
 
     Important to notice that ``df2`` will contain both `str` and `int` for the
     same input, '1'.
 
-    >>> df2.iloc[262140, 0]
+    >>> df2.iloc[262140, 0]  # doctest: +SKIP
     '1'
-    >>> type(df2.iloc[262140, 0])
+    >>> type(df2.iloc[262140, 0])  # doctest: +SKIP
     <class 'str'>
-    >>> df2.iloc[262150, 0]
+    >>> df2.iloc[262150, 0]  # doctest: +SKIP
     1
-    >>> type(df2.iloc[262150, 0])
+    >>> type(df2.iloc[262150, 0])  # doctest: +SKIP
     <class 'int'>
 
     One way to solve this issue is using the `dtype` parameter in the
     `read_csv` and `read_table` functions to explicit the conversion:
 
-    >>> df2 = pd.read_csv('test.csv', sep=',', dtype={'a': str})
+    >>> df2 = pd.read_csv('test.csv', sep=',', dtype={'a': str})  # doctest: +SKIP
 
     No warning was issued.
-
-    >>> import os
-    >>> os.remove('test.csv')
     """
 
 
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 9eb98195d9a88..673dbba628976 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -756,21 +756,21 @@ class ExcelWriter(metaclass=abc.ABCMeta):
     --------
     Default usage:
 
-    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])
+    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])  # doctest: +SKIP
     >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
-    ...     df.to_excel(writer)
+    ...     df.to_excel(writer)  # doctest: +SKIP
 
     To write to separate sheets in a single file:
 
-    >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"])
-    >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])
+    >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"])  # doctest: +SKIP
+    >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])  # doctest: +SKIP
     >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
-    ...     df1.to_excel(writer, sheet_name="Sheet1")
-    ...     df2.to_excel(writer, sheet_name="Sheet2")
+    ...     df1.to_excel(writer, sheet_name="Sheet1")  # doctest: +SKIP
+    ...     df2.to_excel(writer, sheet_name="Sheet2")  # doctest: +SKIP
 
     You can set the date format or datetime format:
 
-    >>> from datetime import date, datetime
+    >>> from datetime import date, datetime  # doctest: +SKIP
     >>> df = pd.DataFrame(
     ...     [
     ...         [date(2014, 1, 31), date(1999, 9, 24)],
@@ -778,18 +778,18 @@ class ExcelWriter(metaclass=abc.ABCMeta):
     ...     ],
     ...     index=["Date", "Datetime"],
     ...     columns=["X", "Y"],
-    ... )
+    ... )  # doctest: +SKIP
     >>> with pd.ExcelWriter(
     ...     "path_to_file.xlsx",
     ...     date_format="YYYY-MM-DD",
     ...     datetime_format="YYYY-MM-DD HH:MM:SS"
     ... ) as writer:
-    ...     df.to_excel(writer)
+    ...     df.to_excel(writer)  # doctest: +SKIP
 
     You can also append to an existing Excel file:
 
     >>> with pd.ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer:
-    ...     df.to_excel(writer, sheet_name="Sheet3")
+    ...     df.to_excel(writer, sheet_name="Sheet3")  # doctest: +SKIP
 
     Here, the `if_sheet_exists` parameter can be set to replace a sheet if it
     already exists:
@@ -800,7 +800,7 @@ class ExcelWriter(metaclass=abc.ABCMeta):
     ...     engine="openpyxl",
     ...     if_sheet_exists="replace",
     ... ) as writer:
-    ...     df.to_excel(writer, sheet_name="Sheet1")
+    ...     df.to_excel(writer, sheet_name="Sheet1")  # doctest: +SKIP
 
     You can also write multiple DataFrames to a single sheet. Note that the
     ``if_sheet_exists`` parameter needs to be set to ``overlay``:
@@ -811,7 +811,7 @@ class ExcelWriter(metaclass=abc.ABCMeta):
     ...     if_sheet_exists="overlay",
     ... ) as writer:
     ...     df1.to_excel(writer, sheet_name="Sheet1")
-    ...     df2.to_excel(writer, sheet_name="Sheet1", startcol=3)
+    ...     df2.to_excel(writer, sheet_name="Sheet1", startcol=3)  # doctest: +SKIP
 
     You can store Excel file in RAM:
 
@@ -823,12 +823,12 @@ class ExcelWriter(metaclass=abc.ABCMeta):
 
     You can pack Excel file into zip archive:
 
-    >>> import zipfile
-    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])
+    >>> import zipfile  # doctest: +SKIP
+    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])  # doctest: +SKIP
     >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf:
     ...     with zf.open("filename.xlsx", "w") as buffer:
     ...         with pd.ExcelWriter(buffer) as writer:
-    ...             df.to_excel(writer)
+    ...             df.to_excel(writer)  # doctest: +SKIP
 
     You can specify additional arguments to the underlying engine:
 
@@ -837,7 +837,7 @@ class ExcelWriter(metaclass=abc.ABCMeta):
     ...     engine="xlsxwriter",
     ...     engine_kwargs={"options": {"nan_inf_to_errors": True}}
     ... ) as writer:
-    ...     df.to_excel(writer)
+    ...     df.to_excel(writer)  # doctest: +SKIP
 
     In append mode, ``engine_kwargs`` are passed through to
     openpyxl's ``load_workbook``:
@@ -848,7 +848,7 @@ class ExcelWriter(metaclass=abc.ABCMeta):
     ...     mode="a",
     ...     engine_kwargs={"keep_vba": True}
     ... ) as writer:
-    ...     df.to_excel(writer, sheet_name="Sheet2")
+    ...     df.to_excel(writer, sheet_name="Sheet2")  # doctest: +SKIP
     """
 
     # Defining an ExcelWriter implementation (see abstract methods for more...)
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index 8bd0942550e6e..5e0a3e1646883 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -71,28 +71,25 @@ def to_pickle(
 
     Examples
     --------
-    >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}})
-    >>> original_df
+    >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}})  # doctest: +SKIP
+    >>> original_df  # doctest: +SKIP
        foo  bar
     0    0    5
     1    1    6
     2    2    7
     3    3    8
     4    4    9
-    >>> pd.to_pickle(original_df, "./dummy.pkl")
+    >>> pd.to_pickle(original_df, "./dummy.pkl")  # doctest: +SKIP
 
-    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
-    >>> unpickled_df
+    >>> unpickled_df = pd.read_pickle("./dummy.pkl")  # doctest: +SKIP
+    >>> unpickled_df  # doctest: +SKIP
        foo  bar
     0    0    5
     1    1    6
     2    2    7
     3    3    8
     4    4    9
-
-    >>> import os
-    >>> os.remove("./dummy.pkl")
-    """
+    """  # noqa: E501
     if protocol < 0:
         protocol = pickle.HIGHEST_PROTOCOL
 
@@ -165,28 +162,25 @@ def read_pickle(
 
     Examples
     --------
-    >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}})
-    >>> original_df
+    >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}})  # doctest: +SKIP
+    >>> original_df  # doctest: +SKIP
        foo  bar
     0    0    5
     1    1    6
     2    2    7
     3    3    8
     4    4    9
-    >>> pd.to_pickle(original_df, "./dummy.pkl")
+    >>> pd.to_pickle(original_df, "./dummy.pkl")  # doctest: +SKIP
 
-    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
-    >>> unpickled_df
+    >>> unpickled_df = pd.read_pickle("./dummy.pkl")  # doctest: +SKIP
+    >>> unpickled_df  # doctest: +SKIP
        foo  bar
     0    0    5
     1    1    6
     2    2    7
     3    3    8
     4    4    9
-
-    >>> import os
-    >>> os.remove("./dummy.pkl")
-    """
+    """  # noqa: E501
     excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError)
     with get_handle(
         filepath_or_buffer,
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index ff9d8a1be3d1e..672d6ec539124 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -172,26 +172,22 @@
 Creating a dummy stata for this example
 >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
 ...                              'parrot'],
-...                   'speed': [350, 18, 361, 15]}})
->>> df.to_stata('animals.dta')
+...                   'speed': [350, 18, 361, 15]}})  # doctest: +SKIP
+>>> df.to_stata('animals.dta')  # doctest: +SKIP
 
 Read a Stata dta file:
 
->>> df = pd.read_stata('animals.dta')
+>>> df = pd.read_stata('animals.dta')  # doctest: +SKIP
 
 Read a Stata dta file in 10,000 line chunks:
->>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8")
->>> df = pd.DataFrame(values, columns=["i"])
->>> df.to_stata('filename.dta')
+>>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8")  # doctest: +SKIP
+>>> df = pd.DataFrame(values, columns=["i"])  # doctest: +SKIP
+>>> df.to_stata('filename.dta')  # doctest: +SKIP
 
->>> itr = pd.read_stata('filename.dta', chunksize=10000)
+>>> itr = pd.read_stata('filename.dta', chunksize=10000)  # doctest: +SKIP
 >>> for chunk in itr:
 ...    # Operate on a single chunk, e.g., chunk.mean()
-...    pass
-
->>> import os
->>> os.remove("./filename.dta")
->>> os.remove("./animals.dta")
+...    pass  # doctest: +SKIP
 """
 
 _read_method_doc = f"""\
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
index 7562895d9db3e..67772dd4eecbd 100755
--- a/scripts/validate_docstrings.py
+++ b/scripts/validate_docstrings.py
@@ -20,6 +20,7 @@
 import importlib
 import io
 import json
+import os
 import pathlib
 import subprocess
 import sys
@@ -145,10 +146,17 @@ def examples_errors(self):
         runner = doctest.DocTestRunner(optionflags=flags)
         context = {"np": numpy, "pd": pandas}
         error_msgs = ""
+        current_dir = set(os.listdir())
         for test in finder.find(self.raw_doc, self.name, globs=context):
             f = io.StringIO()
             runner.run(test, out=f.write)
             error_msgs += f.getvalue()
+            leftover_files = set(os.listdir()).difference(current_dir)
+            if leftover_files:
+                error_msgs += (
+                    f"The following files were leftover from the doctest: "
+                    f"{leftover_files}"
+                )
         return error_msgs
 
     @property

From f62ea301afdc073269922a55fe152560a405ca8b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Wed, 1 Dec 2021 13:54:36 -0800
Subject: [PATCH 2/3] DOC: validate_docstrings cleans up leftover files;
 doctest +SKIP file examples

---
 scripts/tests/test_validate_docstrings.py | 15 +++++++++++++++
 scripts/validate_docstrings.py            | 18 ++++++++++++------
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py
index 6ebf9cedeb8e3..dcfef648e8f1c 100644
--- a/scripts/tests/test_validate_docstrings.py
+++ b/scripts/tests/test_validate_docstrings.py
@@ -88,6 +88,15 @@ def write_array_like_with_hyphen_not_underscore(self):
         """
         pass
 
+    def leftover_files(self):
+        """
+        Examples
+        --------
+        >>> import pathlib
+        >>> pathlib.Path("foo.txt").touch()
+        """
+        pass
+
 
 class TestValidator:
     def _import_path(self, klass=None, func=None):
@@ -192,6 +201,12 @@ def test_bad_docstrings(self, capsys, klass, func, msgs):
         for msg in msgs:
             assert msg in " ".join([err[1] for err in result["errors"]])
 
+    def test_leftover_files_raises(self):
+        with pytest.raises(Exception, match="The following files"):
+            validate_docstrings.pandas_validate(
+                self._import_path(klass="BadDocstrings", func="leftover_files")
+            )
+
     def test_validate_all_ignore_deprecated(self, monkeypatch):
         monkeypatch.setattr(
             validate_docstrings,
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
index 67772dd4eecbd..dcb002fd975c4 100755
--- a/scripts/validate_docstrings.py
+++ b/scripts/validate_docstrings.py
@@ -151,12 +151,18 @@ def examples_errors(self):
             f = io.StringIO()
             runner.run(test, out=f.write)
             error_msgs += f.getvalue()
-            leftover_files = set(os.listdir()).difference(current_dir)
-            if leftover_files:
-                error_msgs += (
-                    f"The following files were leftover from the doctest: "
-                    f"{leftover_files}"
-                )
+        leftovers = set(os.listdir()).difference(current_dir)
+        if leftovers:
+            for leftover in leftovers:
+                path = pathlib.Path(leftover).resolve()
+                if path.is_dir():
+                    path.rmdir()
+                elif path.is_file():
+                    path.unlink(missing_ok=True)
+            raise Exception(
+                f"The following files were leftover from the doctest: "
+                f"{leftovers}. Please use # doctest: +SKIP"
+            )
         return error_msgs
 
     @property

From 1008fdf61f7c1651489d71035d623eabd99e5fa3 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Wed, 1 Dec 2021 20:52:00 -0800
Subject: [PATCH 3/3] DOC: Skip another file-writing doctest example (read_hdf)

---
 pandas/io/pytables.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index eedf00bcd9c76..18b2ff3837a15 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -390,9 +390,9 @@ def read_hdf(
 
     Examples
     --------
-    >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
-    >>> df.to_hdf('./store.h5', 'data')
-    >>> reread = pd.read_hdf('./store.h5')
+    >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])  # doctest: +SKIP
+    >>> df.to_hdf('./store.h5', 'data')  # doctest: +SKIP
+    >>> reread = pd.read_hdf('./store.h5')  # doctest: +SKIP
     """
     if mode not in ["r", "r+", "a"]:
         raise ValueError(