From 04b9be81b5619d3b73dd48dbcc8ccbb3b0a23e39 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 29 Nov 2021 18:21:04 -0800 Subject: [PATCH 1/3] DOC: Ensure no files are leftover after doctests --- pandas/core/generic.py | 46 ++++++++++++++-------------------- pandas/errors/__init__.py | 19 ++++++-------- pandas/io/excel/_base.py | 34 ++++++++++++------------- pandas/io/pickle.py | 30 +++++++++------------- pandas/io/stata.py | 20 ++++++--------- scripts/validate_docstrings.py | 8 ++++++ 6 files changed, 72 insertions(+), 85 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4aff7acc4c6fb..263c3520afcbb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2709,32 +2709,27 @@ def to_hdf( Examples -------- >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, - ... index=['a', 'b', 'c']) - >>> df.to_hdf('data.h5', key='df', mode='w') + ... index=['a', 'b', 'c']) # doctest: +SKIP + >>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP We can add another object to the same file: - >>> s = pd.Series([1, 2, 3, 4]) - >>> s.to_hdf('data.h5', key='s') + >>> s = pd.Series([1, 2, 3, 4]) # doctest: +SKIP + >>> s.to_hdf('data.h5', key='s') # doctest: +SKIP Reading from HDF file: - >>> pd.read_hdf('data.h5', 'df') + >>> pd.read_hdf('data.h5', 'df') # doctest: +SKIP A B a 1 4 b 2 5 c 3 6 - >>> pd.read_hdf('data.h5', 's') + >>> pd.read_hdf('data.h5', 's') # doctest: +SKIP 0 1 1 2 2 3 3 4 dtype: int64 - - Deleting file with data: - - >>> import os - >>> os.remove('data.h5') """ from pandas.io import pytables @@ -2970,28 +2965,25 @@ def to_pickle( Examples -------- - >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) - >>> original_df + >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP + >>> original_df # doctest: +SKIP foo bar 0 0 5 1 1 6 2 2 7 3 3 8 4 4 9 - >>> original_df.to_pickle("./dummy.pkl") + >>> original_df.to_pickle("./dummy.pkl") # doctest: +SKIP - 
>>> unpickled_df = pd.read_pickle("./dummy.pkl") - >>> unpickled_df + >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP + >>> unpickled_df # doctest: +SKIP foo bar 0 0 5 1 1 6 2 2 7 3 3 8 4 4 9 - - >>> import os - >>> os.remove("./dummy.pkl") - """ + """ # noqa: E501 from pandas.io.pickle import to_pickle to_pickle( @@ -3509,14 +3501,14 @@ def to_csv( To write a csv file to a new folder or nested folder you will first need to create it using either Pathlib or os: - >>> from pathlib import Path - >>> filepath = Path('folder/subfolder/out.csv') - >>> filepath.parent.mkdir(parents=True, exist_ok=True) - >>> df.to_csv(filepath) + >>> from pathlib import Path # doctest: +SKIP + >>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP + >>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP + >>> df.to_csv(filepath) # doctest: +SKIP - >>> import os - >>> os.makedirs('folder/subfolder', exist_ok=True) - >>> df.to_csv('folder/subfolder/out.csv') + >>> import os # doctest: +SKIP + >>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP + >>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 56eda37c8122e..8fae2d1d1179d 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -95,32 +95,29 @@ class DtypeWarning(Warning): >>> df = pd.DataFrame({'a': (['1'] * 100000 + ['X'] * 100000 + ... ['1'] * 100000), - ... 'b': ['b'] * 300000}) - >>> df.to_csv('test.csv', index=False) - >>> df2 = pd.read_csv('test.csv') + ... 'b': ['b'] * 300000}) # doctest: +SKIP + >>> df.to_csv('test.csv', index=False) # doctest: +SKIP + >>> df2 = pd.read_csv('test.csv') # doctest: +SKIP ... # DtypeWarning: Columns (0) have mixed types Important to notice that ``df2`` will contain both `str` and `int` for the same input, '1'. 
- >>> df2.iloc[262140, 0] + >>> df2.iloc[262140, 0] # doctest: +SKIP '1' - >>> type(df2.iloc[262140, 0]) + >>> type(df2.iloc[262140, 0]) # doctest: +SKIP <class 'str'> - >>> df2.iloc[262150, 0] + >>> df2.iloc[262150, 0] # doctest: +SKIP 1 - >>> type(df2.iloc[262150, 0]) + >>> type(df2.iloc[262150, 0]) # doctest: +SKIP <class 'int'> One way to solve this issue is using the `dtype` parameter in the `read_csv` and `read_table` functions to explicit the conversion: - >>> df2 = pd.read_csv('test.csv', sep=',', dtype={'a': str}) + >>> df2 = pd.read_csv('test.csv', sep=',', dtype={'a': str}) # doctest: +SKIP No warning was issued. - - >>> import os - >>> os.remove('test.csv') """ diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 9eb98195d9a88..673dbba628976 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -756,21 +756,21 @@ class ExcelWriter(metaclass=abc.ABCMeta): -------- Default usage: - >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) + >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP >>> with pd.ExcelWriter("path_to_file.xlsx") as writer: - ... df.to_excel(writer) + ... df.to_excel(writer) # doctest: +SKIP To write to separate sheets in a single file: - >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"]) - >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) + >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"]) # doctest: +SKIP + >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP >>> with pd.ExcelWriter("path_to_file.xlsx") as writer: - ... df1.to_excel(writer, sheet_name="Sheet1") - ... df2.to_excel(writer, sheet_name="Sheet2") + ... df1.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP + ... df2.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP You can set the date format or datetime format: - >>> from datetime import date, datetime + >>> from datetime import date, datetime # doctest: +SKIP >>> df = pd.DataFrame( ... [ ...
[date(2014, 1, 31), date(1999, 9, 24)], @@ -778,18 +778,18 @@ class ExcelWriter(metaclass=abc.ABCMeta): ... ], ... index=["Date", "Datetime"], ... columns=["X", "Y"], - ... ) + ... ) # doctest: +SKIP >>> with pd.ExcelWriter( ... "path_to_file.xlsx", ... date_format="YYYY-MM-DD", ... datetime_format="YYYY-MM-DD HH:MM:SS" ... ) as writer: - ... df.to_excel(writer) + ... df.to_excel(writer) # doctest: +SKIP You can also append to an existing Excel file: >>> with pd.ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer: - ... df.to_excel(writer, sheet_name="Sheet3") + ... df.to_excel(writer, sheet_name="Sheet3") # doctest: +SKIP Here, the `if_sheet_exists` parameter can be set to replace a sheet if it already exists: @@ -800,7 +800,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): ... engine="openpyxl", ... if_sheet_exists="replace", ... ) as writer: - ... df.to_excel(writer, sheet_name="Sheet1") + ... df.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP You can also write multiple DataFrames to a single sheet. Note that the ``if_sheet_exists`` parameter needs to be set to ``overlay``: @@ -811,7 +811,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): ... if_sheet_exists="overlay", ... ) as writer: ... df1.to_excel(writer, sheet_name="Sheet1") - ... df2.to_excel(writer, sheet_name="Sheet1", startcol=3) + ... df2.to_excel(writer, sheet_name="Sheet1", startcol=3) # doctest: +SKIP You can store Excel file in RAM: @@ -823,12 +823,12 @@ class ExcelWriter(metaclass=abc.ABCMeta): You can pack Excel file into zip archive: - >>> import zipfile - >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) + >>> import zipfile # doctest: +SKIP + >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf: ... with zf.open("filename.xlsx", "w") as buffer: ... with pd.ExcelWriter(buffer) as writer: - ... df.to_excel(writer) + ... 
df.to_excel(writer) # doctest: +SKIP You can specify additional arguments to the underlying engine: @@ -837,7 +837,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): ... engine="xlsxwriter", ... engine_kwargs={"options": {"nan_inf_to_errors": True}} ... ) as writer: - ... df.to_excel(writer) + ... df.to_excel(writer) # doctest: +SKIP In append mode, ``engine_kwargs`` are passed through to openpyxl's ``load_workbook``: @@ -848,7 +848,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): ... mode="a", ... engine_kwargs={"keep_vba": True} ... ) as writer: - ... df.to_excel(writer, sheet_name="Sheet2") + ... df.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP """ # Defining an ExcelWriter implementation (see abstract methods for more...) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 8bd0942550e6e..5e0a3e1646883 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -71,28 +71,25 @@ def to_pickle( Examples -------- - >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) - >>> original_df + >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP + >>> original_df # doctest: +SKIP foo bar 0 0 5 1 1 6 2 2 7 3 3 8 4 4 9 - >>> pd.to_pickle(original_df, "./dummy.pkl") + >>> pd.to_pickle(original_df, "./dummy.pkl") # doctest: +SKIP - >>> unpickled_df = pd.read_pickle("./dummy.pkl") - >>> unpickled_df + >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP + >>> unpickled_df # doctest: +SKIP foo bar 0 0 5 1 1 6 2 2 7 3 3 8 4 4 9 - - >>> import os - >>> os.remove("./dummy.pkl") - """ + """ # noqa: E501 if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL @@ -165,28 +162,25 @@ def read_pickle( Examples -------- - >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) - >>> original_df + >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP + >>> original_df # doctest: +SKIP foo bar 0 0 5 1 1 6 2 2 7 3 3 8 4 4 9 - >>> pd.to_pickle(original_df, 
"./dummy.pkl") + >>> pd.to_pickle(original_df, "./dummy.pkl") # doctest: +SKIP - >>> unpickled_df = pd.read_pickle("./dummy.pkl") - >>> unpickled_df + >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP + >>> unpickled_df # doctest: +SKIP foo bar 0 0 5 1 1 6 2 2 7 3 3 8 4 4 9 - - >>> import os - >>> os.remove("./dummy.pkl") - """ + """ # noqa: E501 excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError) with get_handle( filepath_or_buffer, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ff9d8a1be3d1e..672d6ec539124 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -172,26 +172,22 @@ Creating a dummy stata for this example >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', ... 'parrot'], -... 'speed': [350, 18, 361, 15]}}) ->>> df.to_stata('animals.dta') +... 'speed': [350, 18, 361, 15]}}) # doctest: +SKIP +>>> df.to_stata('animals.dta') # doctest: +SKIP Read a Stata dta file: ->>> df = pd.read_stata('animals.dta') +>>> df = pd.read_stata('animals.dta') # doctest: +SKIP Read a Stata dta file in 10,000 line chunks: ->>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") ->>> df = pd.DataFrame(values, columns=["i"]) ->>> df.to_stata('filename.dta') +>>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP +>>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP +>>> df.to_stata('filename.dta') # doctest: +SKIP ->>> itr = pd.read_stata('filename.dta', chunksize=10000) +>>> itr = pd.read_stata('filename.dta', chunksize=10000) # doctest: +SKIP >>> for chunk in itr: ... # Operate on a single chunk, e.g., chunk.mean() -... pass - ->>> import os ->>> os.remove("./filename.dta") ->>> os.remove("./animals.dta") +... 
pass # doctest: +SKIP """ _read_method_doc = f"""\ diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 7562895d9db3e..67772dd4eecbd 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -20,6 +20,7 @@ import importlib import io import json +import os import pathlib import subprocess import sys @@ -145,10 +146,17 @@ def examples_errors(self): runner = doctest.DocTestRunner(optionflags=flags) context = {"np": numpy, "pd": pandas} error_msgs = "" + current_dir = set(os.listdir()) for test in finder.find(self.raw_doc, self.name, globs=context): f = io.StringIO() runner.run(test, out=f.write) error_msgs += f.getvalue() + leftover_files = set(os.listdir()).difference(current_dir) + if leftover_files: + error_msgs += ( + f"The following files were leftover from the doctest: " + f"{leftover_files}" + ) return error_msgs @property From f62ea301afdc073269922a55fe152560a405ca8b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 1 Dec 2021 13:54:36 -0800 Subject: [PATCH 2/3] DOC: validate_docstrings cleans up leftover files; doctest +SKIP file examples --- scripts/tests/test_validate_docstrings.py | 15 +++++++++++++++ scripts/validate_docstrings.py | 18 ++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 6ebf9cedeb8e3..dcfef648e8f1c 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -88,6 +88,15 @@ def write_array_like_with_hyphen_not_underscore(self): """ pass + def leftover_files(self): + """ + Examples + -------- + >>> import pathlib + >>> pathlib.Path("foo.txt").touch() + """ + pass + class TestValidator: def _import_path(self, klass=None, func=None): @@ -192,6 +201,12 @@ def test_bad_docstrings(self, capsys, klass, func, msgs): for msg in msgs: assert msg in " ".join([err[1] for err in result["errors"]]) + def 
test_leftover_files_raises(self): + with pytest.raises(Exception, match="The following files"): + validate_docstrings.pandas_validate( + self._import_path(klass="BadDocstrings", func="leftover_files") + ) + def test_validate_all_ignore_deprecated(self, monkeypatch): monkeypatch.setattr( validate_docstrings, diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 67772dd4eecbd..dcb002fd975c4 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -151,12 +151,18 @@ def examples_errors(self): f = io.StringIO() runner.run(test, out=f.write) error_msgs += f.getvalue() - leftover_files = set(os.listdir()).difference(current_dir) - if leftover_files: - error_msgs += ( - f"The following files were leftover from the doctest: " - f"{leftover_files}" - ) + leftovers = set(os.listdir()).difference(current_dir) + if leftovers: + for leftover in leftovers: + path = pathlib.Path(leftover).resolve() + if path.is_dir(): + path.rmdir() + elif path.is_file(): + path.unlink(missing_ok=True) + raise Exception( + f"The following files were leftover from the doctest: " + f"{leftovers}. 
Please use # doctest: +SKIP" + ) return error_msgs @property From 1008fdf61f7c1651489d71035d623eabd99e5fa3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 1 Dec 2021 20:52:00 -0800 Subject: [PATCH 3/3] doctest skip another file example --- pandas/io/pytables.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index eedf00bcd9c76..18b2ff3837a15 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -390,9 +390,9 @@ def read_hdf( Examples -------- - >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) - >>> df.to_hdf('./store.h5', 'data') - >>> reread = pd.read_hdf('./store.h5') + >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP + >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP + >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP """ if mode not in ["r", "r+", "a"]: raise ValueError(