From 53505d27caee7d318b14dd62643e67b6fcbb0ac6 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Mon, 26 Apr 2021 20:38:23 -0600 Subject: [PATCH 01/11] CLN: Deprecate convert_float, GH#41127 --- doc/source/user_guide/io.rst | 9 --------- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/io/excel/_base.py | 19 ++++++++++++++++--- pandas/tests/io/excel/test_readers.py | 20 +++++++++++--------- pandas/tests/io/excel/test_writers.py | 10 ++++++---- 5 files changed, 34 insertions(+), 25 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 5148bb87b0eb0..18f5d882ab53d 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3648,15 +3648,6 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`. df1.to_excel(writer, sheet_name="Sheet1") df2.to_excel(writer, sheet_name="Sheet2") -.. note:: - - Wringing a little more performance out of ``read_excel`` - Internally, Excel stores all numeric data as floats. Because this can - produce unexpected behavior when reading in data, pandas defaults to trying - to convert integers to floats if it doesn't lose information (``1.0 --> - 1``). You can pass ``convert_float=False`` to disable this behavior, which - may give a slight performance improvement. - .. 
_io.excel_writing_buffer: Writing Excel files to memory diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7189c6e68d53d..be6a3844cf8c3 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -611,6 +611,7 @@ Deprecations - Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`) - The ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories` is deprecated and will be removed in a future version (:issue:`37643`) - Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`) +- Deprecated the ``convert_float`` optional argument in :func:`read_excel` and :meth:`ExcelFile.parse` (:issue:`41127`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 3c9dd90c0a0cb..7dc6ed470aa3d 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -245,6 +245,10 @@ Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric data will be read in as floats: Excel stores all numbers as floats internally. + + .. deprecated:: 1.3.0 + convert_float will be removed in a future version + mangle_dupe_cols : bool, default True Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. 
Passing in False will cause data to be overwritten if there @@ -355,7 +359,7 @@ def read_excel( thousands=None, comment=None, skipfooter=0, - convert_float=True, + convert_float=None, mangle_dupe_cols=True, storage_options: StorageOptions = None, ): @@ -489,11 +493,20 @@ def parse( thousands=None, comment=None, skipfooter=0, - convert_float=True, + convert_float=None, mangle_dupe_cols=True, **kwds, ): + if convert_float is None: + convert_float = True + else: + warnings.warn( + "convert_float is deprecated and will be removed in a future version", + FutureWarning, + stacklevel=1, + ) + validate_header_arg(header) ret_dict = False @@ -1225,7 +1238,7 @@ def parse( thousands=None, comment=None, skipfooter=0, - convert_float=True, + convert_float=None, mangle_dupe_cols=True, **kwds, ): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index c4b3221e1d3a7..a187273725d91 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -433,9 +433,10 @@ def test_reader_special_dtypes(self, request, read_ext): float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - actual = pd.read_excel( - basename + read_ext, sheet_name="Sheet1", convert_float=False - ) + with pytest.warns(FutureWarning, match="convert_float is deprecated"): + actual = pd.read_excel( + basename + read_ext, sheet_name="Sheet1", convert_float=False + ) tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) @@ -455,12 +456,13 @@ def test_reader_special_dtypes(self, request, read_ext): no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - actual = pd.read_excel( - basename + read_ext, - sheet_name="Sheet1", - convert_float=False, - converters={"StrCol": str}, - ) + with pytest.warns(FutureWarning, match="convert_float is 
deprecated"): + actual = pd.read_excel( + basename + read_ext, + sheet_name="Sheet1", + convert_float=False, + converters={"StrCol": str}, + ) tm.assert_frame_equal(actual, no_convert_float) # GH8212 - support for converters and missing values diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 67a78f2b1de76..cff9f9abb0f8e 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -474,9 +474,10 @@ def test_int_types(self, np_type, path): float_frame = df.astype(float) float_frame.columns = float_frame.columns.astype(float) float_frame.index = float_frame.index.astype(float) - recons = pd.read_excel( - path, sheet_name="test1", convert_float=False, index_col=0 - ) + with pytest.warns(FutureWarning, match="convert_float is deprecated"): + recons = pd.read_excel( + path, sheet_name="test1", convert_float=False, index_col=0 + ) tm.assert_frame_equal(recons, float_frame) @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) @@ -1293,7 +1294,8 @@ def test_merged_cell_custom_objects(self, merge_cells, path): ) expected = DataFrame(np.ones((2, 2)), columns=mi) expected.to_excel(path) - result = pd.read_excel(path, header=[0, 1], index_col=0, convert_float=False) + with pytest.warns(FutureWarning, match="convert_float is deprecated"): + result = pd.read_excel(path, header=[0, 1], index_col=0, convert_float=False) # need to convert PeriodIndexes to standard Indexes for assert equal expected.columns = expected.columns.set_levels( [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], From 9dbe79179ee3abb4f71a151c682b1671a5898ba8 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Mon, 26 Apr 2021 22:03:43 -0600 Subject: [PATCH 02/11] Linting --- pandas/io/excel/_base.py | 1 - pandas/tests/io/excel/test_readers.py | 4 ++-- pandas/tests/io/excel/test_writers.py | 12 +++++++++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git 
a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 7dc6ed470aa3d..65d947d8cc23b 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -504,7 +504,6 @@ def parse( warnings.warn( "convert_float is deprecated and will be removed in a future version", FutureWarning, - stacklevel=1, ) validate_header_arg(header) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a187273725d91..bb147cf6cb7f7 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -433,7 +433,7 @@ def test_reader_special_dtypes(self, request, read_ext): float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - with pytest.warns(FutureWarning, match="convert_float is deprecated"): + with tm.assert_produces_warning(FutureWarning, match="convert_float is deprecated"): actual = pd.read_excel( basename + read_ext, sheet_name="Sheet1", convert_float=False ) @@ -456,7 +456,7 @@ def test_reader_special_dtypes(self, request, read_ext): no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - with pytest.warns(FutureWarning, match="convert_float is deprecated"): + with tm.assert_produces_warning(FutureWarning, match="convert_float is deprecated"): actual = pd.read_excel( basename + read_ext, sheet_name="Sheet1", diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index cff9f9abb0f8e..77837bea3e48a 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -474,7 +474,9 @@ def test_int_types(self, np_type, path): float_frame = df.astype(float) float_frame.columns = float_frame.columns.astype(float) float_frame.index = float_frame.index.astype(float) - with pytest.warns(FutureWarning, match="convert_float is deprecated"): + with tm.assert_produces_warning( + FutureWarning, 
match="convert_float is deprecated" + ): recons = pd.read_excel( path, sheet_name="test1", convert_float=False, index_col=0 ) @@ -1294,8 +1296,12 @@ def test_merged_cell_custom_objects(self, merge_cells, path): ) expected = DataFrame(np.ones((2, 2)), columns=mi) expected.to_excel(path) - with pytest.warns(FutureWarning, match="convert_float is deprecated"): - result = pd.read_excel(path, header=[0, 1], index_col=0, convert_float=False) + with tm.assert_produces_warning( + FutureWarning, match="convert_float is deprecated" + ): + result = pd.read_excel( + path, header=[0, 1], index_col=0, convert_float=False + ) # need to convert PeriodIndexes to standard Indexes for assert equal expected.columns = expected.columns.set_levels( [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], From a0d98006bb2266c6c0d49ec54bc5bea2f26ca097 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Tue, 27 Apr 2021 07:34:47 -0600 Subject: [PATCH 03/11] Linting 2 --- pandas/tests/io/excel/test_readers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index bb147cf6cb7f7..4d059873831f2 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -433,7 +433,9 @@ def test_reader_special_dtypes(self, request, read_ext): float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - with tm.assert_produces_warning(FutureWarning, match="convert_float is deprecated"): + with tm.assert_produces_warning( + FutureWarning, match="convert_float is deprecated" + ): actual = pd.read_excel( basename + read_ext, sheet_name="Sheet1", convert_float=False ) @@ -456,7 +458,9 @@ def test_reader_special_dtypes(self, request, read_ext): no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - with 
tm.assert_produces_warning(FutureWarning, match="convert_float is deprecated"): + with tm.assert_produces_warning( + FutureWarning, match="convert_float is deprecated" + ): actual = pd.read_excel( basename + read_ext, sheet_name="Sheet1", From 08dcec2ae275bd410055438d21ebec29976531d8 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Tue, 27 Apr 2021 20:11:04 -0600 Subject: [PATCH 04/11] Warning stacklevel --- pandas/io/excel/_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 65d947d8cc23b..cb2acd8a9107c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -504,6 +504,7 @@ def parse( warnings.warn( "convert_float is deprecated and will be removed in a future version", FutureWarning, + stacklevel=3, ) validate_header_arg(header) From 8fdb6262d8cec870d1e56ca3583917f85b13950a Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Wed, 28 Apr 2021 08:35:52 -0600 Subject: [PATCH 05/11] Warning stacklevel=5 --- pandas/io/excel/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index cb2acd8a9107c..871accb3fa2d8 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -504,7 +504,7 @@ def parse( warnings.warn( "convert_float is deprecated and will be removed in a future version", FutureWarning, - stacklevel=3, + stacklevel=5, ) validate_header_arg(header) From 86fadd00eb568d67ef794c9071563d30c0cebb54 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Wed, 28 Apr 2021 10:24:12 -0600 Subject: [PATCH 06/11] TST Don't raise_on_extra_warnings --- pandas/tests/io/excel/test_readers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 4d059873831f2..0e94221e74f27 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -434,7 +434,9 @@ def 
test_reader_special_dtypes(self, request, read_ext): float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 with tm.assert_produces_warning( - FutureWarning, match="convert_float is deprecated" + FutureWarning, + match="convert_float is deprecated", + raise_on_extra_warnings=False, ): actual = pd.read_excel( basename + read_ext, sheet_name="Sheet1", convert_float=False @@ -459,7 +461,9 @@ def test_reader_special_dtypes(self, request, read_ext): no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) with tm.assert_produces_warning( - FutureWarning, match="convert_float is deprecated" + FutureWarning, + match="convert_float is deprecated", + raise_on_extra_warnings=False, ): actual = pd.read_excel( basename + read_ext, From 42d25af1fa1f4d827262b4f4eb6130c2471a1ebc Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Mon, 3 May 2021 15:41:57 -0600 Subject: [PATCH 07/11] Inspect stacklevel Thanks to rhshadrach for this trick --- pandas/io/excel/_base.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 871accb3fa2d8..bd6d14bb1ea58 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -501,10 +501,20 @@ def parse( if convert_float is None: convert_float = True else: + caller = inspect.stack()[2] + if ( + caller.filename.endswith( + os.path.join("pandas", "io", "excel", "_base.py") + ) + and caller.function == "read_excel" + ): + stacklevel = 5 + else: + stacklevel = 3 warnings.warn( "convert_float is deprecated and will be removed in a future version", FutureWarning, - stacklevel=5, + stacklevel=stacklevel, ) validate_header_arg(header) From 59872024b70ef28b36852f38b6457236ea04d5ba Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Mon, 3 May 2021 20:58:51 -0600 Subject: [PATCH 08/11] Add comment about raise_on_extra_warnings --- 
pandas/tests/io/excel/test_readers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 0e94221e74f27..e4069dd8c1eba 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -465,6 +465,9 @@ def test_reader_special_dtypes(self, request, read_ext): match="convert_float is deprecated", raise_on_extra_warnings=False, ): + # raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning + # on database job Linux_py37_IO (ci/deps/actions-37-db.yaml) + # See GH#41176 actual = pd.read_excel( basename + read_ext, sheet_name="Sheet1", From 791ec0b9e69a05fd8b57e79444ab336bfb0e520c Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Tue, 4 May 2021 08:01:35 -0600 Subject: [PATCH 09/11] 2nd comment for raise_on_extra_warnings --- pandas/tests/io/excel/test_readers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index e4069dd8c1eba..d1a4915382ad2 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -438,6 +438,9 @@ def test_reader_special_dtypes(self, request, read_ext): match="convert_float is deprecated", raise_on_extra_warnings=False, ): + # raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning + # on database job Linux_py37_IO (ci/deps/actions-37-db.yaml) + # See GH#41176 actual = pd.read_excel( basename + read_ext, sheet_name="Sheet1", convert_float=False ) From f8ef79707e86c66ef5d2c7428d9553cad8abf583 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Wed, 12 May 2021 12:56:40 -0600 Subject: [PATCH 10/11] CLN: consolidate stack inspection into a function This one is named after pandas.util._exceptions.find_stack_level --- pandas/io/excel/_base.py | 24 +++--------------------- pandas/io/excel/_util.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 21 deletions(-) diff --git
a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index bd6d14bb1ea58..267bff1e0d589 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,7 +3,6 @@ import abc import datetime from distutils.version import LooseVersion -import inspect from io import BytesIO import os from textwrap import fill @@ -53,6 +52,7 @@ ) from pandas.io.excel._util import ( fill_mi_header, + find_stack_level, get_default_engine, get_writer, maybe_convert_usecols, @@ -501,16 +501,7 @@ def parse( if convert_float is None: convert_float = True else: - caller = inspect.stack()[2] - if ( - caller.filename.endswith( - os.path.join("pandas", "io", "excel", "_base.py") - ) - and caller.function == "read_excel" - ): - stacklevel = 5 - else: - stacklevel = 3 + stacklevel = find_stack_level() warnings.warn( "convert_float is deprecated and will be removed in a future version", FutureWarning, @@ -1203,16 +1194,7 @@ def __init__( f"only the xls format is supported. Install openpyxl instead." ) elif ext != "xls": - caller = inspect.stack()[1] - if ( - caller.filename.endswith( - os.path.join("pandas", "io", "excel", "_base.py") - ) - and caller.function == "read_excel" - ): - stacklevel = 4 - else: - stacklevel = 2 + stacklevel = find_stack_level() warnings.warn( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. 
Install " diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 6612b681a9171..3545e6d4f90e6 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,3 +1,5 @@ +import inspect +import os from typing import ( List, MutableMapping, @@ -247,3 +249,20 @@ def pop_header_name(row, index_col): header_name = None if header_name == "" else header_name return header_name, row[:i] + [""] + row[i + 1 :] + + +def find_stack_level() -> int: + """ + Find the appropriate stacklevel for warnings from read_excel and ExcelFile + """ + stack = inspect.stack() + path = os.path.join("pandas", "io", "excel", "_base.py") + n = -1 + for i, frame in enumerate(stack): + if frame.filename.endswith(path): + n = i + if i == -1: + raise RuntimeError( + "find_stack_level should only be called from within pandas.io.excel" + ) + return n + 2 # due to pandas.utils._decorators.wrapper From a0f26771e65dcb5a547e995a9cca26d6b8fd9704 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Tue, 25 May 2021 10:36:19 -0600 Subject: [PATCH 11/11] Use find_stack_level from pandas.util._exceptions That routine was recently generalized and works great --- pandas/io/excel/_base.py | 2 +- pandas/io/excel/_util.py | 19 ------------------- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index d69445dff71ec..42ca68376452d 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -32,6 +32,7 @@ deprecate_nonkeyword_arguments, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_bool, @@ -52,7 +53,6 @@ ) from pandas.io.excel._util import ( fill_mi_header, - find_stack_level, get_default_engine, get_writer, maybe_convert_usecols, diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 3545e6d4f90e6..6612b681a9171 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,5 +1,3 @@ -import inspect -import os from typing import (
List, MutableMapping, @@ -249,20 +247,3 @@ def pop_header_name(row, index_col): header_name = None if header_name == "" else header_name return header_name, row[:i] + [""] + row[i + 1 :] - - -def find_stack_level() -> int: - """ - Find the appropriate stacklevel for warnings from read_excel and ExcelFile - """ - stack = inspect.stack() - path = os.path.join("pandas", "io", "excel", "_base.py") - n = -1 - for i, frame in enumerate(stack): - if frame.filename.endswith(path): - n = i - if i == -1: - raise RuntimeError( - "find_stack_level should only be called from within pandas.io.excel" - ) - return n + 2 # due to pandas.utils._decorators.wrapper