DEP: Enforce deprecation of mangle_dup cols and convert_float in read_excel (pandas-dev#49089)

phofl · noatamir · commit af14af5c5975 · 2022-11-09T22:58:23.000+01:00
* DEP: Enforce deprecation of mangle_dup cols and convert_float in read_excel

* Remove test

* Switch to int

* Fix

* Add whatsnew

* Fix whatsnew

* Adress review
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -180,6 +180,8 @@ Removal of prior version deprecations/changes
 - Removed argument ``try_cast`` from :meth:`DataFrame.mask`, :meth:`DataFrame.where`, :meth:`Series.mask` and :meth:`Series.where` (:issue:`38836`)
 - Removed argument ``is_copy`` from :meth:`DataFrame.take` and :meth:`Series.take` (:issue:`30615`)
 - Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`)
+- Remove keywords ``convert_float`` and ``mangle_dupe_cols`` from :func:`read_excel` (:issue:`41176`)
+- Disallow passing non-keyword arguments to :func:`read_excel` except ``io`` and ``sheet_name`` (:issue:`34418`)
 - Removed :meth:`.Rolling.validate`, :meth:`.Expanding.validate`, and :meth:`.ExponentialMovingWindow.validate` (:issue:`43665`)
 - Removed :attr:`Rolling.win_type` returning ``"freq"`` (:issue:`38963`)
 - Removed :attr:`Rolling.is_datetimelike` (:issue:`38963`)
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -42,8 +42,6 @@
 from pandas.errors import EmptyDataError
 from pandas.util._decorators import (
     Appender,
-    deprecate_kwarg,
-    deprecate_nonkeyword_arguments,
     doc,
 )
 from pandas.util._exceptions import find_stack_level
@@ -269,23 +267,6 @@
     comment string and the end of the current line is ignored.
 skipfooter : int, default 0
     Rows at the end to skip (0-indexed).
-convert_float : bool, default True
-    Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
-    data will be read in as floats: Excel stores all numbers as floats
-    internally.
-
-    .. deprecated:: 1.3.0
-        convert_float will be removed in a future version
-
-mangle_dupe_cols : bool, default True
-    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
-    'X'...'X'. Passing in False will cause data to be overwritten if there
-    are duplicate names in the columns.
-
-    .. deprecated:: 1.5.0
-        Not implemented, and a new argument to specify the pattern for the
-        names of duplicated columns will be added instead
-
 {storage_options}
 
     .. versionadded:: 1.2.0
@@ -365,6 +346,7 @@ def read_excel(
     io,
     # sheet name is str or int -> DataFrame
     sheet_name: str | int = ...,
+    *,
     header: int | Sequence[int] | None = ...,
     names: list[str] | None = ...,
     index_col: int | Sequence[int] | None = ...,
@@ -392,8 +374,6 @@ def read_excel(
     decimal: str = ...,
     comment: str | None = ...,
     skipfooter: int = ...,
-    convert_float: bool | None = ...,
-    mangle_dupe_cols: bool = ...,
     storage_options: StorageOptions = ...,
 ) -> DataFrame:
     ...
@@ -404,6 +384,7 @@ def read_excel(
     io,
     # sheet name is list or None -> dict[IntStrT, DataFrame]
     sheet_name: list[IntStrT] | None,
+    *,
     header: int | Sequence[int] | None = ...,
     names: list[str] | None = ...,
     index_col: int | Sequence[int] | None = ...,
@@ -431,20 +412,17 @@ def read_excel(
     decimal: str = ...,
     comment: str | None = ...,
     skipfooter: int = ...,
-    convert_float: bool | None = ...,
-    mangle_dupe_cols: bool = ...,
     storage_options: StorageOptions = ...,
 ) -> dict[IntStrT, DataFrame]:
     ...
 
 
 @doc(storage_options=_shared_docs["storage_options"])
-@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
-@deprecate_nonkeyword_arguments(allowed_args=["io", "sheet_name"], version="2.0")
 @Appender(_read_excel_doc)
 def read_excel(
     io,
     sheet_name: str | int | list[IntStrT] | None = 0,
+    *,
     header: int | Sequence[int] | None = 0,
     names: list[str] | None = None,
     index_col: int | Sequence[int] | None = None,
@@ -472,8 +450,6 @@ def read_excel(
     decimal: str = ".",
     comment: str | None = None,
     skipfooter: int = 0,
-    convert_float: bool | None = None,
-    mangle_dupe_cols: bool = True,
     storage_options: StorageOptions = None,
 ) -> DataFrame | dict[IntStrT, DataFrame]:
 
@@ -511,8 +487,6 @@ def read_excel(
             decimal=decimal,
             comment=comment,
             skipfooter=skipfooter,
-            convert_float=convert_float,
-            mangle_dupe_cols=mangle_dupe_cols,
         )
     finally:
         # make sure to close opened file handles
@@ -588,7 +562,7 @@ def get_sheet_by_index(self, index: int):
         pass
 
     @abc.abstractmethod
-    def get_sheet_data(self, sheet, convert_float: bool, rows: int | None = None):
+    def get_sheet_data(self, sheet, rows: int | None = None):
         pass
 
     def raise_if_bad_sheet_by_index(self, index: int) -> None:
@@ -716,20 +690,9 @@ def parse(
         decimal: str = ".",
         comment: str | None = None,
         skipfooter: int = 0,
-        convert_float: bool | None = None,
-        mangle_dupe_cols: bool = True,
         **kwds,
     ):
 
-        if convert_float is None:
-            convert_float = True
-        else:
-            warnings.warn(
-                "convert_float is deprecated and will be removed in a future version.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-
         validate_header_arg(header)
         validate_integer("nrows", nrows)
 
@@ -763,7 +726,7 @@ def parse(
                 sheet = self.get_sheet_by_index(asheetname)
 
             file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
-            data = self.get_sheet_data(sheet, convert_float, file_rows_needed)
+            data = self.get_sheet_data(sheet, file_rows_needed)
             if hasattr(sheet, "close"):
                 # pyxlsb opens two TemporaryFiles
                 sheet.close()
@@ -885,7 +848,6 @@ def parse(
                     comment=comment,
                     skipfooter=skipfooter,
                     usecols=usecols,
-                    mangle_dupe_cols=mangle_dupe_cols,
                     **kwds,
                 )
 
@@ -1718,8 +1680,6 @@ def parse(
         thousands: str | None = None,
         comment: str | None = None,
         skipfooter: int = 0,
-        convert_float: bool | None = None,
-        mangle_dupe_cols: bool = True,
         **kwds,
     ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
         """
@@ -1751,8 +1711,6 @@ def parse(
             thousands=thousands,
             comment=comment,
             skipfooter=skipfooter,
-            convert_float=convert_float,
-            mangle_dupe_cols=mangle_dupe_cols,
             **kwds,
         )
 
diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
@@ -90,7 +90,7 @@ def get_sheet_by_name(self, name: str):
         raise ValueError(f"sheet {name} not found")
 
     def get_sheet_data(
-        self, sheet, convert_float: bool, file_rows_needed: int | None = None
+        self, sheet, file_rows_needed: int | None = None
     ) -> list[list[Scalar | NaTType]]:
         """
         Parse an ODF Table into a list of lists
@@ -122,7 +122,7 @@ def get_sheet_data(
 
             for sheet_cell in sheet_cells:
                 if sheet_cell.qname == table_cell_name:
-                    value = self._get_cell_value(sheet_cell, convert_float)
+                    value = self._get_cell_value(sheet_cell)
                 else:
                     value = self.empty_value
 
@@ -183,7 +183,7 @@ def _is_empty_row(self, row) -> bool:
 
         return True
 
-    def _get_cell_value(self, cell, convert_float: bool) -> Scalar | NaTType:
+    def _get_cell_value(self, cell) -> Scalar | NaTType:
         from odf.namespaces import OFFICENS
 
         if str(cell) == "#N/A":
@@ -199,10 +199,9 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar | NaTType:
         elif cell_type == "float":
             # GH5394
             cell_value = float(cell.attributes.get((OFFICENS, "value")))
-            if convert_float:
-                val = int(cell_value)
-                if val == cell_value:
-                    return val
+            val = int(cell_value)
+            if val == cell_value:
+                return val
             return cell_value
         elif cell_type == "percentage":
             cell_value = cell.attributes.get((OFFICENS, "value"))
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
@@ -581,7 +581,7 @@ def get_sheet_by_index(self, index: int):
         self.raise_if_bad_sheet_by_index(index)
         return self.book.worksheets[index]
 
-    def _convert_cell(self, cell, convert_float: bool) -> Scalar:
+    def _convert_cell(self, cell) -> Scalar:
 
         from openpyxl.cell.cell import (
             TYPE_ERROR,
@@ -593,18 +593,15 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
         elif cell.data_type == TYPE_ERROR:
             return np.nan
         elif cell.data_type == TYPE_NUMERIC:
-            # GH5394, GH46988
-            if convert_float:
-                val = int(cell.value)
-                if val == cell.value:
-                    return val
-            else:
-                return float(cell.value)
+            val = int(cell.value)
+            if val == cell.value:
+                return val
+            return float(cell.value)
 
         return cell.value
 
     def get_sheet_data(
-        self, sheet, convert_float: bool, file_rows_needed: int | None = None
+        self, sheet, file_rows_needed: int | None = None
     ) -> list[list[Scalar]]:
 
         if self.book.read_only:
@@ -613,7 +610,7 @@ def get_sheet_data(
         data: list[list[Scalar]] = []
         last_row_with_data = -1
         for row_number, row in enumerate(sheet.rows):
-            converted_row = [self._convert_cell(cell, convert_float) for cell in row]
+            converted_row = [self._convert_cell(cell) for cell in row]
             while converted_row and converted_row[-1] == "":
                 # trim trailing empty elements
                 converted_row.pop()
diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
@@ -65,12 +65,12 @@ def get_sheet_by_index(self, index: int):
         # There's a fix for this in the source, but the pypi package doesn't have it
         return self.book.get_sheet(index + 1)
 
-    def _convert_cell(self, cell, convert_float: bool) -> Scalar:
+    def _convert_cell(self, cell) -> Scalar:
         # TODO: there is no way to distinguish between floats and datetimes in pyxlsb
         # This means that there is no way to read datetime types from an xlsb file yet
         if cell.v is None:
             return ""  # Prevents non-named columns from not showing up as Unnamed: i
-        if isinstance(cell.v, float) and convert_float:
+        if isinstance(cell.v, float):
             val = int(cell.v)
             if val == cell.v:
                 return val
@@ -82,7 +82,6 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
     def get_sheet_data(
         self,
         sheet,
-        convert_float: bool,
         file_rows_needed: int | None = None,
     ) -> list[list[Scalar]]:
         data: list[list[Scalar]] = []
@@ -91,7 +90,7 @@ def get_sheet_data(
         # not returned. The cells are namedtuples of row, col, value (r, c, v).
         for row in sheet.rows(sparse=True):
             row_number = row[0].r
-            converted_row = [self._convert_cell(cell, convert_float) for cell in row]
+            converted_row = [self._convert_cell(cell) for cell in row]
             while converted_row and converted_row[-1] == "":
                 # trim trailing empty elements
                 converted_row.pop()
diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
@@ -62,7 +62,7 @@ def get_sheet_by_index(self, index):
         return self.book.sheet_by_index(index)
 
     def get_sheet_data(
-        self, sheet, convert_float: bool, file_rows_needed: int | None = None
+        self, sheet, file_rows_needed: int | None = None
     ) -> list[list[Scalar]]:
         from xlrd import (
             XL_CELL_BOOLEAN,
@@ -104,7 +104,7 @@ def _parse_cell(cell_contents, cell_typ):
                 cell_contents = np.nan
             elif cell_typ == XL_CELL_BOOLEAN:
                 cell_contents = bool(cell_contents)
-            elif convert_float and cell_typ == XL_CELL_NUMBER:
+            elif cell_typ == XL_CELL_NUMBER:
                 # GH5394 - Excel 'numbers' are always floats
                 # it's a minimal perf hit and less surprising
                 val = int(cell_contents)
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -405,7 +405,6 @@ def test_reader_special_dtypes(self, request, read_ext):
                 "FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005],
                 "BoolCol": [True, False, True, True, False],
                 "StrCol": [1, 2, 3, 4, 5],
-                # GH5394 - this is why convert_float isn't vectorized
                 "Str2Col": ["a", 3, "c", "d", "e"],
                 "DateCol": [
                     datetime(2013, 10, 30),
@@ -424,19 +423,8 @@ def test_reader_special_dtypes(self, request, read_ext):
 
         # if not coercing number, then int comes in as float
         float_expected = expected.copy()
-        float_expected["IntCol"] = float_expected["IntCol"].astype(float)
         float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0
-        with tm.assert_produces_warning(
-            FutureWarning,
-            match="convert_float is deprecated",
-            raise_on_extra_warnings=False,
-        ):
-            # raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning
-            # on database job Linux_py37_IO (ci/deps/actions-37-db.yaml)
-            # See GH#41176
-            actual = pd.read_excel(
-                basename + read_ext, sheet_name="Sheet1", convert_float=False
-            )
+        actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1")
         tm.assert_frame_equal(actual, float_expected)
 
         # check setting Index (assuming xls and xlsx are the same here)
@@ -447,31 +435,12 @@ def test_reader_special_dtypes(self, request, read_ext):
             exp = expected.set_index(name)
             tm.assert_frame_equal(actual, exp)
 
-        # convert_float and converters should be different but both accepted
         expected["StrCol"] = expected["StrCol"].apply(str)
         actual = pd.read_excel(
             basename + read_ext, sheet_name="Sheet1", converters={"StrCol": str}
         )
         tm.assert_frame_equal(actual, expected)
 
-        no_convert_float = float_expected.copy()
-        no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
-        with tm.assert_produces_warning(
-            FutureWarning,
-            match="convert_float is deprecated",
-            raise_on_extra_warnings=False,
-        ):
-            # raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning
-            # on database job Linux_py37_IO (ci/deps/actions-37-db.yaml)
-            # See GH#41176
-            actual = pd.read_excel(
-                basename + read_ext,
-                sheet_name="Sheet1",
-                convert_float=False,
-                converters={"StrCol": str},
-            )
-        tm.assert_frame_equal(actual, no_convert_float)
-
     # GH8212 - support for converters and missing values
     def test_reader_converters(self, read_ext):
 
@@ -1276,11 +1245,9 @@ def test_read_excel_squeeze(self, read_ext):
             tm.assert_series_equal(actual, expected)
 
     def test_deprecated_kwargs(self, read_ext):
-        with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
+        with pytest.raises(TypeError, match="but 3 positional arguments"):
             pd.read_excel("test1" + read_ext, "Sheet1", 0)
 
-        pd.read_excel("test1" + read_ext)
-
     def test_no_header_with_list_index_col(self, read_ext):
         # GH 31783
         file_name = "testmultiindex" + read_ext
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py