Skip to content

Commit af14af5

Browse files
phofl authored and noatamir committed
DEP: Enforce deprecation of mangle_dupe_cols and convert_float in read_excel (pandas-dev#49089)
* DEP: Enforce deprecation of mangle_dupe_cols and convert_float in read_excel
* Remove test
* Switch to int
* Fix
* Add whatsnew
* Fix whatsnew
* Address review
1 parent 7c7cd10 commit af14af5

File tree

8 files changed

+30
-144
lines changed

8 files changed

+30
-144
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,8 @@ Removal of prior version deprecations/changes
180180
- Removed argument ``try_cast`` from :meth:`DataFrame.mask`, :meth:`DataFrame.where`, :meth:`Series.mask` and :meth:`Series.where` (:issue:`38836`)
181181
- Removed argument ``is_copy`` from :meth:`DataFrame.take` and :meth:`Series.take` (:issue:`30615`)
182182
- Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`)
183+
- Removed keywords ``convert_float`` and ``mangle_dupe_cols`` from :func:`read_excel` (:issue:`41176`)
184+
- Disallow passing non-keyword arguments to :func:`read_excel` except ``io`` and ``sheet_name`` (:issue:`34418`)
183185
- Removed :meth:`.Rolling.validate`, :meth:`.Expanding.validate`, and :meth:`.ExponentialMovingWindow.validate` (:issue:`43665`)
184186
- Removed :attr:`Rolling.win_type` returning ``"freq"`` (:issue:`38963`)
185187
- Removed :attr:`Rolling.is_datetimelike` (:issue:`38963`)

pandas/io/excel/_base.py

+5-47
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,6 @@
4242
from pandas.errors import EmptyDataError
4343
from pandas.util._decorators import (
4444
Appender,
45-
deprecate_kwarg,
46-
deprecate_nonkeyword_arguments,
4745
doc,
4846
)
4947
from pandas.util._exceptions import find_stack_level
@@ -269,23 +267,6 @@
269267
comment string and the end of the current line is ignored.
270268
skipfooter : int, default 0
271269
Rows at the end to skip (0-indexed).
272-
convert_float : bool, default True
273-
Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
274-
data will be read in as floats: Excel stores all numbers as floats
275-
internally.
276-
277-
.. deprecated:: 1.3.0
278-
convert_float will be removed in a future version
279-
280-
mangle_dupe_cols : bool, default True
281-
Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
282-
'X'...'X'. Passing in False will cause data to be overwritten if there
283-
are duplicate names in the columns.
284-
285-
.. deprecated:: 1.5.0
286-
Not implemented, and a new argument to specify the pattern for the
287-
names of duplicated columns will be added instead
288-
289270
{storage_options}
290271
291272
.. versionadded:: 1.2.0
@@ -365,6 +346,7 @@ def read_excel(
365346
io,
366347
# sheet name is str or int -> DataFrame
367348
sheet_name: str | int = ...,
349+
*,
368350
header: int | Sequence[int] | None = ...,
369351
names: list[str] | None = ...,
370352
index_col: int | Sequence[int] | None = ...,
@@ -392,8 +374,6 @@ def read_excel(
392374
decimal: str = ...,
393375
comment: str | None = ...,
394376
skipfooter: int = ...,
395-
convert_float: bool | None = ...,
396-
mangle_dupe_cols: bool = ...,
397377
storage_options: StorageOptions = ...,
398378
) -> DataFrame:
399379
...
@@ -404,6 +384,7 @@ def read_excel(
404384
io,
405385
# sheet name is list or None -> dict[IntStrT, DataFrame]
406386
sheet_name: list[IntStrT] | None,
387+
*,
407388
header: int | Sequence[int] | None = ...,
408389
names: list[str] | None = ...,
409390
index_col: int | Sequence[int] | None = ...,
@@ -431,20 +412,17 @@ def read_excel(
431412
decimal: str = ...,
432413
comment: str | None = ...,
433414
skipfooter: int = ...,
434-
convert_float: bool | None = ...,
435-
mangle_dupe_cols: bool = ...,
436415
storage_options: StorageOptions = ...,
437416
) -> dict[IntStrT, DataFrame]:
438417
...
439418

440419

441420
@doc(storage_options=_shared_docs["storage_options"])
442-
@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
443-
@deprecate_nonkeyword_arguments(allowed_args=["io", "sheet_name"], version="2.0")
444421
@Appender(_read_excel_doc)
445422
def read_excel(
446423
io,
447424
sheet_name: str | int | list[IntStrT] | None = 0,
425+
*,
448426
header: int | Sequence[int] | None = 0,
449427
names: list[str] | None = None,
450428
index_col: int | Sequence[int] | None = None,
@@ -472,8 +450,6 @@ def read_excel(
472450
decimal: str = ".",
473451
comment: str | None = None,
474452
skipfooter: int = 0,
475-
convert_float: bool | None = None,
476-
mangle_dupe_cols: bool = True,
477453
storage_options: StorageOptions = None,
478454
) -> DataFrame | dict[IntStrT, DataFrame]:
479455

@@ -511,8 +487,6 @@ def read_excel(
511487
decimal=decimal,
512488
comment=comment,
513489
skipfooter=skipfooter,
514-
convert_float=convert_float,
515-
mangle_dupe_cols=mangle_dupe_cols,
516490
)
517491
finally:
518492
# make sure to close opened file handles
@@ -588,7 +562,7 @@ def get_sheet_by_index(self, index: int):
588562
pass
589563

590564
@abc.abstractmethod
591-
def get_sheet_data(self, sheet, convert_float: bool, rows: int | None = None):
565+
def get_sheet_data(self, sheet, rows: int | None = None):
592566
pass
593567

594568
def raise_if_bad_sheet_by_index(self, index: int) -> None:
@@ -716,20 +690,9 @@ def parse(
716690
decimal: str = ".",
717691
comment: str | None = None,
718692
skipfooter: int = 0,
719-
convert_float: bool | None = None,
720-
mangle_dupe_cols: bool = True,
721693
**kwds,
722694
):
723695

724-
if convert_float is None:
725-
convert_float = True
726-
else:
727-
warnings.warn(
728-
"convert_float is deprecated and will be removed in a future version.",
729-
FutureWarning,
730-
stacklevel=find_stack_level(),
731-
)
732-
733696
validate_header_arg(header)
734697
validate_integer("nrows", nrows)
735698

@@ -763,7 +726,7 @@ def parse(
763726
sheet = self.get_sheet_by_index(asheetname)
764727

765728
file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
766-
data = self.get_sheet_data(sheet, convert_float, file_rows_needed)
729+
data = self.get_sheet_data(sheet, file_rows_needed)
767730
if hasattr(sheet, "close"):
768731
# pyxlsb opens two TemporaryFiles
769732
sheet.close()
@@ -885,7 +848,6 @@ def parse(
885848
comment=comment,
886849
skipfooter=skipfooter,
887850
usecols=usecols,
888-
mangle_dupe_cols=mangle_dupe_cols,
889851
**kwds,
890852
)
891853

@@ -1718,8 +1680,6 @@ def parse(
17181680
thousands: str | None = None,
17191681
comment: str | None = None,
17201682
skipfooter: int = 0,
1721-
convert_float: bool | None = None,
1722-
mangle_dupe_cols: bool = True,
17231683
**kwds,
17241684
) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
17251685
"""
@@ -1751,8 +1711,6 @@ def parse(
17511711
thousands=thousands,
17521712
comment=comment,
17531713
skipfooter=skipfooter,
1754-
convert_float=convert_float,
1755-
mangle_dupe_cols=mangle_dupe_cols,
17561714
**kwds,
17571715
)
17581716

pandas/io/excel/_odfreader.py

+6-7
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def get_sheet_by_name(self, name: str):
9090
raise ValueError(f"sheet {name} not found")
9191

9292
def get_sheet_data(
93-
self, sheet, convert_float: bool, file_rows_needed: int | None = None
93+
self, sheet, file_rows_needed: int | None = None
9494
) -> list[list[Scalar | NaTType]]:
9595
"""
9696
Parse an ODF Table into a list of lists
@@ -122,7 +122,7 @@ def get_sheet_data(
122122

123123
for sheet_cell in sheet_cells:
124124
if sheet_cell.qname == table_cell_name:
125-
value = self._get_cell_value(sheet_cell, convert_float)
125+
value = self._get_cell_value(sheet_cell)
126126
else:
127127
value = self.empty_value
128128

@@ -183,7 +183,7 @@ def _is_empty_row(self, row) -> bool:
183183

184184
return True
185185

186-
def _get_cell_value(self, cell, convert_float: bool) -> Scalar | NaTType:
186+
def _get_cell_value(self, cell) -> Scalar | NaTType:
187187
from odf.namespaces import OFFICENS
188188

189189
if str(cell) == "#N/A":
@@ -199,10 +199,9 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar | NaTType:
199199
elif cell_type == "float":
200200
# GH5394
201201
cell_value = float(cell.attributes.get((OFFICENS, "value")))
202-
if convert_float:
203-
val = int(cell_value)
204-
if val == cell_value:
205-
return val
202+
val = int(cell_value)
203+
if val == cell_value:
204+
return val
206205
return cell_value
207206
elif cell_type == "percentage":
208207
cell_value = cell.attributes.get((OFFICENS, "value"))

pandas/io/excel/_openpyxl.py

+7-10
Original file line numberDiff line numberDiff line change
@@ -581,7 +581,7 @@ def get_sheet_by_index(self, index: int):
581581
self.raise_if_bad_sheet_by_index(index)
582582
return self.book.worksheets[index]
583583

584-
def _convert_cell(self, cell, convert_float: bool) -> Scalar:
584+
def _convert_cell(self, cell) -> Scalar:
585585

586586
from openpyxl.cell.cell import (
587587
TYPE_ERROR,
@@ -593,18 +593,15 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
593593
elif cell.data_type == TYPE_ERROR:
594594
return np.nan
595595
elif cell.data_type == TYPE_NUMERIC:
596-
# GH5394, GH46988
597-
if convert_float:
598-
val = int(cell.value)
599-
if val == cell.value:
600-
return val
601-
else:
602-
return float(cell.value)
596+
val = int(cell.value)
597+
if val == cell.value:
598+
return val
599+
return float(cell.value)
603600

604601
return cell.value
605602

606603
def get_sheet_data(
607-
self, sheet, convert_float: bool, file_rows_needed: int | None = None
604+
self, sheet, file_rows_needed: int | None = None
608605
) -> list[list[Scalar]]:
609606

610607
if self.book.read_only:
@@ -613,7 +610,7 @@ def get_sheet_data(
613610
data: list[list[Scalar]] = []
614611
last_row_with_data = -1
615612
for row_number, row in enumerate(sheet.rows):
616-
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
613+
converted_row = [self._convert_cell(cell) for cell in row]
617614
while converted_row and converted_row[-1] == "":
618615
# trim trailing empty elements
619616
converted_row.pop()

pandas/io/excel/_pyxlsb.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,12 @@ def get_sheet_by_index(self, index: int):
6565
# There's a fix for this in the source, but the pypi package doesn't have it
6666
return self.book.get_sheet(index + 1)
6767

68-
def _convert_cell(self, cell, convert_float: bool) -> Scalar:
68+
def _convert_cell(self, cell) -> Scalar:
6969
# TODO: there is no way to distinguish between floats and datetimes in pyxlsb
7070
# This means that there is no way to read datetime types from an xlsb file yet
7171
if cell.v is None:
7272
return "" # Prevents non-named columns from not showing up as Unnamed: i
73-
if isinstance(cell.v, float) and convert_float:
73+
if isinstance(cell.v, float):
7474
val = int(cell.v)
7575
if val == cell.v:
7676
return val
@@ -82,7 +82,6 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
8282
def get_sheet_data(
8383
self,
8484
sheet,
85-
convert_float: bool,
8685
file_rows_needed: int | None = None,
8786
) -> list[list[Scalar]]:
8887
data: list[list[Scalar]] = []
@@ -91,7 +90,7 @@ def get_sheet_data(
9190
# not returned. The cells are namedtuples of row, col, value (r, c, v).
9291
for row in sheet.rows(sparse=True):
9392
row_number = row[0].r
94-
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
93+
converted_row = [self._convert_cell(cell) for cell in row]
9594
while converted_row and converted_row[-1] == "":
9695
# trim trailing empty elements
9796
converted_row.pop()

pandas/io/excel/_xlrd.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def get_sheet_by_index(self, index):
6262
return self.book.sheet_by_index(index)
6363

6464
def get_sheet_data(
65-
self, sheet, convert_float: bool, file_rows_needed: int | None = None
65+
self, sheet, file_rows_needed: int | None = None
6666
) -> list[list[Scalar]]:
6767
from xlrd import (
6868
XL_CELL_BOOLEAN,
@@ -104,7 +104,7 @@ def _parse_cell(cell_contents, cell_typ):
104104
cell_contents = np.nan
105105
elif cell_typ == XL_CELL_BOOLEAN:
106106
cell_contents = bool(cell_contents)
107-
elif convert_float and cell_typ == XL_CELL_NUMBER:
107+
elif cell_typ == XL_CELL_NUMBER:
108108
# GH5394 - Excel 'numbers' are always floats
109109
# it's a minimal perf hit and less surprising
110110
val = int(cell_contents)

pandas/tests/io/excel/test_readers.py

+2-35
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,6 @@ def test_reader_special_dtypes(self, request, read_ext):
405405
"FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005],
406406
"BoolCol": [True, False, True, True, False],
407407
"StrCol": [1, 2, 3, 4, 5],
408-
# GH5394 - this is why convert_float isn't vectorized
409408
"Str2Col": ["a", 3, "c", "d", "e"],
410409
"DateCol": [
411410
datetime(2013, 10, 30),
@@ -424,19 +423,8 @@ def test_reader_special_dtypes(self, request, read_ext):
424423

425424
# if not coercing number, then int comes in as float
426425
float_expected = expected.copy()
427-
float_expected["IntCol"] = float_expected["IntCol"].astype(float)
428426
float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0
429-
with tm.assert_produces_warning(
430-
FutureWarning,
431-
match="convert_float is deprecated",
432-
raise_on_extra_warnings=False,
433-
):
434-
# raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning
435-
# on database job Linux_py37_IO (ci/deps/actions-37-db.yaml)
436-
# See GH#41176
437-
actual = pd.read_excel(
438-
basename + read_ext, sheet_name="Sheet1", convert_float=False
439-
)
427+
actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1")
440428
tm.assert_frame_equal(actual, float_expected)
441429

442430
# check setting Index (assuming xls and xlsx are the same here)
@@ -447,31 +435,12 @@ def test_reader_special_dtypes(self, request, read_ext):
447435
exp = expected.set_index(name)
448436
tm.assert_frame_equal(actual, exp)
449437

450-
# convert_float and converters should be different but both accepted
451438
expected["StrCol"] = expected["StrCol"].apply(str)
452439
actual = pd.read_excel(
453440
basename + read_ext, sheet_name="Sheet1", converters={"StrCol": str}
454441
)
455442
tm.assert_frame_equal(actual, expected)
456443

457-
no_convert_float = float_expected.copy()
458-
no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
459-
with tm.assert_produces_warning(
460-
FutureWarning,
461-
match="convert_float is deprecated",
462-
raise_on_extra_warnings=False,
463-
):
464-
# raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning
465-
# on database job Linux_py37_IO (ci/deps/actions-37-db.yaml)
466-
# See GH#41176
467-
actual = pd.read_excel(
468-
basename + read_ext,
469-
sheet_name="Sheet1",
470-
convert_float=False,
471-
converters={"StrCol": str},
472-
)
473-
tm.assert_frame_equal(actual, no_convert_float)
474-
475444
# GH8212 - support for converters and missing values
476445
def test_reader_converters(self, read_ext):
477446

@@ -1276,11 +1245,9 @@ def test_read_excel_squeeze(self, read_ext):
12761245
tm.assert_series_equal(actual, expected)
12771246

12781247
def test_deprecated_kwargs(self, read_ext):
1279-
with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
1248+
with pytest.raises(TypeError, match="but 3 positional arguments"):
12801249
pd.read_excel("test1" + read_ext, "Sheet1", 0)
12811250

1282-
pd.read_excel("test1" + read_ext)
1283-
12841251
def test_no_header_with_list_index_col(self, read_ext):
12851252
# GH 31783
12861253
file_name = "testmultiindex" + read_ext

0 commit comments

Comments
 (0)