
Commit 22e591f

ENH: Add use nullable dtypes to read_excel (#49091)

1 parent: 0bd52be

File tree

5 files changed: +105 −3 lines

doc/source/whatsnew/v2.0.0.rst

+1 −1

@@ -41,7 +41,7 @@ Other enhancements
 - :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`)
 - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
 - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
-- Added new argument ``use_nullable_dtypes`` to :func:`read_csv` to enable automatic conversion to nullable dtypes (:issue:`36712`)
+- Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`)
 - Added new global configuration, ``io.nullable_backend`` to allow ``use_nullable_dtypes=True`` to return pyarrow-backed dtypes when set to ``"pyarrow"`` in :func:`read_parquet` (:issue:`48957`)
 - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
 - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
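
A minimal usage sketch of the new keyword (the file name is hypothetical, and an Excel engine such as openpyxl is assumed to be installed):

    import pandas as pd

    # Read an Excel file into nullable extension dtypes (Int64, Float64,
    # boolean, string) instead of the default numpy dtypes.
    df = pd.read_excel("data.xlsx", use_nullable_dtypes=True)  # hypothetical file
    print(df.dtypes)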

pandas/_libs/lib.pyx

+3 −1

@@ -2370,7 +2370,7 @@ def maybe_convert_numeric(
 
     # This occurs since we disabled float nulls showing as null in anticipation
     # of seeing ints that were never seen. So then, we return float
-    if allow_null_in_int and seen.null_ and not seen.int_:
+    if allow_null_in_int and seen.null_ and not seen.int_ and not seen.bool_:
         seen.float_ = True
 
     if seen.complex_:
@@ -2390,6 +2390,8 @@ def maybe_convert_numeric(
         else:
            return (ints, None)
    elif seen.bool_:
+        if allow_null_in_int:
+            return (bools.view(np.bool_), mask.view(np.bool_))
        return (bools.view(np.bool_), None)
    elif seen.uint_:
        return (uints, None)
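
Roughly, the effect of this change: when masked (nullable) output is requested, a column of booleans that also contains missing values is no longer coerced to float; the booleans and their mask are returned so the column can become the nullable ``boolean`` dtype. A sketch of the user-visible behaviour, assuming a writable hypothetical path and an installed Excel engine:

    import pandas as pd

    # A boolean column with a missing cell round-trips as "boolean"
    # rather than being cast to float64.
    df = pd.DataFrame({"g": [None, True]})
    df.to_excel("bools.xlsx", index=False)  # hypothetical path

    result = pd.read_excel("bools.xlsx", use_nullable_dtypes=True)
    print(result["g"].dtype)  # boolean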

pandas/io/excel/_base.py

+15

@@ -271,6 +271,13 @@
 
     .. versionadded:: 1.2.0
 
+use_nullable_dtypes : bool, default False
+    Whether or not to use nullable dtypes as default when reading data. If
+    set to True, nullable dtypes are used for all dtypes that have a nullable
+    implementation, even if no nulls are present. Dtype takes precedence if given.
+
+    .. versionadded:: 2.0
+
 Returns
 -------
 DataFrame or dict of DataFrames
@@ -375,6 +382,7 @@ def read_excel(
     comment: str | None = ...,
     skipfooter: int = ...,
     storage_options: StorageOptions = ...,
+    use_nullable_dtypes: bool = ...,
 ) -> DataFrame:
     ...
 
@@ -413,6 +421,7 @@ def read_excel(
     comment: str | None = ...,
     skipfooter: int = ...,
     storage_options: StorageOptions = ...,
+    use_nullable_dtypes: bool = ...,
 ) -> dict[IntStrT, DataFrame]:
     ...
 
@@ -451,6 +460,7 @@ def read_excel(
     comment: str | None = None,
     skipfooter: int = 0,
     storage_options: StorageOptions = None,
+    use_nullable_dtypes: bool = False,
 ) -> DataFrame | dict[IntStrT, DataFrame]:
 
     should_close = False
@@ -487,6 +497,7 @@
             decimal=decimal,
             comment=comment,
             skipfooter=skipfooter,
+            use_nullable_dtypes=use_nullable_dtypes,
         )
     finally:
         # make sure to close opened file handles
@@ -690,6 +701,7 @@ def parse(
         decimal: str = ".",
         comment: str | None = None,
         skipfooter: int = 0,
+        use_nullable_dtypes: bool = False,
         **kwds,
     ):
 
@@ -848,6 +860,7 @@ def parse(
                 comment=comment,
                 skipfooter=skipfooter,
                 usecols=usecols,
+                use_nullable_dtypes=use_nullable_dtypes,
                 **kwds,
             )
 
@@ -1684,6 +1697,7 @@ def parse(
         thousands: str | None = None,
         comment: str | None = None,
         skipfooter: int = 0,
+        use_nullable_dtypes: bool = False,
         **kwds,
     ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
         """
@@ -1715,6 +1729,7 @@ def parse(
             thousands=thousands,
             comment=comment,
             skipfooter=skipfooter,
+            use_nullable_dtypes=use_nullable_dtypes,
             **kwds,
         )
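
As the new docstring notes, an explicit ``dtype`` still takes precedence over ``use_nullable_dtypes``. A small sketch of that interaction (hypothetical path, Excel engine assumed installed):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]})
    df.to_excel("mixed.xlsx", index=False)  # hypothetical path

    # dtype wins: the result keeps plain float64 with np.nan rather than
    # Float64 with pd.NA.
    result = pd.read_excel("mixed.xlsx", dtype="float64", use_nullable_dtypes=True)
    print(result.dtypes)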

pandas/io/parsers/base_parser.py

+4 −1

@@ -774,7 +774,10 @@ def _infer_types(
                 bool_mask = np.zeros(result.shape, dtype=np.bool_)
                 result = BooleanArray(result, bool_mask)
             elif result.dtype == np.object_ and use_nullable_dtypes:
-                result = StringDtype().construct_array_type()._from_sequence(values)
+                # read_excel sends array of datetime objects
+                inferred_type, _ = lib.infer_datetimelike_array(result)
+                if inferred_type != "datetime":
+                    result = StringDtype().construct_array_type()._from_sequence(values)
 
         return result, na_count
 
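
The guard above exists because the Excel readers hand ``_infer_types`` object arrays of ``datetime`` values; without it those columns would be wrapped in a nullable string array. A sketch of the intended outcome (hypothetical path, Excel engine assumed installed):

    import pandas as pd

    df = pd.DataFrame({"i": [pd.Timestamp("2019-12-31")] * 2})
    df.to_excel("dates.xlsx", index=False)  # hypothetical path

    result = pd.read_excel("dates.xlsx", use_nullable_dtypes=True)
    print(result["i"].dtype)  # datetime64[ns], not string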

pandas/tests/io/excel/test_readers.py

+82

@@ -21,6 +21,10 @@
     Series,
 )
 import pandas._testing as tm
+from pandas.core.arrays import (
+    ArrowStringArray,
+    StringArray,
+)
 
 read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
 engine_params = [
@@ -532,6 +536,84 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
         actual = pd.read_excel(basename + read_ext, dtype=dtype)
         tm.assert_frame_equal(actual, expected)
 
+    def test_use_nullable_dtypes(self, read_ext):
+        # GH#36712
+        if read_ext == ".xlsb":
+            pytest.skip("No engine for filetype: 'xlsb'")
+
+        df = DataFrame(
+            {
+                "a": Series([1, 3], dtype="Int64"),
+                "b": Series([2.5, 4.5], dtype="Float64"),
+                "c": Series([True, False], dtype="boolean"),
+                "d": Series(["a", "b"], dtype="string"),
+                "e": Series([pd.NA, 6], dtype="Int64"),
+                "f": Series([pd.NA, 7.5], dtype="Float64"),
+                "g": Series([pd.NA, True], dtype="boolean"),
+                "h": Series([pd.NA, "a"], dtype="string"),
+                "i": Series([pd.Timestamp("2019-12-31")] * 2),
+                "j": Series([pd.NA, pd.NA], dtype="Int64"),
+            }
+        )
+        with tm.ensure_clean(read_ext) as file_path:
+            df.to_excel(file_path, "test", index=False)
+            result = pd.read_excel(
+                file_path, sheet_name="test", use_nullable_dtypes=True
+            )
+        tm.assert_frame_equal(result, df)
+
+    def test_use_nullabla_dtypes_and_dtype(self, read_ext):
+        # GH#36712
+        if read_ext == ".xlsb":
+            pytest.skip("No engine for filetype: 'xlsb'")
+
+        df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]})
+        with tm.ensure_clean(read_ext) as file_path:
+            df.to_excel(file_path, "test", index=False)
+            result = pd.read_excel(
+                file_path, sheet_name="test", use_nullable_dtypes=True, dtype="float64"
+            )
+        tm.assert_frame_equal(result, df)
+
+    @td.skip_if_no("pyarrow")
+    @pytest.mark.parametrize("storage", ["pyarrow", "python"])
+    def test_use_nullabla_dtypes_string(self, read_ext, storage):
+        # GH#36712
+        if read_ext == ".xlsb":
+            pytest.skip("No engine for filetype: 'xlsb'")
+
+        import pyarrow as pa
+
+        with pd.option_context("mode.string_storage", storage):
+
+            df = DataFrame(
+                {
+                    "a": np.array(["a", "b"], dtype=np.object_),
+                    "b": np.array(["x", pd.NA], dtype=np.object_),
+                }
+            )
+            with tm.ensure_clean(read_ext) as file_path:
+                df.to_excel(file_path, "test", index=False)
+                result = pd.read_excel(
+                    file_path, sheet_name="test", use_nullable_dtypes=True
+                )
+
+            if storage == "python":
+                expected = DataFrame(
+                    {
+                        "a": StringArray(np.array(["a", "b"], dtype=np.object_)),
+                        "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
+                    }
+                )
+            else:
+                expected = DataFrame(
+                    {
+                        "a": ArrowStringArray(pa.array(["a", "b"])),
+                        "b": ArrowStringArray(pa.array(["x", None])),
+                    }
+                )
+            tm.assert_frame_equal(result, expected)
+
     @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
     def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
         # GH#35211
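
The parametrized string test relies on the ``mode.string_storage`` option; a sketch of the same idea outside the test harness (requires pyarrow, hypothetical file):

    import pandas as pd

    # With string storage set to "pyarrow", nullable string columns read from
    # Excel are backed by Arrow memory instead of a numpy object array.
    with pd.option_context("mode.string_storage", "pyarrow"):
        result = pd.read_excel("strings.xlsx", use_nullable_dtypes=True)  # hypothetical file
        print(result.dtypes)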
