
Commit 22e591f

ENH: Add use nullable dtypes to read_excel (#49091)

1 parent: 0bd52be

File tree

5 files changed: +105 −3 lines

doc/source/whatsnew/v2.0.0.rst

+1 −1

@@ -41,7 +41,7 @@ Other enhancements
 - :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`)
 - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
 - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
-- Added new argument ``use_nullable_dtypes`` to :func:`read_csv` to enable automatic conversion to nullable dtypes (:issue:`36712`)
+- Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`)
 - Added new global configuration, ``io.nullable_backend`` to allow ``use_nullable_dtypes=True`` to return pyarrow-backed dtypes when set to ``"pyarrow"`` in :func:`read_parquet` (:issue:`48957`)
 - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
 - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
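
A minimal usage sketch of the new keyword (the file name is hypothetical, and an Excel engine such as openpyxl is assumed to be installed):

    import pandas as pd

    # Read an Excel file into nullable extension dtypes (Int64, Float64,
    # boolean, string) instead of the default numpy dtypes.
    df = pd.read_excel("data.xlsx", use_nullable_dtypes=True)  # hypothetical file
    print(df.dtypes)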

pandas/_libs/lib.pyx

+3 −1

@@ -2370,7 +2370,7 @@ def maybe_convert_numeric(
 
     # This occurs since we disabled float nulls showing as null in anticipation
     # of seeing ints that were never seen. So then, we return float
-    if allow_null_in_int and seen.null_ and not seen.int_:
+    if allow_null_in_int and seen.null_ and not seen.int_ and not seen.bool_:
         seen.float_ = True
 
     if seen.complex_:
@@ -2390,6 +2390,8 @@ def maybe_convert_numeric(
         else:
            return (ints, None)
    elif seen.bool_:
+        if allow_null_in_int:
+            return (bools.view(np.bool_), mask.view(np.bool_))
        return (bools.view(np.bool_), None)
    elif seen.uint_:
        return (uints, None)
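
Roughly, the effect of this change: when masked (nullable) output is requested, a column of booleans that also contains missing values is no longer coerced to float; the booleans and their mask are returned so the column can become the nullable ``boolean`` dtype. A sketch of the user-visible behaviour, assuming a writable hypothetical path and an installed Excel engine:

    import pandas as pd

    # A boolean column with a missing cell round-trips as "boolean"
    # rather than being cast to float64.
    df = pd.DataFrame({"g": [None, True]})
    df.to_excel("bools.xlsx", index=False)  # hypothetical path

    result = pd.read_excel("bools.xlsx", use_nullable_dtypes=True)
    print(result["g"].dtype)  # boolean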

pandas/io/excel/_base.py

+15

@@ -271,6 +271,13 @@
 
     .. versionadded:: 1.2.0
 
+use_nullable_dtypes : bool, default False
+    Whether or not to use nullable dtypes as default when reading data. If
+    set to True, nullable dtypes are used for all dtypes that have a nullable
+    implementation, even if no nulls are present. Dtype takes precedence if given.
+
+    .. versionadded:: 2.0
+
 Returns
 -------
 DataFrame or dict of DataFrames
@@ -375,6 +382,7 @@ def read_excel(
     comment: str | None = ...,
     skipfooter: int = ...,
     storage_options: StorageOptions = ...,
+    use_nullable_dtypes: bool = ...,
 ) -> DataFrame:
     ...
 
@@ -413,6 +421,7 @@ def read_excel(
     comment: str | None = ...,
     skipfooter: int = ...,
     storage_options: StorageOptions = ...,
+    use_nullable_dtypes: bool = ...,
 ) -> dict[IntStrT, DataFrame]:
     ...
 
@@ -451,6 +460,7 @@ def read_excel(
     comment: str | None = None,
     skipfooter: int = 0,
     storage_options: StorageOptions = None,
+    use_nullable_dtypes: bool = False,
 ) -> DataFrame | dict[IntStrT, DataFrame]:
 
     should_close = False
@@ -487,6 +497,7 @@
             decimal=decimal,
             comment=comment,
             skipfooter=skipfooter,
+            use_nullable_dtypes=use_nullable_dtypes,
         )
     finally:
         # make sure to close opened file handles
@@ -690,6 +701,7 @@ def parse(
         decimal: str = ".",
         comment: str | None = None,
         skipfooter: int = 0,
+        use_nullable_dtypes: bool = False,
         **kwds,
     ):
 
@@ -848,6 +860,7 @@ def parse(
                 comment=comment,
                 skipfooter=skipfooter,
                 usecols=usecols,
+                use_nullable_dtypes=use_nullable_dtypes,
                 **kwds,
             )
 
@@ -1684,6 +1697,7 @@ def parse(
         thousands: str | None = None,
         comment: str | None = None,
         skipfooter: int = 0,
+        use_nullable_dtypes: bool = False,
         **kwds,
     ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
         """
@@ -1715,6 +1729,7 @@ def parse(
             thousands=thousands,
             comment=comment,
             skipfooter=skipfooter,
+            use_nullable_dtypes=use_nullable_dtypes,
             **kwds,
         )
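
As the new docstring notes, an explicit ``dtype`` still takes precedence over ``use_nullable_dtypes``. A small sketch of that interaction (hypothetical path, Excel engine assumed installed):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]})
    df.to_excel("mixed.xlsx", index=False)  # hypothetical path

    # dtype wins: the result keeps plain float64 with np.nan rather than
    # Float64 with pd.NA.
    result = pd.read_excel("mixed.xlsx", dtype="float64", use_nullable_dtypes=True)
    print(result.dtypes)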

pandas/io/parsers/base_parser.py

+4 −1

@@ -774,7 +774,10 @@ def _infer_types(
                 bool_mask = np.zeros(result.shape, dtype=np.bool_)
                 result = BooleanArray(result, bool_mask)
             elif result.dtype == np.object_ and use_nullable_dtypes:
-                result = StringDtype().construct_array_type()._from_sequence(values)
+                # read_excel sends array of datetime objects
+                inferred_type, _ = lib.infer_datetimelike_array(result)
+                if inferred_type != "datetime":
+                    result = StringDtype().construct_array_type()._from_sequence(values)
 
         return result, na_count
 
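
The guard above exists because the Excel readers hand ``_infer_types`` object arrays of ``datetime`` values; without it those columns would be wrapped in a nullable string array. A sketch of the intended outcome (hypothetical path, Excel engine assumed installed):

    import pandas as pd

    df = pd.DataFrame({"i": [pd.Timestamp("2019-12-31")] * 2})
    df.to_excel("dates.xlsx", index=False)  # hypothetical path

    result = pd.read_excel("dates.xlsx", use_nullable_dtypes=True)
    print(result["i"].dtype)  # datetime64[ns], not string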

pandas/tests/io/excel/test_readers.py

+82

@@ -21,6 +21,10 @@
     Series,
 )
 import pandas._testing as tm
+from pandas.core.arrays import (
+    ArrowStringArray,
+    StringArray,
+)
 
 read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
 engine_params = [
@@ -532,6 +536,84 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
         actual = pd.read_excel(basename + read_ext, dtype=dtype)
         tm.assert_frame_equal(actual, expected)
 
+    def test_use_nullable_dtypes(self, read_ext):
+        # GH#36712
+        if read_ext == ".xlsb":
+            pytest.skip("No engine for filetype: 'xlsb'")
+
+        df = DataFrame(
+            {
+                "a": Series([1, 3], dtype="Int64"),
+                "b": Series([2.5, 4.5], dtype="Float64"),
+                "c": Series([True, False], dtype="boolean"),
+                "d": Series(["a", "b"], dtype="string"),
+                "e": Series([pd.NA, 6], dtype="Int64"),
+                "f": Series([pd.NA, 7.5], dtype="Float64"),
+                "g": Series([pd.NA, True], dtype="boolean"),
+                "h": Series([pd.NA, "a"], dtype="string"),
+                "i": Series([pd.Timestamp("2019-12-31")] * 2),
+                "j": Series([pd.NA, pd.NA], dtype="Int64"),
+            }
+        )
+        with tm.ensure_clean(read_ext) as file_path:
+            df.to_excel(file_path, "test", index=False)
+            result = pd.read_excel(
+                file_path, sheet_name="test", use_nullable_dtypes=True
+            )
+        tm.assert_frame_equal(result, df)
+
+    def test_use_nullabla_dtypes_and_dtype(self, read_ext):
+        # GH#36712
+        if read_ext == ".xlsb":
+            pytest.skip("No engine for filetype: 'xlsb'")
+
+        df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]})
+        with tm.ensure_clean(read_ext) as file_path:
+            df.to_excel(file_path, "test", index=False)
+            result = pd.read_excel(
+                file_path, sheet_name="test", use_nullable_dtypes=True, dtype="float64"
+            )
+        tm.assert_frame_equal(result, df)
+
+    @td.skip_if_no("pyarrow")
+    @pytest.mark.parametrize("storage", ["pyarrow", "python"])
+    def test_use_nullabla_dtypes_string(self, read_ext, storage):
+        # GH#36712
+        if read_ext == ".xlsb":
+            pytest.skip("No engine for filetype: 'xlsb'")
+
+        import pyarrow as pa
+
+        with pd.option_context("mode.string_storage", storage):
+
+            df = DataFrame(
+                {
+                    "a": np.array(["a", "b"], dtype=np.object_),
+                    "b": np.array(["x", pd.NA], dtype=np.object_),
+                }
+            )
+            with tm.ensure_clean(read_ext) as file_path:
+                df.to_excel(file_path, "test", index=False)
+                result = pd.read_excel(
+                    file_path, sheet_name="test", use_nullable_dtypes=True
+                )
+
+            if storage == "python":
+                expected = DataFrame(
+                    {
+                        "a": StringArray(np.array(["a", "b"], dtype=np.object_)),
+                        "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
+                    }
+                )
+            else:
+                expected = DataFrame(
+                    {
+                        "a": ArrowStringArray(pa.array(["a", "b"])),
+                        "b": ArrowStringArray(pa.array(["x", None])),
+                    }
+                )
+            tm.assert_frame_equal(result, expected)
+
     @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
     def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
         # GH#35211
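
The parametrized string test relies on the ``mode.string_storage`` option; a sketch of the same idea outside the test harness (requires pyarrow, hypothetical file):

    import pandas as pd

    # With string storage set to "pyarrow", nullable string columns read from
    # Excel are backed by Arrow memory instead of a numpy object array.
    with pd.option_context("mode.string_storage", "pyarrow"):
        result = pd.read_excel("strings.xlsx", use_nullable_dtypes=True)  # hypothetical file
        print(result.dtypes)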
