Skip to content

Commit 7e5a95c

Browse files
authored
ENH: Add io.nullable_backend=pyarrow support to read_excel (#49965)
1 parent 9e6bbde commit 7e5a95c

File tree

3 files changed

+66
-10
lines changed

3 files changed

+66
-10
lines changed

doc/source/whatsnew/v2.0.0.rst

+23-6
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,26 @@ The available extras, found in the :ref:`installation guide<install.dependencies
2828
``[all, performance, computation, timezone, fss, aws, gcp, excel, parquet, feather, hdf5, spss, postgresql, mysql,
2929
sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` (:issue:`39164`).
3030

31-
.. _whatsnew_200.enhancements.io_readers_nullable_pyarrow:
31+
.. _whatsnew_200.enhancements.io_use_nullable_dtypes_and_nullable_backend:
3232

3333
Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions
3434
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3535

36-
A new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet`, :func:`read_orc` and :func:`read_csv` (with ``engine="pyarrow"``)
37-
to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`).
36+
The ``use_nullable_dtypes`` keyword argument has been expanded to the following functions to enable automatic conversion to nullable dtypes (:issue:`36712`)
37+
38+
* :func:`read_csv`
39+
* :func:`read_excel`
40+
41+
Additionally, a new global configuration, ``io.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
42+
to select the nullable dtypes implementation.
43+
44+
* :func:`read_csv` (with ``engine="pyarrow"``)
45+
* :func:`read_excel`
46+
* :func:`read_parquet`
47+
* :func:`read_orc`
48+
49+
By default, ``io.nullable_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also
50+
be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (:issue:`48957`).
3851

3952
.. ipython:: python
4053
@@ -43,10 +56,15 @@ to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`).
4356
1,2.5,True,a,,,,,
4457
3,4.5,False,b,6,7.5,True,a,
4558
""")
46-
with pd.option_context("io.nullable_backend", "pyarrow"):
47-
df = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
59+
with pd.option_context("io.nullable_backend", "pandas"):
60+
df = pd.read_csv(data, use_nullable_dtypes=True)
4861
df.dtypes
4962
63+
data.seek(0)
64+
with pd.option_context("io.nullable_backend", "pyarrow"):
65+
df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
66+
df_pyarrow.dtypes
67+
5068
.. _whatsnew_200.enhancements.other:
5169

5270
Other enhancements
@@ -55,7 +73,6 @@ Other enhancements
5573
- :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`)
5674
- :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
5775
- :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
58-
- Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`)
5976
- Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
6077
- Added support for extension array dtypes in :func:`merge` (:issue:`44240`)
6178
- Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)

pandas/io/parsers/base_parser.py

+15
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626

2727
import numpy as np
2828

29+
from pandas._config.config import get_option
30+
2931
from pandas._libs import (
3032
lib,
3133
parsers,
@@ -39,6 +41,7 @@
3941
DtypeObj,
4042
Scalar,
4143
)
44+
from pandas.compat._optional import import_optional_dependency
4245
from pandas.errors import (
4346
ParserError,
4447
ParserWarning,
@@ -71,6 +74,7 @@
7174
from pandas import StringDtype
7275
from pandas.core import algorithms
7376
from pandas.core.arrays import (
77+
ArrowExtensionArray,
7478
BooleanArray,
7579
Categorical,
7680
ExtensionArray,
@@ -710,6 +714,7 @@ def _infer_types(
710714
use_nullable_dtypes: Literal[True] | Literal[False] = (
711715
self.use_nullable_dtypes and no_dtype_specified
712716
)
717+
nullable_backend = get_option("io.nullable_backend")
713718
result: ArrayLike
714719

715720
if try_num_bool and is_object_dtype(values.dtype):
@@ -767,6 +772,16 @@ def _infer_types(
767772
if inferred_type != "datetime":
768773
result = StringDtype().construct_array_type()._from_sequence(values)
769774

775+
if use_nullable_dtypes and nullable_backend == "pyarrow":
776+
pa = import_optional_dependency("pyarrow")
777+
if isinstance(result, np.ndarray):
778+
result = ArrowExtensionArray(pa.array(result, from_pandas=True))
779+
else:
780+
# ExtensionArray
781+
result = ArrowExtensionArray(
782+
pa.array(result.to_numpy(), from_pandas=True)
783+
)
784+
770785
return result, na_count
771786

772787
def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:

pandas/tests/io/excel/test_readers.py

+28-4
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,11 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
536536
actual = pd.read_excel(basename + read_ext, dtype=dtype)
537537
tm.assert_frame_equal(actual, expected)
538538

539-
def test_use_nullable_dtypes(self, read_ext):
539+
@pytest.mark.parametrize(
540+
"nullable_backend",
541+
["pandas", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))],
542+
)
543+
def test_use_nullable_dtypes(self, read_ext, nullable_backend):
540544
# GH#36712
541545
if read_ext in (".xlsb", ".xls"):
542546
pytest.skip(f"No engine for filetype: '{read_ext}'")
@@ -557,10 +561,30 @@ def test_use_nullable_dtypes(self, read_ext):
557561
)
558562
with tm.ensure_clean(read_ext) as file_path:
559563
df.to_excel(file_path, "test", index=False)
560-
result = pd.read_excel(
561-
file_path, sheet_name="test", use_nullable_dtypes=True
564+
with pd.option_context("io.nullable_backend", nullable_backend):
565+
result = pd.read_excel(
566+
file_path, sheet_name="test", use_nullable_dtypes=True
567+
)
568+
if nullable_backend == "pyarrow":
569+
import pyarrow as pa
570+
571+
from pandas.arrays import ArrowExtensionArray
572+
573+
expected = DataFrame(
574+
{
575+
col: ArrowExtensionArray(pa.array(df[col], from_pandas=True))
576+
for col in df.columns
577+
}
562578
)
563-
tm.assert_frame_equal(result, df)
579+
# pyarrow by default infers timestamp resolution as us, not ns
580+
expected["i"] = ArrowExtensionArray(
581+
expected["i"].array._data.cast(pa.timestamp(unit="us"))
582+
)
583+
# pyarrow supports a null type, so don't have to default to Int64
584+
expected["j"] = ArrowExtensionArray(pa.array([None, None]))
585+
else:
586+
expected = df
587+
tm.assert_frame_equal(result, expected)
564588

565589
def test_use_nullabla_dtypes_and_dtype(self, read_ext):
566590
# GH#36712

0 commit comments

Comments
 (0)