Skip to content

Commit 7e5a95c

Browse files
authored
ENH: Add io.nullable_backend=pyarrow support to read_excel (#49965)
1 parent 9e6bbde commit 7e5a95c

File tree

3 files changed

+66
-10
lines changed

3 files changed

+66
-10
lines changed

doc/source/whatsnew/v2.0.0.rst

+23-6
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,26 @@ The available extras, found in the :ref:`installation guide<install.dependencies
2828
``[all, performance, computation, timezone, fss, aws, gcp, excel, parquet, feather, hdf5, spss, postgresql, mysql,
2929
sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` (:issue:`39164`).
3030

31-
.. _whatsnew_200.enhancements.io_readers_nullable_pyarrow:
31+
.. _whatsnew_200.enhancements.io_use_nullable_dtypes_and_nullable_backend:
3232

3333
Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions
3434
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3535

36-
A new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet`, :func:`read_orc` and :func:`read_csv` (with ``engine="pyarrow"``)
37-
to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`).
36+
The ``use_nullable_dtypes`` keyword argument has been expanded to the following functions to enable automatic conversion to nullable dtypes (:issue:`36712`)
37+
38+
* :func:`read_csv`
39+
* :func:`read_excel`
40+
41+
Additionally, a new global configuration, ``io.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
42+
to select the nullable dtypes implementation.
43+
44+
* :func:`read_csv` (with ``engine="pyarrow"``)
45+
* :func:`read_excel`
46+
* :func:`read_parquet`
47+
* :func:`read_orc`
48+
49+
By default, ``io.nullable_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also
50+
be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (:issue:`48957`).
3851

3952
.. ipython:: python
4053
@@ -43,10 +56,15 @@ to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`).
4356
1,2.5,True,a,,,,,
4457
3,4.5,False,b,6,7.5,True,a,
4558
""")
46-
with pd.option_context("io.nullable_backend", "pyarrow"):
47-
df = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
59+
with pd.option_context("io.nullable_backend", "pandas"):
60+
df = pd.read_csv(data, use_nullable_dtypes=True)
4861
df.dtypes
4962
63+
data.seek(0)
64+
with pd.option_context("io.nullable_backend", "pyarrow"):
65+
df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
66+
df_pyarrow.dtypes
67+
5068
.. _whatsnew_200.enhancements.other:
5169

5270
Other enhancements
@@ -55,7 +73,6 @@ Other enhancements
5573
- :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`)
5674
- :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
5775
- :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
58-
- Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`)
5976
- Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
6077
- Added support for extension array dtypes in :func:`merge` (:issue:`44240`)
6178
- Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)

pandas/io/parsers/base_parser.py

+15
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626

2727
import numpy as np
2828

29+
from pandas._config.config import get_option
30+
2931
from pandas._libs import (
3032
lib,
3133
parsers,
@@ -39,6 +41,7 @@
3941
DtypeObj,
4042
Scalar,
4143
)
44+
from pandas.compat._optional import import_optional_dependency
4245
from pandas.errors import (
4346
ParserError,
4447
ParserWarning,
@@ -71,6 +74,7 @@
7174
from pandas import StringDtype
7275
from pandas.core import algorithms
7376
from pandas.core.arrays import (
77+
ArrowExtensionArray,
7478
BooleanArray,
7579
Categorical,
7680
ExtensionArray,
@@ -710,6 +714,7 @@ def _infer_types(
710714
use_nullable_dtypes: Literal[True] | Literal[False] = (
711715
self.use_nullable_dtypes and no_dtype_specified
712716
)
717+
nullable_backend = get_option("io.nullable_backend")
713718
result: ArrayLike
714719

715720
if try_num_bool and is_object_dtype(values.dtype):
@@ -767,6 +772,16 @@ def _infer_types(
767772
if inferred_type != "datetime":
768773
result = StringDtype().construct_array_type()._from_sequence(values)
769774

775+
if use_nullable_dtypes and nullable_backend == "pyarrow":
776+
pa = import_optional_dependency("pyarrow")
777+
if isinstance(result, np.ndarray):
778+
result = ArrowExtensionArray(pa.array(result, from_pandas=True))
779+
else:
780+
# ExtensionArray
781+
result = ArrowExtensionArray(
782+
pa.array(result.to_numpy(), from_pandas=True)
783+
)
784+
770785
return result, na_count
771786

772787
def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:

pandas/tests/io/excel/test_readers.py

+28-4
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,11 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
536536
actual = pd.read_excel(basename + read_ext, dtype=dtype)
537537
tm.assert_frame_equal(actual, expected)
538538

539-
def test_use_nullable_dtypes(self, read_ext):
539+
@pytest.mark.parametrize(
540+
"nullable_backend",
541+
["pandas", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))],
542+
)
543+
def test_use_nullable_dtypes(self, read_ext, nullable_backend):
540544
# GH#36712
541545
if read_ext in (".xlsb", ".xls"):
542546
pytest.skip(f"No engine for filetype: '{read_ext}'")
@@ -557,10 +561,30 @@ def test_use_nullable_dtypes(self, read_ext):
557561
)
558562
with tm.ensure_clean(read_ext) as file_path:
559563
df.to_excel(file_path, "test", index=False)
560-
result = pd.read_excel(
561-
file_path, sheet_name="test", use_nullable_dtypes=True
564+
with pd.option_context("io.nullable_backend", nullable_backend):
565+
result = pd.read_excel(
566+
file_path, sheet_name="test", use_nullable_dtypes=True
567+
)
568+
if nullable_backend == "pyarrow":
569+
import pyarrow as pa
570+
571+
from pandas.arrays import ArrowExtensionArray
572+
573+
expected = DataFrame(
574+
{
575+
col: ArrowExtensionArray(pa.array(df[col], from_pandas=True))
576+
for col in df.columns
577+
}
562578
)
563-
tm.assert_frame_equal(result, df)
579+
# pyarrow by default infers timestamp resolution as us, not ns
580+
expected["i"] = ArrowExtensionArray(
581+
expected["i"].array._data.cast(pa.timestamp(unit="us"))
582+
)
583+
# pyarrow supports a null type, so don't have to default to Int64
584+
expected["j"] = ArrowExtensionArray(pa.array([None, None]))
585+
else:
586+
expected = df
587+
tm.assert_frame_equal(result, expected)
564588

565589
def test_use_nullabla_dtypes_and_dtype(self, read_ext):
566590
# GH#36712

0 commit comments

Comments
 (0)