From d5c82b65f0568fe7423ffe7127c38f6b22c45f82 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 29 Nov 2022 16:15:30 -0800
Subject: [PATCH 1/3] ENH: Add io.nullable_backend=pyarrow support to read_excel

---
 doc/source/whatsnew/v2.0.0.rst        | 10 +++++----
 pandas/io/parsers/base_parser.py      | 15 +++++++++++++
 pandas/tests/io/excel/test_readers.py | 32 +++++++++++++++++++++++----
 3 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 9c65f10eb1d4c..46103265a09b7 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -28,13 +28,16 @@ The available extras, found in the :ref:`installation guide
 ArrayLike:
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index bff4c98fe2842..822e24b224052 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -536,7 +536,11 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
         actual = pd.read_excel(basename + read_ext, dtype=dtype)
         tm.assert_frame_equal(actual, expected)
 
-    def test_use_nullable_dtypes(self, read_ext):
+    @pytest.mark.parametrize(
+        "nullable_backend",
+        ["pandas", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))],
+    )
+    def test_use_nullable_dtypes(self, read_ext, nullable_backend):
         # GH#36712
         if read_ext in (".xlsb", ".xls"):
             pytest.skip(f"No engine for filetype: '{read_ext}'")
@@ -557,10 +561,30 @@ def test_use_nullable_dtypes(self, read_ext):
         )
         with tm.ensure_clean(read_ext) as file_path:
             df.to_excel(file_path, "test", index=False)
-            result = pd.read_excel(
-                file_path, sheet_name="test", use_nullable_dtypes=True
+            with pd.option_context("io.nullable_backend", nullable_backend):
+                result = pd.read_excel(
+                    file_path, sheet_name="test", use_nullable_dtypes=True
+                )
+        if nullable_backend == "pyarrow":
+            import pyarrow as pa
+
+            from pandas.arrays import ArrowExtensionArray
+
+            expected = DataFrame(
+                {
+                    col: ArrowExtensionArray(pa.array(df[col], from_pandas=True))
+                    for col in df.columns
+                }
             )
-        tm.assert_frame_equal(result, df)
+            # pyarrow by default infers timestamp resolution as us, not ns
+            expected["i"] = ArrowExtensionArray(
+                expected["i"].array._data.cast(pa.timestamp(unit="us"))
+            )
+            # pyarrow supports a null type, so don't have to default to Int64
+            expected["j"] = ArrowExtensionArray(pa.array([None, None]))
+        else:
+            expected = df
+        tm.assert_frame_equal(result, expected)
 
     def test_use_nullabla_dtypes_and_dtype(self, read_ext):
         # GH#36712

From b5d87aacc8dba05ad963294f9fc7d4b7041b4f6d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 1 Dec 2022 13:19:35 -0800
Subject: [PATCH 2/3] Address review for whatsnew

---
 doc/source/whatsnew/v2.0.0.rst | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index c52c5d14e5902..8b91b17786541 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -33,10 +33,20 @@ sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` (
 Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-The ``use_nullable_dtypes`` keyword argument has been expanded to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`)
+The ``use_nullable_dtypes`` keyword argument has been expanded to the following functions to enable automatic conversion to nullable dtypes (:issue:`36712`)
 
-Additionally a new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet`, :func:`read_orc`, :func:`read_excel` and :func:`read_csv` (with ``engine="pyarrow"``)
-to select the nullable dtypes implementation. By default, ``io.nullable_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also
+* :func:`read_csv`
+* :func:`read_excel`
+
+Additionally a new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
+to select the nullable dtypes implementation.
+
+* :func:`read_csv` (with ``engine="pyarrow"``)
+* :func:`read_excel`
+* :func:`read_parquet`
+* :func:`read_orc`
+
+By default, ``io.nullable_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also
 be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (:issue:`48957`).
 
 .. ipython:: python
@@ -46,10 +56,14 @@ be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (
     1,2.5,True,a,,,,,
     3,4.5,False,b,6,7.5,True,a,
     """)
-    with pd.option_context("io.nullable_backend", "pyarrow"):
-        df = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
+    with pd.option_context("io.nullable_backend", "pandas"):
+        df = pd.read_csv(data, use_nullable_dtypes=True)
     df.dtypes
 
+    with pd.option_context("io.nullable_backend", "pyarrow"):
+        df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
+    df_pyarrow.dtypes
+
 .. _whatsnew_200.enhancements.other:
 
 Other enhancements

From a1069493aafe925c28d6ddf05e9db2ca16e279bb Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 1 Dec 2022 15:27:12 -0800
Subject: [PATCH 3/3] Seek StringIO

---
 doc/source/whatsnew/v2.0.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 8b63e9a57a109..0c3d85cbcb620 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -60,6 +60,7 @@ be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (
         df = pd.read_csv(data, use_nullable_dtypes=True)
     df.dtypes
 
+    data.seek(0)
     with pd.option_context("io.nullable_backend", "pyarrow"):
        df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
     df_pyarrow.dtypes
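
For reference, a minimal usage sketch of the behavior this series enables, assuming a pandas development build that contains these commits (so ``io.nullable_backend`` and ``use_nullable_dtypes`` are available) plus ``openpyxl`` and ``pyarrow`` installed; the path ``demo.xlsx`` and the sample frame are illustrative only::

    # Sketch only: exercises the io.nullable_backend option added by this series.
    import pandas as pd

    df = pd.DataFrame(
        {"a": [1, None, 3], "b": [1.5, None, 2.5], "c": ["x", None, "z"]}
    )
    df.to_excel("demo.xlsx", index=False)  # needs an Excel writer such as openpyxl

    # Default backend ("pandas"): numpy-backed nullable extension dtypes.
    with pd.option_context("io.nullable_backend", "pandas"):
        result_pandas = pd.read_excel("demo.xlsx", use_nullable_dtypes=True)

    # "pyarrow" backend: columns come back as pyarrow-backed ArrowDtype.
    with pd.option_context("io.nullable_backend", "pyarrow"):
        result_pyarrow = pd.read_excel("demo.xlsx", use_nullable_dtypes=True)

    print(result_pandas.dtypes)
    print(result_pyarrow.dtypes)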