From 017803faa16fd496364b7f88345bc270d761f632 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 9 Nov 2021 11:55:02 -0600 Subject: [PATCH 1/9] BUG: BooleanDtype compatibility with pyarrow types_mapper parameter --- pandas/core/arrays/boolean.py | 5 ++--- pandas/tests/extension/test_boolean.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 58e7abbbe1ddd..167587826b61d 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -113,9 +113,8 @@ def _is_boolean(self) -> bool: def _is_numeric(self) -> bool: return True - def __from_arrow__( - self, array: pyarrow.Array | pyarrow.ChunkedArray - ) -> BooleanArray: + @staticmethod + def __from_arrow__(array: pyarrow.Array | pyarrow.ChunkedArray) -> BooleanArray: """ Construct BooleanArray from pyarrow Array/ChunkedArray. """ diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 05455905860d2..e3fbcea835815 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -397,3 +397,21 @@ class TestParsing(base.BaseParsingTests): class Test2DCompat(base.Dim2CompatTests): pass + + +def test_from_arrow(): + pyarrow = pytest.importorskip("pyarrow") + + def types_mapper(arrow_type): + if pyarrow.types.is_boolean(arrow_type): + return BooleanDtype + + pyarrow_array = pyarrow.array([True, None, False], type=pyarrow.bool_()) + expected = pd.Series([True, None, False], dtype="boolean") + + # Convert to RecordBatch because types_mapper argument is ignored when + # using a pyarrow.Array. https://issues.apache.org/jira/browse/ARROW-9664 + record_batch = pyarrow.RecordBatch.from_arrays([pyarrow_array], ["test_col"]) + dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper) + series = dataframe["test_col"] + tm.assert_series_equal(series, expected, check_names=False) From 325c0bff0cb06def368a49fffdad07da0aa283db Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 9 Nov 2021 11:58:52 -0600 Subject: [PATCH 2/9] add whatsnew --- doc/source/whatsnew/v1.3.5.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.5.rst b/doc/source/whatsnew/v1.3.5.rst index 589092c0dd7e3..f237157baa45e 100644 --- a/doc/source/whatsnew/v1.3.5.rst +++ b/doc/source/whatsnew/v1.3.5.rst @@ -24,7 +24,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fix :class:`BooleanDtype` when used with ``pyarrow`` ``types_mapper`` parameter (:issue:`44368`) - .. --------------------------------------------------------------------------- From 88d69723490abfb72c7391c570e09e5e898be6d2 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 9 Nov 2021 13:37:02 -0600 Subject: [PATCH 3/9] use dtype fixture --- pandas/tests/extension/test_boolean.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index e3fbcea835815..362443ff20a3c 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -399,12 +399,12 @@ class Test2DCompat(base.Dim2CompatTests): pass -def test_from_arrow(): +def test_from_arrow(dtype): pyarrow = pytest.importorskip("pyarrow") def types_mapper(arrow_type): if pyarrow.types.is_boolean(arrow_type): - return BooleanDtype + return dtype pyarrow_array = pyarrow.array([True, None, False], type=pyarrow.bool_()) expected = pd.Series([True, None, False], dtype="boolean") From 49ad892dc29a4a5c463ec49178f062b2bb5a8c39 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 9 Nov 2021 13:40:02 -0600 Subject: [PATCH 4/9] really use dtype fixture --- pandas/tests/extension/test_boolean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 362443ff20a3c..31f6e61ff5d0c 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -407,7 +407,7 @@ def types_mapper(arrow_type): return dtype pyarrow_array = pyarrow.array([True, None, False], type=pyarrow.bool_()) - expected = pd.Series([True, None, False], dtype="boolean") + expected = pd.Series([True, None, False], dtype=dtype.name) # Convert to RecordBatch because types_mapper argument is ignored when # using a pyarrow.Array. https://issues.apache.org/jira/browse/ARROW-9664 From 09fe48edd6134274a89cacf5dbff0d46c40c6e20 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 9 Nov 2021 14:02:46 -0600 Subject: [PATCH 5/9] revert dtype changes --- doc/source/whatsnew/v1.3.5.rst | 2 +- pandas/core/arrays/boolean.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.3.5.rst b/doc/source/whatsnew/v1.3.5.rst index f237157baa45e..589092c0dd7e3 100644 --- a/doc/source/whatsnew/v1.3.5.rst +++ b/doc/source/whatsnew/v1.3.5.rst @@ -24,7 +24,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Fix :class:`BooleanDtype` when used with ``pyarrow`` ``types_mapper`` parameter (:issue:`44368`) +- - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 167587826b61d..58e7abbbe1ddd 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -113,8 +113,9 @@ def _is_boolean(self) -> bool: def _is_numeric(self) -> bool: return True - @staticmethod - def __from_arrow__(array: pyarrow.Array | pyarrow.ChunkedArray) -> BooleanArray: + def __from_arrow__( + self, array: pyarrow.Array | pyarrow.ChunkedArray + ) -> BooleanArray: """ Construct BooleanArray from pyarrow Array/ChunkedArray. """ From ece974cacc1374ce0b98945b96e98d9166588b9d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 9 Nov 2021 16:33:22 -0600 Subject: [PATCH 6/9] remove unnecessary date_as_object --- pandas/tests/extension/test_boolean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 31f6e61ff5d0c..3d2f916c8e484 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -412,6 +412,6 @@ def types_mapper(arrow_type): # Convert to RecordBatch because types_mapper argument is ignored when # using a pyarrow.Array. https://issues.apache.org/jira/browse/ARROW-9664 record_batch = pyarrow.RecordBatch.from_arrays([pyarrow_array], ["test_col"]) - dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper) + dataframe = record_batch.to_pandas(types_mapper=types_mapper) series = dataframe["test_col"] tm.assert_series_equal(series, expected, check_names=False) From 2c80f74611c83216cfaf20d04aa146a1ec2272d7 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 10 Nov 2021 12:41:46 -0600 Subject: [PATCH 7/9] move tests to test_arrow_compat.py --- .../tests/arrays/masked/test_arrow_compat.py | 40 +++++++++++++++++++ pandas/tests/extension/test_boolean.py | 18 --------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index d66a603ad568c..6ca392a164c23 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -20,6 +20,22 @@ def data(request): return request.param +@pytest.fixture( + params=[ + pd.Int8Dtype, + pd.Int16Dtype, + pd.Int32Dtype, + pd.Int64Dtype, + pd.UInt8Dtype, + pd.UInt16Dtype, + pd.UInt32Dtype, + pd.UInt64Dtype, + ] +) +def int_dtype(request): + return request.param() + + def test_arrow_array(data): arr = pa.array(data) expected = pa.array( @@ -39,6 +55,30 @@ def test_arrow_roundtrip(data): tm.assert_frame_equal(result, df) +@td.skip_if_no("pyarrow") +def test_dataframe_from_arrow_type_mapper(int_dtype): + pyarrow = pytest.importorskip("pyarrow", minversion="3.0.0") + + def types_mapper(arrow_type): + if pyarrow.types.is_boolean(arrow_type): + return pd.BooleanDtype() + elif pyarrow.types.is_integer(arrow_type): + return int_dtype + + bools_array = pyarrow.array([True, None, False], type=pyarrow.bool_()) + ints_array = pyarrow.array([1, None, 2], type=pyarrow.int64()) + record_batch = pyarrow.RecordBatch.from_arrays( + [bools_array, ints_array], ["bools", "ints"] + ) + result = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper) + assert result["bools"].dtype == "boolean" + assert result["ints"].dtype == int_dtype + bools = pd.Series([True, None, False], dtype="boolean") + ints = pd.Series([1, None, 2], dtype=int_dtype.name) + expected = pd.DataFrame({"bools": bools, "ints": ints}) + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("pyarrow") def test_arrow_load_from_zero_chunks(data): # GH-41040 diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 31f6e61ff5d0c..05455905860d2 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -397,21 +397,3 @@ class TestParsing(base.BaseParsingTests): class Test2DCompat(base.Dim2CompatTests): pass - - -def test_from_arrow(dtype): - pyarrow = pytest.importorskip("pyarrow") - - def types_mapper(arrow_type): - if pyarrow.types.is_boolean(arrow_type): - return dtype - - pyarrow_array = pyarrow.array([True, None, False], type=pyarrow.bool_()) - expected = pd.Series([True, None, False], dtype=dtype.name) - - # Convert to RecordBatch because types_mapper argument is ignored when - # using a pyarrow.Array. https://issues.apache.org/jira/browse/ARROW-9664 - record_batch = pyarrow.RecordBatch.from_arrays([pyarrow_array], ["test_col"]) - dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper) - series = dataframe["test_col"] - tm.assert_series_equal(series, expected, check_names=False) From da2873fbe7e850b882d27e5260c6cb03438822ac Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 10 Nov 2021 14:12:18 -0600 Subject: [PATCH 8/9] cleanup tests --- pandas/tests/arrays/masked/test_arrow_compat.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 6ca392a164c23..d055568f751c9 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -70,9 +70,7 @@ def types_mapper(arrow_type): record_batch = pyarrow.RecordBatch.from_arrays( [bools_array, ints_array], ["bools", "ints"] ) - result = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper) - assert result["bools"].dtype == "boolean" - assert result["ints"].dtype == int_dtype + result = record_batch.to_pandas(types_mapper=types_mapper) bools = pd.Series([True, None, False], dtype="boolean") ints = pd.Series([1, None, 2], dtype=int_dtype.name) expected = pd.DataFrame({"bools": bools, "ints": ints}) From 31b0065ebf9be6ba38d52563b70de3a52f0c91a2 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 30 Nov 2021 17:12:44 -0600 Subject: [PATCH 9/9] simplify types_mapper tests --- pandas/tests/arrays/masked/test_arrow_compat.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index b919b478fc23d..20eb055f14835 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -3,7 +3,6 @@ import pandas as pd import pandas._testing as tm -from pandas.api.types import pandas_dtype pa = pytest.importorskip("pyarrow", minversion="1.0.1") @@ -37,24 +36,24 @@ def test_arrow_roundtrip(data): tm.assert_frame_equal(result, df) -def test_dataframe_from_arrow_types_mapper(any_int_ea_dtype): - int_dtype = pandas_dtype(any_int_ea_dtype) - +def test_dataframe_from_arrow_types_mapper(): def types_mapper(arrow_type): if pa.types.is_boolean(arrow_type): return pd.BooleanDtype() elif pa.types.is_integer(arrow_type): - return int_dtype + return pd.Int64Dtype() bools_array = pa.array([True, None, False], type=pa.bool_()) ints_array = pa.array([1, None, 2], type=pa.int64()) + small_ints_array = pa.array([-1, 0, 7], type=pa.int8()) record_batch = pa.RecordBatch.from_arrays( - [bools_array, ints_array], ["bools", "ints"] + [bools_array, ints_array, small_ints_array], ["bools", "ints", "small_ints"] ) result = record_batch.to_pandas(types_mapper=types_mapper) bools = pd.Series([True, None, False], dtype="boolean") - ints = pd.Series([1, None, 2], dtype=int_dtype.name) - expected = pd.DataFrame({"bools": bools, "ints": ints}) + ints = pd.Series([1, None, 2], dtype="Int64") + small_ints = pd.Series([-1, 0, 7], dtype="Int64") + expected = pd.DataFrame({"bools": bools, "ints": ints, "small_ints": small_ints}) tm.assert_frame_equal(result, expected)