From c86c06e218595b5ddf6bc2d973648a670fd7576f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 5 Apr 2023 23:40:49 +0200 Subject: [PATCH 1/5] BUG: segfault for null dtype in to_numpy --- doc/source/whatsnew/v2.0.1.rst | 2 +- pandas/core/arrays/arrow/array.py | 9 ++++++++- pandas/tests/extension/test_arrow.py | 10 ++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 0122c84ba2a8e..4dc7b30bdcab1 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -20,7 +20,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) .. --------------------------------------------------------------------------- .. _whatsnew_201.other: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f76fe166dba78..124edc8e1ada8 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1044,6 +1044,11 @@ def to_numpy( result = np.empty(len(self), dtype=object) mask = ~self.isna() result[mask] = np.asarray(self[mask]._pa_array) + elif pa.types.is_null(self._pa_array.type): + data = np.asarray(self._pa_array, dtype=dtype) + if not isna(na_value): + data[:] = na_value + return data elif self._hasna: data = self.copy() data[self.isna()] = na_value @@ -1634,7 +1639,9 @@ def _replace_with_mask( indices = pa.array(indices, type=pa.int64()) replacements = replacements.take(indices) return cls._if_else(mask, replacements, values) - if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type): + if isinstance(values, pa.ChunkedArray) and ( + pa.types.is_boolean(values.type) or pa.types.is_null(values.type) + ): # GH#52059 replace_with_mask segfaults for chunked array # https://github.com/apache/arrow/issues/34634 values = values.combine_chunks() diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index df470d85a4fad..cf68a9df5c5a5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -27,6 +27,7 @@ import numpy as np import pytest +from pandas._libs import lib from pandas.compat import ( PY311, is_ci_environment, @@ -1676,6 +1677,15 @@ def test_to_numpy_int_with_na(): tm.assert_numpy_array_equal(result, expected) +@pytest.mark.parametrize("na_val, exp", [(lib.no_default, np.nan), (1, 1)]) +def test_to_numpy_null_array(na_val, exp): + # GH#52443 + arr = pd.array([pd.NA, pd.NA], dtype="null[pyarrow]") + result = arr.to_numpy(dtype="float64", na_value=na_val) + expected = np.array([exp] * 2, dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + def test_setitem_null_slice(data): # GH50248 orig = data.copy() From ef0cdc1601f6d2df49854fd379a001a5fce321d6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 6 Apr 2023 01:12:19 +0200 Subject: [PATCH 2/5] Add test fix mypy --- pandas/core/arrays/arrow/array.py | 6 +++--- pandas/tests/extension/test_arrow.py | 12 ++++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 124edc8e1ada8..987e2ebc2f828 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1045,10 +1045,10 @@ def to_numpy( mask = ~self.isna() result[mask] = np.asarray(self[mask]._pa_array) elif pa.types.is_null(self._pa_array.type): - data = np.asarray(self._pa_array, dtype=dtype) + result = np.asarray(self._pa_array, dtype=dtype) if not isna(na_value): - data[:] = na_value - return data + result[:] = na_value + return result elif self._hasna: data = self.copy() data[self.isna()] = na_value diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index cf68a9df5c5a5..a2fa2ea2d9de2 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1678,14 +1678,22 @@ def test_to_numpy_int_with_na(): @pytest.mark.parametrize("na_val, exp", [(lib.no_default, np.nan), (1, 1)]) -def test_to_numpy_null_array(na_val, exp): +def test_to_numpy_null_array(na_val, exp, dtype): # GH#52443 arr = pd.array([pd.NA, pd.NA], dtype="null[pyarrow]") - result = arr.to_numpy(dtype="float64", na_value=na_val) + result = arr.to_numpy(dtype=dtype, na_value=na_val) expected = np.array([exp] * 2, dtype="float64") tm.assert_numpy_array_equal(result, expected) +def test_to_numpy_null_array_no_dtype(): + # GH#52443 + arr = pd.array([pd.NA, pd.NA], dtype="null[pyarrow]") + result = arr.to_numpy(dtype=None) + expected = np.array([pd.NA] * 2, dtype="object") + tm.assert_numpy_array_equal(result, expected) + + def test_setitem_null_slice(data): # GH50248 orig = data.copy() From a11f8b53c5244e397c8f0c152f2133290540a19a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 6 Apr 2023 01:36:22 +0200 Subject: [PATCH 3/5] Fix --- pandas/tests/extension/test_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a2fa2ea2d9de2..182458e87c566 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1678,10 +1678,10 @@ def test_to_numpy_int_with_na(): @pytest.mark.parametrize("na_val, exp", [(lib.no_default, np.nan), (1, 1)]) -def test_to_numpy_null_array(na_val, exp, dtype): +def test_to_numpy_null_array(na_val, exp): # GH#52443 arr = pd.array([pd.NA, pd.NA], dtype="null[pyarrow]") - result = arr.to_numpy(dtype=dtype, na_value=na_val) + result = arr.to_numpy(dtype="float64", na_value=na_val) expected = np.array([exp] * 2, dtype="float64") tm.assert_numpy_array_equal(result, expected) From 87807edd58eea61b170de87818cda0de90ddc637 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 6 Apr 2023 10:34:53 +0200 Subject: [PATCH 4/5] Update v2.0.1.rst --- doc/source/whatsnew/v2.0.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index d49419c5f24a9..2c8a901cffd87 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -20,8 +20,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) - Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`) +- Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) .. --------------------------------------------------------------------------- .. _whatsnew_201.other: From 8ff572fae76e61cf9898abe874a5bf859ffa2526 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 6 Apr 2023 10:59:38 +0200 Subject: [PATCH 5/5] Update array.py --- pandas/core/arrays/arrow/array.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 987e2ebc2f828..5859a6a8b556b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1639,9 +1639,7 @@ def _replace_with_mask( indices = pa.array(indices, type=pa.int64()) replacements = replacements.take(indices) return cls._if_else(mask, replacements, values) - if isinstance(values, pa.ChunkedArray) and ( - pa.types.is_boolean(values.type) or pa.types.is_null(values.type) - ): + if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type): # GH#52059 replace_with_mask segfaults for chunked array # https://github.com/apache/arrow/issues/34634 values = values.combine_chunks()