From 2accefa695073c94a75ca7bd52e009cffd1bfc03 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 27 Mar 2023 17:48:30 -0400 Subject: [PATCH 1/2] BUG: Arrow setitem segfaults when len > 145 000 (#52075) * BUG: Arrow setitem segfaults when len > 145 000 * Add gh ref * Address review * Restrict to bool type (cherry picked from commit 10000db023208c1db0bba6a7d819bfe87dc49908) --- pandas/core/arrays/arrow/array.py | 4 ++++ pandas/tests/extension/test_arrow.py | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 81d8183a79bc1..8612f5a4718cd 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1618,6 +1618,10 @@ def _replace_with_mask( indices = pa.array(indices, type=pa.int64()) replacements = replacements.take(indices) return cls._if_else(mask, replacements, values) + if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type): + # GH#52059 replace_with_mask segfaults for chunked array + # https://github.com/apache/arrow/issues/34634 + values = values.combine_chunks() try: return pc.replace_with_mask(values, mask, replacements) except pa.ArrowNotImplementedError: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9a6d88f2adfe0..8b44358b9547f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2333,3 +2333,12 @@ def test_series_from_string_array(dtype): ser = pd.Series(arr, dtype=dtype) expected = pd.Series(ArrowExtensionArray(arr), dtype=dtype) tm.assert_series_equal(ser, expected) + + +def test_setitem_boolean_replace_with_mask_segfault(): + # GH#52059 + N = 145_000 + arr = ArrowExtensionArray(pa.chunked_array([np.ones((N,), dtype=np.bool_)])) + expected = arr.copy() + arr[np.zeros((N,), dtype=np.bool_)] = False + assert arr._pa_array == expected._pa_array From 
c2b1ed26062ca8af3f99c0334c6f18fca4ad7337 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 28 Mar 2023 14:22:21 -0700 Subject: [PATCH 2/2] TST: Compare _data instead of _pa_array in replace_with_mask segfault test --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 8b44358b9547f..4f38d1e6ed7dd 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2341,4 +2341,4 @@ def test_setitem_boolean_replace_with_mask_segfault(): arr = ArrowExtensionArray(pa.chunked_array([np.ones((N,), dtype=np.bool_)])) expected = arr.copy() arr[np.zeros((N,), dtype=np.bool_)] = False - assert arr._pa_array == expected._pa_array + assert arr._data == expected._data