From 8c7889232069fe5c54c29abca57f70f3b0e33483 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 19 Mar 2023 18:58:29 +0100 Subject: [PATCH 1/4] BUG: Arrow setitem segfaults when len > 145 000 --- pandas/core/arrays/arrow/array.py | 3 +++ pandas/tests/extension/test_arrow.py | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2313e28950de7..a4511998185ec 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1634,6 +1634,9 @@ def _replace_with_mask( indices = pa.array(indices, type=pa.int64()) replacements = replacements.take(indices) return cls._if_else(mask, replacements, values) + if isinstance(values, pa.ChunkedArray): + # 52059 replace_with_mask segfaults for chunked array + values = values.combine_chunks() try: return pc.replace_with_mask(values, mask, replacements) except pa.ArrowNotImplementedError: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 517626f8c2abb..a760f9fc96293 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2368,3 +2368,12 @@ def test_pickle_old_arrowextensionarray(): tm.assert_extension_array_equal(result, expected) assert result._pa_array == pa.chunked_array(data) assert not hasattr(result, "_data") + + +def test_setitem_boolean_replace_with_mask_segfault(): + # GH#52059 + N = 145_000 + arr = ArrowExtensionArray(pa.chunked_array([np.array([True] * N)])) + expected = arr.copy() + arr[np.array([False] * N)] = False + assert arr._pa_array == expected._pa_array From f5c7754ad0350fe3db44f576ff11250f633af865 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 19 Mar 2023 19:00:24 +0100 Subject: [PATCH 2/4] Add gh ref --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a4511998185ec..81a2209e0fa24 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1635,7 +1635,7 @@ def _replace_with_mask( replacements = replacements.take(indices) return cls._if_else(mask, replacements, values) if isinstance(values, pa.ChunkedArray): - # 52059 replace_with_mask segfaults for chunked array + # GH#52059 replace_with_mask segfaults for chunked array values = values.combine_chunks() try: return pc.replace_with_mask(values, mask, replacements) From e72007d4e433f15718241a70dd41ff2a8384b701 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 21 Mar 2023 09:41:04 +0100 Subject: [PATCH 3/4] Address review --- pandas/core/arrays/arrow/array.py | 1 + pandas/tests/extension/test_arrow.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 81a2209e0fa24..22c7ab034c269 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1636,6 +1636,7 @@ def _replace_with_mask( return cls._if_else(mask, replacements, values) if isinstance(values, pa.ChunkedArray): # GH#52059 replace_with_mask segfaults for chunked array + # https://github.com/apache/arrow/issues/34634 values = values.combine_chunks() try: return pc.replace_with_mask(values, mask, replacements) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a760f9fc96293..8da4972820d7a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2373,7 +2373,7 @@ def test_pickle_old_arrowextensionarray(): def test_setitem_boolean_replace_with_mask_segfault(): # GH#52059 N = 145_000 - arr = ArrowExtensionArray(pa.chunked_array([np.array([True] * N)])) + arr = ArrowExtensionArray(pa.chunked_array([np.ones((N,), dtype=np.bool_)])) expected = arr.copy() - arr[np.array([False] * N)] = False + arr[np.zeros((N,), dtype=np.bool_)] = False assert arr._pa_array == expected._pa_array From 7535639d0b574f6534bca8f1552fe12bf1e0234c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 21 Mar 2023 16:33:28 -0400 Subject: [PATCH 4/4] Restrict to bool type --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 22c7ab034c269..ab4cbcf99f470 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1634,7 +1634,7 @@ def _replace_with_mask( indices = pa.array(indices, type=pa.int64()) replacements = replacements.take(indices) return cls._if_else(mask, replacements, values) - if isinstance(values, pa.ChunkedArray): + if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type): # GH#52059 replace_with_mask segfaults for chunked array # https://github.com/apache/arrow/issues/34634 values = values.combine_chunks()