From 4e4b36d470ebd61d0a1d3f63430d8f5482b01a90 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 13 Dec 2022 18:08:00 -0500 Subject: [PATCH 1/7] perf: ArrowExtensionArray.__setitem__(null_slice) --- asv_bench/benchmarks/array.py | 3 +++ doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 22 ++++++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index cb949637ea745..fa9f5476feafa 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -90,5 +90,8 @@ def time_setitem_list(self, multiple_chunks): def time_setitem_slice(self, multiple_chunks): self.array[::10] = "foo" + def time_setitem_null_slice(self, multiple_chunks): + self.array[:] = "foo" + def time_tolist(self, multiple_chunks): self.array.tolist() diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d6e0bb2ae0830..a450576256959 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -668,6 +668,7 @@ Performance improvements - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`) - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) +- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` when key is a null slice (:issue:`#####`) - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`) - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`). diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 254ff8894b36c..b296a3a393a77 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -37,6 +37,7 @@ from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray +import pandas.core.common as com from pandas.core.indexers import ( check_array_indexer, unpack_tuple_and_ellipses, @@ -896,6 +897,27 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: ------- None """ + # fast path (GH#####) + if com.is_null_slice(key): + if is_scalar(value) and not pa_version_under6p0: + fill_value = pa.scalar(value, type=self._data.type, from_pandas=True) + try: + self._data = pc.if_else(True, fill_value, self._data) + return + except pa.ArrowNotImplementedError: + # ArrowNotImplementedError: Function 'if_else' has no kernel + # matching input types (bool, duration[ns], duration[ns]) + # TODO: remove try/except wrapper if/when pyarrow implements + # a kernel for duration types. + pass + elif len(value) == len(self): + if isinstance(value, type(self)) and value.dtype == self.dtype: + self._data = value._data + else: + arr = pa.array(value, type=self._data.type, from_pandas=True) + self._data = pa.chunked_array([arr]) + return + key = check_array_indexer(self, key) indices = self._indexing_key_to_indices(key) value = self._maybe_convert_setitem_value(value) From ba454d5c1699585a11612a4a1026f0edf78e98ba Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 13 Dec 2022 19:16:36 -0500 Subject: [PATCH 2/7] gh refs --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/arrays/arrow/array.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index a450576256959..dc98145314af0 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -668,7 +668,7 @@ Performance improvements - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`) - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) -- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` when key is a null slice (:issue:`#####`) +- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` when key is a null slice (:issue:`50248`) - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`) - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`). diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b296a3a393a77..fb61264b62f3b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -897,7 +897,7 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: ------- None """ - # fast path (GH#####) + # fast path (GH50248) if com.is_null_slice(key): if is_scalar(value) and not pa_version_under6p0: fill_value = pa.scalar(value, type=self._data.type, from_pandas=True) From b8d005ac6f7d2521d307fbaff85bb133bd8fc0aa Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 13 Dec 2022 20:03:36 -0500 Subject: [PATCH 3/7] fix test --- pandas/core/arrays/arrow/array.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index fb61264b62f3b..7b8844827ce97 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -897,6 +897,9 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: ------- None """ + key = check_array_indexer(self, key) + value = self._maybe_convert_setitem_value(value) + # fast path (GH50248) if com.is_null_slice(key): if is_scalar(value) and not pa_version_under6p0: @@ -918,9 +921,7 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: self._data = pa.chunked_array([arr]) return - key = check_array_indexer(self, key) indices = self._indexing_key_to_indices(key) - value = self._maybe_convert_setitem_value(value) argsort = np.argsort(indices) indices = indices[argsort] From 6c49682c4e0b3419690e8a5791b5cf8a98384b8e Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 15 Dec 2022 21:51:20 -0500 Subject: [PATCH 4/7] add test for setitem null slice paths --- pandas/tests/extension/test_arrow.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e6f1675bb8bc8..c5c82222d236f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1406,3 +1406,25 @@ def test_astype_from_non_pyarrow(data): assert not isinstance(pd_array.dtype, ArrowDtype) assert isinstance(result.dtype, ArrowDtype) tm.assert_extension_array_equal(result, data) + + +def test_setitem_null_slice(data): + # GH50248 + orig = data.copy() + + result = orig.copy() + result[:] = data[0] + expected = ArrowExtensionArray( + pa.array([data[0]] * len(data), type=data._data.type) + ) + tm.assert_extension_array_equal(result, expected) + + result = orig.copy() + result[:] = data[::-1] + expected = data[::-1] + tm.assert_extension_array_equal(result, expected) + + result = orig.copy() + result[:] = data.tolist() + expected = data + tm.assert_extension_array_equal(result, expected) From 2d817e7943d3b2d187a34608e9ddd71bea2a78c1 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 16 Dec 2022 21:46:56 -0500 Subject: [PATCH 5/7] add test --- pandas/tests/extension/test_arrow.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c5c82222d236f..e819e6f55a668 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1428,3 +1428,14 @@ def test_setitem_null_slice(data): result[:] = data.tolist() expected = data tm.assert_extension_array_equal(result, expected) + + +def test_setitem_invalid_dtype(data): + # GH50248 + pa_type = data._data.type + if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): + fill_value = 123 + else: + fill_value = "foo" + with pytest.raises(pa.ArrowInvalid, match="Could not convert"): + data[:] = fill_value From c61f1aa8a4a3a81f5af6b044c1c7b75558b7b4ad Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 16 Dec 2022 21:48:57 -0500 Subject: [PATCH 6/7] remove version check --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7b8844827ce97..46910f521cfe2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -902,7 +902,7 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: # fast path (GH50248) if com.is_null_slice(key): - if is_scalar(value) and not pa_version_under6p0: + if is_scalar(value): fill_value = pa.scalar(value, type=self._data.type, from_pandas=True) try: self._data = pc.if_else(True, fill_value, self._data) From 16e67b2fff343f918fafba56ee7d09b5657cc7db Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 17 Dec 2022 06:51:33 -0500 Subject: [PATCH 7/7] fix text --- pandas/tests/extension/test_arrow.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fbe4bead4276b..f93cf3d6bc138 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1467,7 +1467,19 @@ def test_setitem_invalid_dtype(data): pa_type = data._data.type if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): fill_value = 123 + err = pa.ArrowTypeError + msg = "Expected bytes" + elif ( + pa.types.is_integer(pa_type) + or pa.types.is_floating(pa_type) + or pa.types.is_boolean(pa_type) + ): + fill_value = "foo" + err = pa.ArrowInvalid + msg = "Could not convert" else: fill_value = "foo" - with pytest.raises(pa.ArrowInvalid, match="Could not convert"): + err = pa.ArrowTypeError + msg = "cannot be converted" + with pytest.raises(err, match=msg): data[:] = fill_value