diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index 924040ff0648b..1ed7481d7b2f5 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -90,6 +90,9 @@ def time_setitem_list(self, multiple_chunks):
     def time_setitem_slice(self, multiple_chunks):
         self.array[::10] = "foo"
 
+    def time_setitem_null_slice(self, multiple_chunks):
+        self.array[:] = "foo"
+
     def time_tolist(self, multiple_chunks):
         self.array.tolist()
 
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index cbad169fe4d56..7e92724254f9e 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -748,6 +748,7 @@ Performance improvements
 - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
 - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
 - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
+- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` when key is a null slice (:issue:`50248`)
 - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`)
 - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
 - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index d698c5eb11751..606c975c32928 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -40,6 +40,7 @@
 
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays.base import ExtensionArray
+import pandas.core.common as com
 from pandas.core.indexers import (
     check_array_indexer,
     unpack_tuple_and_ellipses,
@@ -931,9 +932,31 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
         None
         """
         key = check_array_indexer(self, key)
-        indices = self._indexing_key_to_indices(key)
         value = self._maybe_convert_setitem_value(value)
 
+        # fast path (GH50248)
+        if com.is_null_slice(key):
+            if is_scalar(value):
+                fill_value = pa.scalar(value, type=self._data.type, from_pandas=True)
+                try:
+                    self._data = pc.if_else(True, fill_value, self._data)
+                    return
+                except pa.ArrowNotImplementedError:
+                    # ArrowNotImplementedError: Function 'if_else' has no kernel
+                    #   matching input types (bool, duration[ns], duration[ns])
+                    # TODO: remove try/except wrapper if/when pyarrow implements
+                    #   a kernel for duration types.
+                    pass
+            elif len(value) == len(self):
+                if isinstance(value, type(self)) and value.dtype == self.dtype:
+                    self._data = value._data
+                else:
+                    arr = pa.array(value, type=self._data.type, from_pandas=True)
+                    self._data = pa.chunked_array([arr])
+                return
+
+        indices = self._indexing_key_to_indices(key)
+
         argsort = np.argsort(indices)
         indices = indices[argsort]
 
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index c36b129f919e8..f93cf3d6bc138 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -1438,3 +1438,48 @@ def test_to_numpy_with_defaults(data):
     expected[pd.isna(data)] = pd.NA
 
     tm.assert_numpy_array_equal(result, expected)
+
+
+def test_setitem_null_slice(data):
+    # GH50248
+    orig = data.copy()
+
+    result = orig.copy()
+    result[:] = data[0]
+    expected = ArrowExtensionArray(
+        pa.array([data[0]] * len(data), type=data._data.type)
+    )
+    tm.assert_extension_array_equal(result, expected)
+
+    result = orig.copy()
+    result[:] = data[::-1]
+    expected = data[::-1]
+    tm.assert_extension_array_equal(result, expected)
+
+    result = orig.copy()
+    result[:] = data.tolist()
+    expected = data
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_setitem_invalid_dtype(data):
+    # GH50248
+    pa_type = data._data.type
+    if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
+        fill_value = 123
+        err = pa.ArrowTypeError
+        msg = "Expected bytes"
+    elif (
+        pa.types.is_integer(pa_type)
+        or pa.types.is_floating(pa_type)
+        or pa.types.is_boolean(pa_type)
+    ):
+        fill_value = "foo"
+        err = pa.ArrowInvalid
+        msg = "Could not convert"
+    else:
+        fill_value = "foo"
+        err = pa.ArrowTypeError
+        msg = "cannot be converted"
+    with pytest.raises(err, match=msg):
+        data[:] = fill_value
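
Illustrative usage sketch (not part of the patch): the snippet below exercises the null-slice assignments that the new fast path targets, assuming a pandas build that includes this change and a working pyarrow installation; the array size and timing harness are arbitrary.

```python
import timeit

import pandas as pd
import pyarrow as pa

# pd.array(..., dtype=pd.ArrowDtype(...)) constructs a pyarrow-backed
# ArrowExtensionArray.
arr = pd.array(["a", "b", "c"] * 100_000, dtype=pd.ArrowDtype(pa.string()))


def assign_null_slice():
    # Scalar assigned to a null slice: handled by the fast path, which replaces
    # the whole backing ChunkedArray via pc.if_else instead of materializing
    # integer indices.
    arr[:] = "foo"


def assign_strided_slice():
    # Non-null slice: still takes the generic path (indices are computed and
    # sorted before assignment).
    arr[::10] = "foo"


print("null slice:   ", timeit.timeit(assign_null_slice, number=100))
print("strided slice:", timeit.timeit(assign_strided_slice, number=100))
```

A list-like value of matching length (e.g. `arr[:] = other`) also takes the fast path: an ArrowExtensionArray of the same dtype has its backing ChunkedArray adopted directly, while any other sequence is converted once with `pa.array` and wrapped in a new ChunkedArray.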