diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index cb8fcac38edce..3ffaaf706d636 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -93,7 +93,7 @@ def time_setitem(self, multiple_chunks):
         self.array[i] = "foo"
 
     def time_setitem_list(self, multiple_chunks):
-        indexer = list(range(0, 50)) + list(range(-50, 0))
+        indexer = list(range(0, 50)) + list(range(-1000, 0, 50))
         self.array[indexer] = ["foo"] * len(indexer)
 
     def time_setitem_slice(self, multiple_chunks):
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 3ac004ef335ac..c0d23d66fd87b 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -825,7 +825,7 @@ Performance improvements
 - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
 - Performance improvement in :meth:`~arrays.IntervalArray.from_tuples` (:issue:`50620`)
 - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
-- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` when key is a null slice (:issue:`50248`)
+- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` (:issue:`50248`, :issue:`50632`)
 - Performance improvement in :class:`~arrays.ArrowExtensionArray` comparison methods when array contains NA (:issue:`50524`)
 - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`)
 - Performance improvement when parsing strings to :class:`BooleanDtype` (:issue:`50613`)
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index de85ed67e7e8c..5ee93f242b7ea 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -19,6 +19,7 @@
     Iterator,
     NpDtype,
     PositionalIndexer,
+    Scalar,
     SortKind,
     TakeIndexer,
     npt,
@@ -26,6 +27,7 @@
 from pandas.compat import (
     pa_version_under6p0,
     pa_version_under7p0,
+    pa_version_under8p0,
     pa_version_under9p0,
 )
 from pandas.util._decorators import doc
@@ -36,6 +38,7 @@
     is_bool_dtype,
     is_integer,
     is_integer_dtype,
+    is_list_like,
     is_object_dtype,
     is_scalar,
 )
@@ -1034,76 +1037,56 @@ def __setitem__(self, key, value) -> None:
         key = check_array_indexer(self, key)
         value = self._maybe_convert_setitem_value(value)
 
-        # fast path (GH50248)
         if com.is_null_slice(key):
-            if is_scalar(value):
-                fill_value = pa.scalar(value, type=self._data.type, from_pandas=True)
-                try:
-                    self._data = pc.if_else(True, fill_value, self._data)
-                    return
-                except pa.ArrowNotImplementedError:
-                    # ArrowNotImplementedError: Function 'if_else' has no kernel
-                    # matching input types (bool, duration[ns], duration[ns])
-                    # TODO: remove try/except wrapper if/when pyarrow implements
-                    # a kernel for duration types.
-                    pass
-            elif len(value) == len(self):
-                if isinstance(value, type(self)) and value.dtype == self.dtype:
-                    self._data = value._data
-                else:
-                    arr = pa.array(value, type=self._data.type, from_pandas=True)
-                    self._data = pa.chunked_array([arr])
-                return
-
-        indices = self._indexing_key_to_indices(key)
-        argsort = np.argsort(indices)
-        indices = indices[argsort]
-
-        if is_scalar(value):
-            value = np.broadcast_to(value, len(self))
-        elif len(indices) != len(value):
-            raise ValueError("Length of indexer and values mismatch")
-        else:
-            value = np.asarray(value)[argsort]
+            # fast path (GH50248)
+            data = self._if_else(True, value, self._data)
 
-        self._data = self._set_via_chunk_iteration(indices=indices, value=value)
+        elif is_integer(key):
+            # fast path
+            key = cast(int, key)
+            n = len(self)
+            if key < 0:
+                key += n
+            if not 0 <= key < n:
+                raise IndexError(
+                    f"index {key} is out of bounds for axis 0 with size {n}"
+                )
+            if is_list_like(value):
+                raise ValueError("Length of indexer and values mismatch")
+            elif isinstance(value, pa.Scalar):
+                value = value.as_py()
+            chunks = [
+                *self._data[:key].chunks,
+                pa.array([value], type=self._data.type, from_pandas=True),
+                *self._data[key + 1 :].chunks,
+            ]
+            data = pa.chunked_array(chunks).combine_chunks()
 
-    def _indexing_key_to_indices(
-        self, key: int | slice | np.ndarray
-    ) -> npt.NDArray[np.intp]:
-        """
-        Convert indexing key for self into positional indices.
+        elif is_bool_dtype(key):
+            key = np.asarray(key, dtype=np.bool_)
+            data = self._replace_with_mask(self._data, key, value)
 
-        Parameters
-        ----------
-        key : int | slice | np.ndarray
+        elif is_scalar(value) or isinstance(value, pa.Scalar):
+            mask = np.zeros(len(self), dtype=np.bool_)
+            mask[key] = True
+            data = self._if_else(mask, value, self._data)
 
-        Returns
-        -------
-        npt.NDArray[np.intp]
-        """
-        n = len(self)
-        if isinstance(key, slice):
-            indices = np.arange(n)[key]
-        elif is_integer(key):
-            # error: Invalid index type "List[Union[int, ndarray[Any, Any]]]"
-            # for "ndarray[Any, dtype[signedinteger[Any]]]"; expected type
-            # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
-            # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union
-            # [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]
-            # , Tuple[Union[SupportsIndex, _SupportsArray[dtype[Union[bool_
-            # , integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union
-            # [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]], ...]]"
-            indices = np.arange(n)[[key]]  # type: ignore[index]
-        elif is_bool_dtype(key):
-            key = np.asarray(key)
-            if len(key) != n:
-                raise ValueError("Length of indexer and values mismatch")
-            indices = key.nonzero()[0]
         else:
-            key = np.asarray(key)
-            indices = np.arange(n)[key]
-        return indices
+            indices = np.arange(len(self))[key]
+            if len(indices) != len(value):
+                raise ValueError("Length of indexer and values mismatch")
+            if len(indices) == 0:
+                return
+            argsort = np.argsort(indices)
+            indices = indices[argsort]
+            value = value.take(argsort)
+            mask = np.zeros(len(self), dtype=np.bool_)
+            mask[indices] = True
+            data = self._replace_with_mask(self._data, mask, value)
+
+        if isinstance(data, pa.Array):
+            data = pa.chunked_array([data])
+        self._data = data
 
     def _rank(
         self,
@@ -1219,95 +1202,110 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra
     def _maybe_convert_setitem_value(self, value):
         """Maybe convert value to be pyarrow compatible."""
-        # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value
+        if value is None:
+            return value
+        if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
+            return value
+        if is_list_like(value):
+            pa_box = pa.array
+        else:
+            pa_box = pa.scalar
+        try:
+            value = pa_box(value, type=self._data.type, from_pandas=True)
+        except pa.ArrowTypeError as err:
+            msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
+            raise TypeError(msg) from err
         return value
 
-    def _set_via_chunk_iteration(
-        self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any]
-    ) -> pa.ChunkedArray:
+    @classmethod
+    def _if_else(
+        cls,
+        cond: npt.NDArray[np.bool_] | bool,
+        left: ArrayLike | Scalar,
+        right: ArrayLike | Scalar,
+    ):
         """
-        Loop through the array chunks and set the new values while
-        leaving the chunking layout unchanged.
+        Choose values based on a condition.
+
+        Analogous to pyarrow.compute.if_else, with logic
+        to fall back to numpy for unsupported types.
 
         Parameters
         ----------
-        indices : npt.NDArray[np.intp]
-            Position indices for the underlying ChunkedArray.
-
-        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
-            value or values to be set of ``key``.
+        cond : npt.NDArray[np.bool_] or bool
+        left : ArrayLike | Scalar
+        right : ArrayLike | Scalar
 
-        Notes
-        -----
-        Assumes that indices is sorted. Caller is responsible for sorting.
+        Returns
+        -------
+        pa.Array
         """
-        new_data = []
-        stop = 0
-        for chunk in self._data.iterchunks():
-            start, stop = stop, stop + len(chunk)
-            if len(indices) == 0 or stop <= indices[0]:
-                new_data.append(chunk)
+        try:
+            return pc.if_else(cond, left, right)
+        except pa.ArrowNotImplementedError:
+            pass
+
+        def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
+            if isinstance(value, (pa.Array, pa.ChunkedArray)):
+                pa_type = value.type
+            elif isinstance(value, pa.Scalar):
+                pa_type = value.type
+                value = value.as_py()
             else:
-                n = int(np.searchsorted(indices, stop, side="left"))
-                c_ind = indices[:n] - start
-                indices = indices[n:]
-                n = len(c_ind)
-                c_value, value = value[:n], value[n:]
-                new_data.append(self._replace_with_indices(chunk, c_ind, c_value))
-        return pa.chunked_array(new_data)
+                pa_type = None
+            return np.array(value, dtype=object), pa_type
+
+        left, left_type = _to_numpy_and_type(left)
+        right, right_type = _to_numpy_and_type(right)
+        pa_type = left_type or right_type
+        result = np.where(cond, left, right)
+        return pa.array(result, type=pa_type, from_pandas=True)
 
     @classmethod
-    def _replace_with_indices(
+    def _replace_with_mask(
         cls,
-        chunk: pa.Array,
-        indices: npt.NDArray[np.intp],
-        value: npt.NDArray[Any],
-    ) -> pa.Array:
+        values: pa.Array | pa.ChunkedArray,
+        mask: npt.NDArray[np.bool_] | bool,
+        replacements: ArrayLike | Scalar,
+    ):
         """
-        Replace items selected with a set of positional indices.
+        Replace items selected with a mask.
 
-        Analogous to pyarrow.compute.replace_with_mask, except that replacement
-        positions are identified via indices rather than a mask.
+        Analogous to pyarrow.compute.replace_with_mask, with logic
+        to fall back to numpy for unsupported types.
 
         Parameters
         ----------
-        chunk : pa.Array
-        indices : npt.NDArray[np.intp]
-        value : npt.NDArray[Any]
-            Replacement value(s).
+        values : pa.Array or pa.ChunkedArray
+        mask : npt.NDArray[np.bool_] or bool
+        replacements : ArrayLike or Scalar
+            Replacement value(s).
 
         Returns
         -------
-        pa.Array
+        pa.Array or pa.ChunkedArray
         """
-        n = len(indices)
-
-        if n == 0:
-            return chunk
-
-        start, stop = indices[[0, -1]]
-
-        if (stop - start) == (n - 1):
-            # fast path for a contiguous set of indices
-            arrays = [
-                chunk[:start],
-                pa.array(value, type=chunk.type, from_pandas=True),
-                chunk[stop + 1 :],
-            ]
-            arrays = [arr for arr in arrays if len(arr)]
-            if len(arrays) == 1:
-                return arrays[0]
-            return pa.concat_arrays(arrays)
-
-        mask = np.zeros(len(chunk), dtype=np.bool_)
-        mask[indices] = True
-
-        if pa_version_under6p0:
-            arr = chunk.to_numpy(zero_copy_only=False)
-            arr[mask] = value
-            return pa.array(arr, type=chunk.type)
-
-        if isna(value).all():
-            return pc.if_else(mask, None, chunk)
-
-        return pc.replace_with_mask(chunk, mask, value)
+        if isinstance(replacements, pa.ChunkedArray):
+            # replacements must be array or scalar, not ChunkedArray
+            replacements = replacements.combine_chunks()
+        if pa_version_under8p0:
+            # pc.replace_with_mask seems to be a bit unreliable for versions < 8.0:
+            #  version <= 7: segfaults with various types
+            #  version <= 6: fails to replace nulls
+            if isinstance(replacements, pa.Array):
+                indices = np.full(len(values), None)
+                indices[mask] = np.arange(len(replacements))
+                indices = pa.array(indices, type=pa.int64())
+                replacements = replacements.take(indices)
+            return cls._if_else(mask, replacements, values)
+        try:
+            return pc.replace_with_mask(values, mask, replacements)
+        except pa.ArrowNotImplementedError:
+            pass
+        if isinstance(replacements, pa.Array):
+            replacements = np.array(replacements, dtype=object)
+        elif isinstance(replacements, pa.Scalar):
+            replacements = replacements.as_py()
+        result = np.array(values, dtype=object)
+        result[mask] = replacements
+        return pa.array(result, type=values.type, from_pandas=True)
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index fb081d0e63c96..4aebe61412866 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -170,7 +170,7 @@ def _maybe_convert_setitem_value(self, value):
             for v in value:
                 if not (v is None or isinstance(v, str)):
                     raise ValueError("Scalar must be NA or str")
-        return value
+        return super()._maybe_convert_setitem_value(value)
 
     def isin(self, values) -> npt.NDArray[np.bool_]:
         value_set = [
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index 4f0c4daa3c64f..071f5cad725cf 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -172,7 +172,6 @@ def test_setitem(multiple_chunks, key, value, expected):
     result[key] = value
     tm.assert_equal(result, expected)
-    assert result._data.num_chunks == expected._data.num_chunks
 
 
 @skip_if_no_pyarrow
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index c02fa0aecdacc..3e606e7b466b3 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -1523,8 +1523,8 @@ def test_setitem_invalid_dtype(data):
     pa_type = data._data.type
     if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
         fill_value = 123
-        err = pa.ArrowTypeError
-        msg = "Expected bytes"
+        err = TypeError
+        msg = "Invalid value '123' for dtype"
     elif (
         pa.types.is_integer(pa_type)
         or pa.types.is_floating(pa_type)
@@ -1535,8 +1535,8 @@ def test_setitem_invalid_dtype(data):
         msg = "Could not convert"
     else:
         fill_value = "foo"
-        err = pa.ArrowTypeError
-        msg = "cannot be converted"
+        err = TypeError
+        msg = "Invalid value 'foo' for dtype"
     with pytest.raises(err, match=msg):
         data[:] = fill_value
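
The rewritten __setitem__ above dispatches on the key: a null slice broadcasts via _if_else, an integer key splices a one-element array into the chunk layout, a boolean mask goes straight to _replace_with_mask, and any other indexer is converted to sorted positional indices and then a mask. A minimal sketch of which branch each call pattern hits, assuming pandas 2.0 with pyarrow installed (values are illustrative only):

    import pandas as pd

    arr = pd.array([1, 2, 3, 4, 5], dtype="int64[pyarrow]")

    arr[:] = 0            # null slice  -> _if_else(True, value, data)
    arr[0] = 10           # integer key -> chunk splice + combine_chunks
    arr[arr == 0] = 7     # boolean mask -> _replace_with_mask
    arr[[1, 3]] = [8, 9]  # fancy indices -> sorted, turned into a mask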
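The test changes reflect a new error contract: _maybe_convert_setitem_value now boxes the value with pa.array/pa.scalar and re-raises pa.ArrowTypeError as TypeError. A sketch with hypothetical values:

    import pyarrow as pa
    import pandas as pd

    arr = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
    # Raises TypeError: Invalid value '123' for dtype string[pyarrow]
    arr[0] = 123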
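Finally, the pre-8.0 branch of _replace_with_mask relies on take() propagating null indices to expand the replacements to full length before selecting with _if_else. A standalone sketch of that trick, with made-up array contents:

    import numpy as np
    import pyarrow as pa
    import pyarrow.compute as pc

    values = pa.array([10, 20, 30, 40])
    mask = np.array([False, True, False, True])
    replacements = pa.array([21, 41])

    # Index array: null where mask is False, positions into
    # `replacements` where mask is True.
    indices = np.full(len(values), None)
    indices[mask] = np.arange(len(replacements))
    indices = pa.array(indices, type=pa.int64())

    # take() yields null for null indices, so `replacements` expands
    # to full length: [null, 21, null, 41] ...
    expanded = replacements.take(indices)

    # ... and if_else can then select elementwise between the arrays.
    result = pc.if_else(pa.array(mask), expanded, values)
    # -> [10, 21, 30, 41]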