From cac4c2ae4b40ceb56a23f3666b26ee5de3c4b17a Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 8 Jan 2023 15:23:43 -0500 Subject: [PATCH 1/7] REF/PERF: ArrowExtensionArray.__setitem__ --- pandas/core/arrays/arrow/array.py | 278 ++++++++++++--------------- pandas/tests/extension/test_arrow.py | 8 +- 2 files changed, 129 insertions(+), 157 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index de85ed67e7e8c..75dea3f2d9629 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -19,6 +19,7 @@ Iterator, NpDtype, PositionalIndexer, + Scalar, SortKind, TakeIndexer, npt, @@ -26,6 +27,7 @@ from pandas.compat import ( pa_version_under6p0, pa_version_under7p0, + pa_version_under8p0, pa_version_under9p0, ) from pandas.util._decorators import doc @@ -36,6 +38,7 @@ is_bool_dtype, is_integer, is_integer_dtype, + is_list_like, is_object_dtype, is_scalar, ) @@ -586,23 +589,7 @@ def fillna( f" expected {len(self)}" ) - def convert_fill_value(value, pa_type, dtype): - if value is None: - return value - if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)): - return value - if is_array_like(value): - pa_box = pa.array - else: - pa_box = pa.scalar - try: - value = pa_box(value, type=pa_type, from_pandas=True) - except pa.ArrowTypeError as err: - msg = f"Invalid value '{str(value)}' for dtype {dtype}" - raise TypeError(msg) from err - return value - - fill_value = convert_fill_value(value, self._data.type, self.dtype) + fill_value = self._maybe_convert_setitem_value(value) try: if method is None: @@ -1034,76 +1021,54 @@ def __setitem__(self, key, value) -> None: key = check_array_indexer(self, key) value = self._maybe_convert_setitem_value(value) - # fast path (GH50248) if com.is_null_slice(key): - if is_scalar(value): - fill_value = pa.scalar(value, type=self._data.type, from_pandas=True) - try: - self._data = pc.if_else(True, fill_value, self._data) - return - except pa.ArrowNotImplementedError: - # ArrowNotImplementedError: Function 'if_else' has no kernel - # matching input types (bool, duration[ns], duration[ns]) - # TODO: remove try/except wrapper if/when pyarrow implements - # a kernel for duration types. 
- pass - elif len(value) == len(self): - if isinstance(value, type(self)) and value.dtype == self.dtype: - self._data = value._data - else: - arr = pa.array(value, type=self._data.type, from_pandas=True) - self._data = pa.chunked_array([arr]) - return - - indices = self._indexing_key_to_indices(key) - argsort = np.argsort(indices) - indices = indices[argsort] - - if is_scalar(value): - value = np.broadcast_to(value, len(self)) - elif len(indices) != len(value): - raise ValueError("Length of indexer and values mismatch") - else: - value = np.asarray(value)[argsort] + # fast path (GH50248) + data = self._if_else(True, value, self._data) - self._data = self._set_via_chunk_iteration(indices=indices, value=value) + elif is_integer(key): + # fast path + key = cast(int, key) + n = len(self) + if not (-n <= key < n): + raise IndexError( + f"index {key} is out of bounds for axis 0 with size {n}" + ) + if is_list_like(value): + raise ValueError("Length of indexer and values mismatch") + elif isinstance(value, pa.Scalar): + value = value.as_py() + chunks = [ + *self._data[:key].chunks, + pa.array([value], type=self._data.type, from_pandas=True), + *self._data[key + 1 :].chunks, + ] + data = pa.chunked_array(chunks).combine_chunks() - def _indexing_key_to_indices( - self, key: int | slice | np.ndarray - ) -> npt.NDArray[np.intp]: - """ - Convert indexing key for self into positional indices. + elif is_bool_dtype(key): + key = np.asarray(key, dtype=np.bool_) + data = self._replace_with_mask(self._data, key, value) - Parameters - ---------- - key : int | slice | np.ndarray + elif is_scalar(value) or isinstance(value, pa.Scalar): + mask = np.zeros(len(self), dtype=np.bool_) + mask[key] = True + data = self._if_else(mask, value, self._data) - Returns - ------- - npt.NDArray[np.intp] - """ - n = len(self) - if isinstance(key, slice): - indices = np.arange(n)[key] - elif is_integer(key): - # error: Invalid index type "List[Union[int, ndarray[Any, Any]]]" - # for "ndarray[Any, dtype[signedinteger[Any]]]"; expected type - # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_, - # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union - # [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]] - # , Tuple[Union[SupportsIndex, _SupportsArray[dtype[Union[bool_ - # , integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union - # [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]], ...]]" - indices = np.arange(n)[[key]] # type: ignore[index] - elif is_bool_dtype(key): - key = np.asarray(key) - if len(key) != n: - raise ValueError("Length of indexer and values mismatch") - indices = key.nonzero()[0] else: - key = np.asarray(key) - indices = np.arange(n)[key] - return indices + indices = np.arange(len(self))[key] + if len(indices) != len(value): + raise ValueError("Length of indexer and values mismatch") + if len(indices) == 0: + return + argsort = np.argsort(indices) + indices = indices[argsort] + value = value.take(argsort) + mask = np.zeros(len(self), dtype=np.bool_) + mask[indices] = True + data = self._replace_with_mask(self._data, mask, value) + + if isinstance(data, pa.Array): + data = pa.chunked_array([data]) + self._data = data def _rank( self, @@ -1219,95 +1184,102 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra def _maybe_convert_setitem_value(self, value): """Maybe convert value to be pyarrow compatible.""" - # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value + if value is None: + return value + if isinstance(value, 
(pa.Scalar, pa.Array, pa.ChunkedArray)): + return value + if is_list_like(value): + pa_box = pa.array + else: + pa_box = pa.scalar + try: + value = pa_box(value, type=self._data.type, from_pandas=True) + except pa.ArrowTypeError as err: + msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + raise TypeError(msg) from err return value - def _set_via_chunk_iteration( - self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any] - ) -> pa.ChunkedArray: - """ - Loop through the array chunks and set the new values while - leaving the chunking layout unchanged. - - Parameters - ---------- - indices : npt.NDArray[np.intp] - Position indices for the underlying ChunkedArray. - - value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object - value or values to be set of ``key``. - - Notes - ----- - Assumes that indices is sorted. Caller is responsible for sorting. - """ - new_data = [] - stop = 0 - for chunk in self._data.iterchunks(): - start, stop = stop, stop + len(chunk) - if len(indices) == 0 or stop <= indices[0]: - new_data.append(chunk) - else: - n = int(np.searchsorted(indices, stop, side="left")) - c_ind = indices[:n] - start - indices = indices[n:] - n = len(c_ind) - c_value, value = value[:n], value[n:] - new_data.append(self._replace_with_indices(chunk, c_ind, c_value)) - return pa.chunked_array(new_data) - @classmethod - def _replace_with_indices( + def _if_else( cls, - chunk: pa.Array, - indices: npt.NDArray[np.intp], - value: npt.NDArray[Any], - ) -> pa.Array: + cond: npt.NDArray[np.bool_] | bool, + left: ArrayLike | Scalar, + right: ArrayLike | Scalar, + ): """ - Replace items selected with a set of positional indices. + Choose values based on a condition. - Analogous to pyarrow.compute.replace_with_mask, except that replacement - positions are identified via indices rather than a mask. + Analogous to pyarrow.compute.if_else, with logic + to fallback to numpy for unsupported types. Parameters ---------- - chunk : pa.Array - indices : npt.NDArray[np.intp] - value : npt.NDArray[Any] - Replacement value(s). + cond : npt.NDArray[np.bool_] or bool + left : ArrayLike | Scalar + right : ArrayLike | Scalar Returns ------- pa.Array """ - n = len(indices) - - if n == 0: - return chunk + try: + return pc.if_else(cond, left, right) + except pa.ArrowNotImplementedError: - start, stop = indices[[0, -1]] + def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]: + if isinstance(value, (pa.Array, pa.ChunkedArray)): + pa_type = value.type + elif isinstance(value, pa.Scalar): + pa_type = value.type + value = value.as_py() + else: + pa_type = None + return np.array(value, dtype=object), pa_type - if (stop - start) == (n - 1): - # fast path for a contiguous set of indices - arrays = [ - chunk[:start], - pa.array(value, type=chunk.type, from_pandas=True), - chunk[stop + 1 :], - ] - arrays = [arr for arr in arrays if len(arr)] - if len(arrays) == 1: - return arrays[0] - return pa.concat_arrays(arrays) + left, left_type = _to_numpy_and_type(left) + right, right_type = _to_numpy_and_type(right) + pa_type = left_type or right_type + result = np.where(cond, left, right) + return pa.array(result, type=pa_type, from_pandas=True) - mask = np.zeros(len(chunk), dtype=np.bool_) - mask[indices] = True + @classmethod + def _replace_with_mask( + cls, + values: pa.Array | pa.ChunkedArray, + mask: npt.NDArray[np.bool_] | bool, + replacements: ArrayLike | Scalar, + ): + """ + Replace items selected with a mask. 
- if pa_version_under6p0: - arr = chunk.to_numpy(zero_copy_only=False) - arr[mask] = value - return pa.array(arr, type=chunk.type) + Analogous to pyarrow.compute.replace_with_mask, with logic + to fallback to numpy for unsupported types. - if isna(value).all(): - return pc.if_else(mask, None, chunk) + Parameters + ---------- + values : pa.Array or pa.ChunkedArray + mask : npt.NDArray[np.bool_] or bool + replacements : ArrayLike or Scalar + Replacement value(s) - return pc.replace_with_mask(chunk, mask, value) + Returns + ------- + pa.Array or pa.ChunkedArray + """ + if isinstance(values, pa.ChunkedArray) and pa_version_under8p0: + # pc.replace_with_mask segfaults on chunked arrays for + # version 7 and below + values = values.combine_chunks() + if isinstance(replacements, pa.ChunkedArray): + # replacements must be array or scalar, not ChunkedArray + replacements = replacements.combine_chunks() + try: + return pc.replace_with_mask(values, mask, replacements) + except pa.ArrowNotImplementedError: + if isinstance(replacements, (pa.Array, pa.ChunkedArray)): + replacements = np.array(replacements, dtype=object) + elif isinstance(replacements, pa.Scalar): + replacements = replacements.as_py() + result = np.array(values, dtype=object) + result[mask] = replacements + return pa.array(result, type=values.type, from_pandas=True) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c02fa0aecdacc..3e606e7b466b3 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1523,8 +1523,8 @@ def test_setitem_invalid_dtype(data): pa_type = data._data.type if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): fill_value = 123 - err = pa.ArrowTypeError - msg = "Expected bytes" + err = TypeError + msg = "Invalid value '123' for dtype" elif ( pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type) @@ -1535,8 +1535,8 @@ def test_setitem_invalid_dtype(data): msg = "Could not convert" else: fill_value = "foo" - err = pa.ArrowTypeError - msg = "cannot be converted" + err = TypeError + msg = "Invalid value 'foo' for dtype" with pytest.raises(err, match=msg): data[:] = fill_value From 91049c371a6505ba30940ef565127251ab1883b7 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 8 Jan 2023 15:37:46 -0500 Subject: [PATCH 2/7] update asv --- asv_bench/benchmarks/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index cb8fcac38edce..3ffaaf706d636 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -93,7 +93,7 @@ def time_setitem(self, multiple_chunks): self.array[i] = "foo" def time_setitem_list(self, multiple_chunks): - indexer = list(range(0, 50)) + list(range(-50, 0)) + indexer = list(range(0, 50)) + list(range(-1000, 0, 50)) self.array[indexer] = ["foo"] * len(indexer) def time_setitem_slice(self, multiple_chunks): From e01e6e14dbbbbf044e89721506afb25d292232d3 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 8 Jan 2023 15:52:10 -0500 Subject: [PATCH 3/7] whatsnew --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3ebb13e82c447..f24687d6c4f0a 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -779,7 +779,7 @@ Performance improvements - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` 
(:issue:`49109`) - Performance improvement in :meth:`~arrays.IntervalArray.from_tuples` (:issue:`50620`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) -- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` when key is a null slice (:issue:`50248`) +- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` (:issue:`50248`, :issue:`50632`) - Performance improvement in :class:`~arrays.ArrowExtensionArray` comparison methods when array contains NA (:issue:`50524`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`) - Performance improvement when parsing strings to :class:`BooleanDtype` (:issue:`50613`) From a205bf74ec53337720f98215bb247b60e9f4d93b Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 8 Jan 2023 19:59:47 -0500 Subject: [PATCH 4/7] fixes --- pandas/core/arrays/arrow/array.py | 22 +++++++++++++++++-- .../tests/arrays/string_/test_string_arrow.py | 1 - 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 75dea3f2d9629..375f8a04449a4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -589,7 +589,23 @@ def fillna( f" expected {len(self)}" ) - fill_value = self._maybe_convert_setitem_value(value) + def convert_fill_value(value, pa_type, dtype): + if value is None: + return value + if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)): + return value + if is_array_like(value): + pa_box = pa.array + else: + pa_box = pa.scalar + try: + value = pa_box(value, type=pa_type, from_pandas=True) + except pa.ArrowTypeError as err: + msg = f"Invalid value '{str(value)}' for dtype {dtype}" + raise TypeError(msg) from err + return value + + fill_value = convert_fill_value(value, self._data.type, self.dtype) try: if method is None: @@ -1029,7 +1045,9 @@ def __setitem__(self, key, value) -> None: # fast path key = cast(int, key) n = len(self) - if not (-n <= key < n): + if key < 0: + key += n + if not 0 <= key < n: raise IndexError( f"index {key} is out of bounds for axis 0 with size {n}" ) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 4f0c4daa3c64f..071f5cad725cf 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -172,7 +172,6 @@ def test_setitem(multiple_chunks, key, value, expected): result[key] = value tm.assert_equal(result, expected) - assert result._data.num_chunks == expected._data.num_chunks @skip_if_no_pyarrow From 851704f194bb4a94bf67806f257b7a09fba1ac4a Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 10 Jan 2023 07:47:41 -0500 Subject: [PATCH 5/7] fix min versions --- pandas/core/arrays/arrow/array.py | 52 ++++++++++++++++++------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 375f8a04449a4..82ad3ba046213 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1243,22 +1243,23 @@ def _if_else( try: return pc.if_else(cond, left, right) except pa.ArrowNotImplementedError: + pass - def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]: - if isinstance(value, (pa.Array, pa.ChunkedArray)): - pa_type = value.type - elif isinstance(value, pa.Scalar): - pa_type = value.type - value = value.as_py() - else: - pa_type = None - return np.array(value, 
dtype=object), pa_type + def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]: + if isinstance(value, (pa.Array, pa.ChunkedArray)): + pa_type = value.type + elif isinstance(value, pa.Scalar): + pa_type = value.type + value = value.as_py() + else: + pa_type = None + return np.array(value, dtype=object), pa_type - left, left_type = _to_numpy_and_type(left) - right, right_type = _to_numpy_and_type(right) - pa_type = left_type or right_type - result = np.where(cond, left, right) - return pa.array(result, type=pa_type, from_pandas=True) + left, left_type = _to_numpy_and_type(left) + right, right_type = _to_numpy_and_type(right) + pa_type = left_type or right_type + result = np.where(cond, left, right) + return pa.array(result, type=pa_type, from_pandas=True) @classmethod def _replace_with_mask( @@ -1291,13 +1292,22 @@ def _replace_with_mask( if isinstance(replacements, pa.ChunkedArray): # replacements must be array or scalar, not ChunkedArray replacements = replacements.combine_chunks() + if pa_version_under7p0 and values.null_count > 0: + # pc.replace_with_mask fails to replace nulls in version 6 and lower + if isinstance(replacements, pa.Array): + indices = np.full(len(values), None) + indices[mask] = np.arange(len(replacements)) + indices = pa.array(indices, type=pa.int64()) + replacements = replacements.take(indices) + return cls._if_else(mask, replacements, values) try: return pc.replace_with_mask(values, mask, replacements) except pa.ArrowNotImplementedError: - if isinstance(replacements, (pa.Array, pa.ChunkedArray)): - replacements = np.array(replacements, dtype=object) - elif isinstance(replacements, pa.Scalar): - replacements = replacements.as_py() - result = np.array(values, dtype=object) - result[mask] = replacements - return pa.array(result, type=values.type, from_pandas=True) + pass + if isinstance(replacements, pa.Array): + replacements = np.array(replacements, dtype=object) + elif isinstance(replacements, pa.Scalar): + replacements = replacements.as_py() + result = np.array(values, dtype=object) + result[mask] = replacements + return pa.array(result, type=values.type, from_pandas=True) From 833baa6497f53d883acfde490acdac08e3927ba0 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 10 Jan 2023 17:49:57 -0500 Subject: [PATCH 6/7] fix min versions --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index fb081d0e63c96..4aebe61412866 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -170,7 +170,7 @@ def _maybe_convert_setitem_value(self, value): for v in value: if not (v is None or isinstance(v, str)): raise ValueError("Scalar must be NA or str") - return value + return super()._maybe_convert_setitem_value(value) def isin(self, values) -> npt.NDArray[np.bool_]: value_set = [ From 3c31f1be83596d9cbce592d54935398cd04a7ceb Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 10 Jan 2023 22:12:37 -0500 Subject: [PATCH 7/7] more min version fixes --- pandas/core/arrays/arrow/array.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 82ad3ba046213..5ee93f242b7ea 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1285,15 +1285,13 @@ def _replace_with_mask( ------- pa.Array or pa.ChunkedArray """ - if isinstance(values, pa.ChunkedArray) and pa_version_under8p0: - # 
pc.replace_with_mask segfaults on chunked arrays for - # version 7 and below - values = values.combine_chunks() if isinstance(replacements, pa.ChunkedArray): # replacements must be array or scalar, not ChunkedArray replacements = replacements.combine_chunks() - if pa_version_under7p0 and values.null_count > 0: - # pc.replace_with_mask fails to replace nulls in version 6 and lower + if pa_version_under8p0: + # pc.replace_with_mask seems to be a bit unreliable for versions < 8.0: + # version <= 7: segfaults with various types + # version <= 6: fails to replace nulls if isinstance(replacements, pa.Array): indices = np.full(len(values), None) indices[mask] = np.arange(len(replacements))
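
A note on the recurring pattern in this series: both new helpers, _if_else and _replace_with_mask, first try the native pyarrow compute kernel and only fall back to a NumPy object-dtype round trip when pyarrow raises ArrowNotImplementedError for the type (for example the duration case mentioned in the removed __setitem__ comment). Below is a minimal standalone sketch of that fallback shape, not part of the diff; replace_with_mask_fallback is a hypothetical name, and the real helpers additionally handle ChunkedArray inputs and old-pyarrow quirks that the sketch omits:

    import numpy as np
    import pyarrow as pa
    import pyarrow.compute as pc

    def replace_with_mask_fallback(values: pa.Array, mask: np.ndarray, replacements) -> pa.Array:
        """Set values[mask] = replacements, preferring the native pyarrow kernel."""
        try:
            # Fast path: pyarrow ships a replace_with_mask kernel for most primitive types.
            return pc.replace_with_mask(values, mask, replacements)
        except pa.ArrowNotImplementedError:
            # Fallback: materialize as object ndarrays, assign positionally, then
            # rebuild an Arrow array with the original type so the dtype is preserved.
            if isinstance(replacements, pa.Array):
                replacements = np.array(replacements, dtype=object)
            elif isinstance(replacements, pa.Scalar):
                replacements = replacements.as_py()
            result = np.array(values, dtype=object)
            result[mask] = replacements
            return pa.array(result, type=values.type, from_pandas=True)

    arr = pa.array([1, 2, 3, 4], type=pa.int64())
    mask = np.array([False, True, False, True])
    # int64 takes the kernel path; the values become [1, 10, 3, 40] either way.
    print(replace_with_mask_fallback(arr, mask, pa.array([10, 40])))

The benefit of this shape is that common dtypes stay entirely on the Arrow fast path, and the object-dtype copy is only paid for types the installed pyarrow version cannot handle natively.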