
REF/PERF: ArrowExtensionArray.__setitem__ #50632


Merged · 8 commits · Jan 18, 2023
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/array.py
@@ -93,7 +93,7 @@ def time_setitem(self, multiple_chunks):
self.array[i] = "foo"

def time_setitem_list(self, multiple_chunks):
indexer = list(range(0, 50)) + list(range(-50, 0))
indexer = list(range(0, 50)) + list(range(-1000, 0, 50))
self.array[indexer] = ["foo"] * len(indexer)

def time_setitem_slice(self, multiple_chunks):
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
@@ -825,7 +825,7 @@ Performance improvements
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
- Performance improvement in :meth:`~arrays.IntervalArray.from_tuples` (:issue:`50620`)
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` when key is a null slice (:issue:`50248`)
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` (:issue:`50248`, :issue:`50632`)
- Performance improvement in :class:`~arrays.ArrowExtensionArray` comparison methods when array contains NA (:issue:`50524`)
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`)
- Performance improvement when parsing strings to :class:`BooleanDtype` (:issue:`50613`)
268 changes: 133 additions & 135 deletions pandas/core/arrays/arrow/array.py
@@ -19,13 +19,15 @@
Iterator,
NpDtype,
PositionalIndexer,
Scalar,
SortKind,
TakeIndexer,
npt,
)
from pandas.compat import (
pa_version_under6p0,
pa_version_under7p0,
pa_version_under8p0,
pa_version_under9p0,
)
from pandas.util._decorators import doc
@@ -36,6 +38,7 @@
is_bool_dtype,
is_integer,
is_integer_dtype,
is_list_like,
is_object_dtype,
is_scalar,
)
@@ -1034,76 +1037,56 @@ def __setitem__(self, key, value) -> None:
key = check_array_indexer(self, key)
value = self._maybe_convert_setitem_value(value)

# fast path (GH50248)
if com.is_null_slice(key):
if is_scalar(value):
fill_value = pa.scalar(value, type=self._data.type, from_pandas=True)
try:
self._data = pc.if_else(True, fill_value, self._data)
return
except pa.ArrowNotImplementedError:
# ArrowNotImplementedError: Function 'if_else' has no kernel
# matching input types (bool, duration[ns], duration[ns])
# TODO: remove try/except wrapper if/when pyarrow implements
# a kernel for duration types.
pass
elif len(value) == len(self):
if isinstance(value, type(self)) and value.dtype == self.dtype:
self._data = value._data
else:
arr = pa.array(value, type=self._data.type, from_pandas=True)
self._data = pa.chunked_array([arr])
return

indices = self._indexing_key_to_indices(key)
argsort = np.argsort(indices)
indices = indices[argsort]

if is_scalar(value):
value = np.broadcast_to(value, len(self))
elif len(indices) != len(value):
raise ValueError("Length of indexer and values mismatch")
else:
value = np.asarray(value)[argsort]
# fast path (GH50248)
data = self._if_else(True, value, self._data)

self._data = self._set_via_chunk_iteration(indices=indices, value=value)
elif is_integer(key):
# fast path
key = cast(int, key)
n = len(self)
if key < 0:
key += n
if not 0 <= key < n:
raise IndexError(
f"index {key} is out of bounds for axis 0 with size {n}"
)
if is_list_like(value):
raise ValueError("Length of indexer and values mismatch")
elif isinstance(value, pa.Scalar):
value = value.as_py()
chunks = [
*self._data[:key].chunks,
pa.array([value], type=self._data.type, from_pandas=True),
*self._data[key + 1 :].chunks,
]
data = pa.chunked_array(chunks).combine_chunks()

def _indexing_key_to_indices(
self, key: int | slice | np.ndarray
) -> npt.NDArray[np.intp]:
"""
Convert indexing key for self into positional indices.
elif is_bool_dtype(key):
key = np.asarray(key, dtype=np.bool_)
data = self._replace_with_mask(self._data, key, value)

Parameters
----------
key : int | slice | np.ndarray
elif is_scalar(value) or isinstance(value, pa.Scalar):
mask = np.zeros(len(self), dtype=np.bool_)
mask[key] = True
data = self._if_else(mask, value, self._data)

Returns
-------
npt.NDArray[np.intp]
"""
n = len(self)
if isinstance(key, slice):
indices = np.arange(n)[key]
elif is_integer(key):
# error: Invalid index type "List[Union[int, ndarray[Any, Any]]]"
# for "ndarray[Any, dtype[signedinteger[Any]]]"; expected type
# "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
# integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union
# [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]
# , Tuple[Union[SupportsIndex, _SupportsArray[dtype[Union[bool_
# , integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union
# [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]], ...]]"
indices = np.arange(n)[[key]] # type: ignore[index]
elif is_bool_dtype(key):
key = np.asarray(key)
if len(key) != n:
raise ValueError("Length of indexer and values mismatch")
indices = key.nonzero()[0]
else:
key = np.asarray(key)
indices = np.arange(n)[key]
return indices
indices = np.arange(len(self))[key]
if len(indices) != len(value):
raise ValueError("Length of indexer and values mismatch")
if len(indices) == 0:
return
argsort = np.argsort(indices)
indices = indices[argsort]
value = value.take(argsort)
mask = np.zeros(len(self), dtype=np.bool_)
mask[indices] = True
data = self._replace_with_mask(self._data, mask, value)

if isinstance(data, pa.Array):
data = pa.chunked_array([data])
self._data = data
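
A usage sketch (not part of the diff) exercising each dispatch path of the new `__setitem__`; the `pd.ArrowDtype` spelling assumes pandas 2.0:

```python
import numpy as np
import pandas as pd
import pyarrow as pa

arr = pd.array([1, 2, 3, 4], dtype=pd.ArrowDtype(pa.int64()))

arr[:] = 0                                      # null slice: _if_else(True, ...) fast path
arr[1] = 9                                      # integer key: splice chunks around the position
arr[np.array([True, False, False, True])] = 5   # boolean mask: _replace_with_mask
arr[[0, 3]] = [7, 8]                            # positional indices: converted to a mask, then _replace_with_mask
```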
@mroeschke (Member) commented on Jan 17, 2023:

Are we guaranteed here that self._data.type matches self.dtype.pyarrow_dtype?

Member Author replied:

Yes, I believe so. I addressed the TODO in ArrowExtensionArray._maybe_convert_setitem_value so that it now boxes the setitem values and raises if the replacement values cannot be cast to the original self._data.type.
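
As an illustration of that validation (a sketch, not from the diff), a replacement value that cannot be cast to the underlying pyarrow type now surfaces as a TypeError, matching the updated test_setitem_invalid_dtype expectations below:

```python
import pandas as pd
import pyarrow as pa

arr = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
try:
    arr[0] = 123  # not castable to pa.string()
except TypeError as err:
    print(err)  # Invalid value '123' for dtype string[pyarrow]
```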


def _rank(
self,
@@ -1219,95 +1202,110 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT:

def _maybe_convert_setitem_value(self, value):
"""Maybe convert value to be pyarrow compatible."""
# TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value
if value is None:
return value
if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
return value
if is_list_like(value):
pa_box = pa.array
else:
pa_box = pa.scalar
try:
value = pa_box(value, type=self._data.type, from_pandas=True)
except pa.ArrowTypeError as err:
msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
raise TypeError(msg) from err
return value

def _set_via_chunk_iteration(
self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any]
) -> pa.ChunkedArray:
@classmethod
def _if_else(
cls,
cond: npt.NDArray[np.bool_] | bool,
left: ArrayLike | Scalar,
right: ArrayLike | Scalar,
):
"""
Loop through the array chunks and set the new values while
leaving the chunking layout unchanged.
Choose values based on a condition.

Analogous to pyarrow.compute.if_else, with logic
to fall back to numpy for unsupported types.

Parameters
----------
indices : npt.NDArray[np.intp]
Position indices for the underlying ChunkedArray.

value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
value or values to be set of ``key``.
cond : npt.NDArray[np.bool_] or bool
left : ArrayLike | Scalar
right : ArrayLike | Scalar

Notes
-----
Assumes that indices is sorted. Caller is responsible for sorting.
Returns
-------
pa.Array
"""
new_data = []
stop = 0
for chunk in self._data.iterchunks():
start, stop = stop, stop + len(chunk)
if len(indices) == 0 or stop <= indices[0]:
new_data.append(chunk)
try:
return pc.if_else(cond, left, right)
except pa.ArrowNotImplementedError:
pass

def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
if isinstance(value, (pa.Array, pa.ChunkedArray)):
pa_type = value.type
elif isinstance(value, pa.Scalar):
pa_type = value.type
value = value.as_py()
else:
n = int(np.searchsorted(indices, stop, side="left"))
c_ind = indices[:n] - start
indices = indices[n:]
n = len(c_ind)
c_value, value = value[:n], value[n:]
new_data.append(self._replace_with_indices(chunk, c_ind, c_value))
return pa.chunked_array(new_data)
pa_type = None
return np.array(value, dtype=object), pa_type

left, left_type = _to_numpy_and_type(left)
right, right_type = _to_numpy_and_type(right)
pa_type = left_type or right_type
result = np.where(cond, left, right)
return pa.array(result, type=pa_type, from_pandas=True)

@classmethod
def _replace_with_indices(
def _replace_with_mask(
cls,
chunk: pa.Array,
indices: npt.NDArray[np.intp],
value: npt.NDArray[Any],
) -> pa.Array:
values: pa.Array | pa.ChunkedArray,
mask: npt.NDArray[np.bool_] | bool,
replacements: ArrayLike | Scalar,
):
"""
Replace items selected with a set of positional indices.
Replace items selected with a mask.

Analogous to pyarrow.compute.replace_with_mask, except that replacement
positions are identified via indices rather than a mask.
Analogous to pyarrow.compute.replace_with_mask, with logic
to fall back to numpy for unsupported types.

Parameters
----------
chunk : pa.Array
indices : npt.NDArray[np.intp]
value : npt.NDArray[Any]
Replacement value(s).
values : pa.Array or pa.ChunkedArray
mask : npt.NDArray[np.bool_] or bool
replacements : ArrayLike or Scalar
Replacement value(s).

Returns
-------
pa.Array
pa.Array or pa.ChunkedArray
"""
n = len(indices)

if n == 0:
return chunk

start, stop = indices[[0, -1]]

if (stop - start) == (n - 1):
# fast path for a contiguous set of indices
arrays = [
chunk[:start],
pa.array(value, type=chunk.type, from_pandas=True),
chunk[stop + 1 :],
]
arrays = [arr for arr in arrays if len(arr)]
if len(arrays) == 1:
return arrays[0]
return pa.concat_arrays(arrays)

mask = np.zeros(len(chunk), dtype=np.bool_)
mask[indices] = True

if pa_version_under6p0:
arr = chunk.to_numpy(zero_copy_only=False)
arr[mask] = value
return pa.array(arr, type=chunk.type)

if isna(value).all():
return pc.if_else(mask, None, chunk)

return pc.replace_with_mask(chunk, mask, value)
if isinstance(replacements, pa.ChunkedArray):
# replacements must be array or scalar, not ChunkedArray
replacements = replacements.combine_chunks()
if pa_version_under8p0:
# pc.replace_with_mask seems to be a bit unreliable for versions < 8.0:
# version <= 7: segfaults with various types
# version <= 6: fails to replace nulls
if isinstance(replacements, pa.Array):
indices = np.full(len(values), None)
indices[mask] = np.arange(len(replacements))
indices = pa.array(indices, type=pa.int64())
replacements = replacements.take(indices)
return cls._if_else(mask, replacements, values)
try:
return pc.replace_with_mask(values, mask, replacements)
except pa.ArrowNotImplementedError:
pass
if isinstance(replacements, pa.Array):
replacements = np.array(replacements, dtype=object)
elif isinstance(replacements, pa.Scalar):
replacements = replacements.as_py()
result = np.array(values, dtype=object)
result[mask] = replacements
return pa.array(result, type=values.type, from_pandas=True)
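
A standalone sketch of the pre-8.0 fallback trick used in `_replace_with_mask` above: `replacements` is expanded to full length by taking with a null-filled indexer (pyarrow's `take` propagates null indices as null slots), so `_if_else` can fill the unmasked positions from `values`:

```python
import numpy as np
import pyarrow as pa

values = pa.array([10, 20, 30, 40])
mask = np.array([False, True, False, True])
replacements = pa.array([99, 77])

indices = np.full(len(values), None)          # object array of None
indices[mask] = np.arange(len(replacements))  # positions into `replacements`
indices = pa.array(indices, type=pa.int64())  # [null, 0, null, 1]
expanded = replacements.take(indices)         # [null, 99, null, 77]
```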
2 changes: 1 addition & 1 deletion pandas/core/arrays/string_arrow.py
@@ -170,7 +170,7 @@ def _maybe_convert_setitem_value(self, value):
for v in value:
if not (v is None or isinstance(v, str)):
raise ValueError("Scalar must be NA or str")
return value
return super()._maybe_convert_setitem_value(value)

def isin(self, values) -> npt.NDArray[np.bool_]:
value_set = [
1 change: 0 additions & 1 deletion pandas/tests/arrays/string_/test_string_arrow.py
@@ -172,7 +172,6 @@ def test_setitem(multiple_chunks, key, value, expected):

result[key] = value
tm.assert_equal(result, expected)
assert result._data.num_chunks == expected._data.num_chunks
Member commented:

So this no longer holds?

Member Author replied:

The existing implementation iterated through the chunks and set values chunk by chunk. This implementation passes the entire ChunkedArray to pyarrow's compute functions. At the moment it looks like pyarrow.compute.if_else combines chunks (but still returns a ChunkedArray with one chunk), whereas pyarrow.compute.replace_with_mask maintains the chunking layout of the input; see the sketch after this thread. I'm not sure whether that behavior holds in all cases or varies with the inputs. Let me know if you think pandas should ensure the chunking layout remains unchanged.

Member replied:

Generally I think we should try to maintain the chunking layout as much as possible, but it's more of an implementation detail, and if the pyarrow compute functions don't necessarily maintain the chunking layout I suppose this is fine.
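
A quick sketch of the chunking behavior discussed in this thread (observed behavior; pyarrow does not document it as a guarantee, so the exact layout may vary by version):

```python
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

chunked = pa.chunked_array([["a", "b"], ["c", "d"]])  # two chunks
mask = np.array([True, False, False, True])

out1 = pc.if_else(mask, pa.array(["w", "x", "y", "z"]), chunked)
print(out1.num_chunks)  # 1: chunks combined into a single-chunk ChunkedArray

out2 = pc.replace_with_mask(chunked, mask, pa.array(["w", "z"]))
print(out2.num_chunks)  # 2: input chunk layout preserved
```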



@skip_if_no_pyarrow
8 changes: 4 additions & 4 deletions pandas/tests/extension/test_arrow.py
@@ -1523,8 +1523,8 @@ def test_setitem_invalid_dtype(data):
pa_type = data._data.type
if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
fill_value = 123
err = pa.ArrowTypeError
msg = "Expected bytes"
err = TypeError
msg = "Invalid value '123' for dtype"
elif (
pa.types.is_integer(pa_type)
or pa.types.is_floating(pa_type)
@@ -1535,8 +1535,8 @@
msg = "Could not convert"
else:
fill_value = "foo"
err = pa.ArrowTypeError
msg = "cannot be converted"
err = TypeError
msg = "Invalid value 'foo' for dtype"
with pytest.raises(err, match=msg):
data[:] = fill_value
