Skip to content

REF/PERF: ArrowStringArray.__setitem__ #46400

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions asv_bench/benchmarks/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import pandas as pd

from .pandas_vb_common import tm


class BooleanArray:
def setup(self):
Expand Down Expand Up @@ -39,3 +41,32 @@ def time_constructor(self):

def time_from_integer_array(self):
pd.array(self.values_integer, dtype="Int64")


class ArrowStringArray:
    # Run each benchmark twice: once against a single-chunk backing array
    # and once against a many-chunk one.
    params = [False, True]
    param_names = ["multiple_chunks"]

    def setup(self, multiple_chunks):
        try:
            import pyarrow as pa
        except ImportError:
            # asv convention: raising NotImplementedError skips the benchmark.
            raise NotImplementedError
        strings = tm.rands_array(3, 10_000)
        if not multiple_chunks:
            self.array = pd.arrays.ArrowStringArray(pa.array(strings))
        else:
            step = 100
            pieces = [strings[start : start + step] for start in range(0, len(strings), step)]
            self.array = pd.arrays.ArrowStringArray(pa.chunked_array(pieces))

    def time_setitem(self, multiple_chunks):
        # Scalar assignment at many individual integer positions.
        for idx in range(200):
            self.array[idx] = "foo"

    def time_setitem_list(self, multiple_chunks):
        # List-of-integers indexer, mixing positive and negative positions.
        positions = list(range(0, 50)) + list(range(-50, 0))
        self.array[positions] = ["foo"] * len(positions)

    def time_setitem_slice(self, multiple_chunks):
        # Strided-slice indexer with a broadcast scalar value.
        self.array[::10] = "foo"
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ Performance improvements
- Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
- Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`)
- Performance improvement in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when target is a :class:`MultiIndex` (:issue:`46235`)
- Performance improvement when setting values in a pyarrow-backed string array (:issue:`46400`)
- Performance improvement in :func:`factorize` (:issue:`46109`)
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)

Expand Down
2 changes: 2 additions & 0 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
pa_version_under2p0,
pa_version_under3p0,
pa_version_under4p0,
pa_version_under5p0,
)

PY39 = sys.version_info >= (3, 9)
Expand Down Expand Up @@ -148,4 +149,5 @@ def get_lzma_file():
"pa_version_under2p0",
"pa_version_under3p0",
"pa_version_under4p0",
"pa_version_under5p0",
]
144 changes: 110 additions & 34 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
TYPE_CHECKING,
Any,
Union,
cast,
overload,
)

Expand All @@ -31,6 +30,7 @@
pa_version_under2p0,
pa_version_under3p0,
pa_version_under4p0,
pa_version_under5p0,
)
from pandas.util._decorators import doc

Expand Down Expand Up @@ -362,49 +362,125 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
None
"""
key = check_array_indexer(self, key)
indices = self._key_to_indices(key)

if is_integer(key):
key = cast(int, key)

if not is_scalar(value):
raise ValueError("Must pass scalars with scalar indexer")
elif isna(value):
if is_scalar(value):
if isna(value):
value = None
elif not isinstance(value, str):
raise ValueError("Scalar must be NA or str")
value = np.broadcast_to(value, len(indices))
else:
value = np.array(value, dtype=object, copy=True)
for i, v in enumerate(value):
if isna(v):
value[i] = None
elif not isinstance(v, str):
raise ValueError("Scalar must be NA or str")

if len(indices) != len(value):
raise ValueError("Length of indexer and values mismatch")

argsort = np.argsort(indices)
indices = indices[argsort]
value = value[argsort]

self._data = self._set_via_chunk_iteration(indices=indices, value=value)

def _key_to_indices(self, key: int | slice | np.ndarray) -> npt.NDArray[np.intp]:
"""Convert indexing key for self to positional indices."""
if isinstance(key, slice):
indices = np.arange(len(self))[key]
elif is_bool_dtype(key):
key = np.asarray(key)
if len(key) != len(self):
raise ValueError("Length of indexer and values mismatch")
indices = key.nonzero()[0]
else:
key_arr = np.array([key]) if is_integer(key) else np.asarray(key)
indices = np.arange(len(self))[key_arr]
return indices

# Slice data and insert in-between
new_data = [
*self._data[0:key].chunks,
def _set_via_chunk_iteration(
    self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any]
) -> pa.ChunkedArray:
    """
    Loop through the array chunks and set the new values while
    leaving the chunking layout unchanged.

    Parameters
    ----------
    indices : ndarray of intp
        Positional indices into self; assumed sorted ascending so they
        can be consumed chunk by chunk (see ``_within_chunk_indices``).
    value : ndarray
        New values, aligned one-to-one with ``indices``.

    Returns
    -------
    pa.ChunkedArray
        New chunked array with the same chunk boundaries as self._data.
    """

    # Split the sorted global indices into per-chunk local index arrays.
    chunk_indices = self._within_chunk_indices(indices)
    new_data = []

    for i, chunk in enumerate(self._data.iterchunks()):

        c_ind = chunk_indices[i]
        n = len(c_ind)
        # Consume the first n values for this chunk; keep the remainder
        # for the following chunks.
        c_value, value = value[:n], value[n:]

        if n == 1:
            # fast path
            chunk = self._set_single_index_in_chunk(chunk, c_ind[0], c_value[0])
        elif n > 0:
            # Boolean mask of the positions to replace within this chunk.
            mask = np.zeros(len(chunk), dtype=np.bool_)
            mask[c_ind] = True
            if not pa_version_under5p0:
                # NOTE(review): ``c_value`` is an ndarray slice here, so
                # the ``c_value is None`` branch looks unreachable — confirm.
                if c_value is None or isna(np.array(c_value)).all():
                    # All replacements are null: null out masked positions.
                    chunk = pc.if_else(mask, None, chunk)
                else:
                    chunk = pc.replace_with_mask(chunk, mask, c_value)
            else:
                # The pyarrow compute functions were added in
                # version 5.0. For prior versions we implement
                # our own by converting to numpy and back.
                chunk = chunk.to_numpy(zero_copy_only=False)
                chunk[mask] = c_value
                chunk = pa.array(chunk, type=pa.string())

        new_data.append(chunk)

    return pa.chunked_array(new_data)

@staticmethod
def _set_single_index_in_chunk(chunk: pa.Array, index: int, value: Any) -> pa.Array:
    """
    Set a single position in a pyarrow array.

    Parameters
    ----------
    chunk : pa.Array
        The chunk to modify (not mutated; a new array is returned).
    index : int
        Position within ``chunk`` to replace.
    value : str or None
        Replacement value.

    Returns
    -------
    pa.Array
        ``chunk`` with position ``index`` replaced by ``value``.
    """
    assert is_scalar(value)
    # Rebuild the chunk as before-slice + new element + after-slice;
    # cheaper than a mask-based replacement for a single element.
    return pa.concat_arrays(
        [
            chunk[:index],
            pa.array([value], type=pa.string()),
            chunk[index + 1 :],
        ]
    )

if is_scalar(value):
value = np.broadcast_to(value, len(key_array))
def _within_chunk_indices(
self, indices: npt.NDArray[np.intp]
) -> list[npt.NDArray[np.intp]]:
"""
Convert indices for self into a list of ndarrays each containing
the indices *within* each chunk of the chunked array.
"""
# indices must be sorted
chunk_indices = []
for start, stop in self._chunk_ranges():
if len(indices) == 0 or indices[0] >= stop:
c_ind = np.array([], dtype=np.intp)
else:
value = np.asarray(value)
n = int(np.searchsorted(indices, stop, side="left"))
c_ind = indices[:n] - start
indices = indices[n:]
chunk_indices.append(c_ind)
return chunk_indices

if len(key_array) != len(value):
raise ValueError("Length of indexer and values mismatch")

for k, v in zip(key_array, value):
self[k] = v
def _chunk_ranges(self) -> list[tuple]:
"""
Return a list of tuples each containing the left (inclusive)
and right (exclusive) bounds of each chunk.
"""
lengths = [len(c) for c in self._data.iterchunks()]
stops = np.cumsum(lengths)
starts = np.concatenate([[0], stops[:-1]])
return list(zip(starts, stops))

def take(
self,
Expand Down
65 changes: 65 additions & 0 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,68 @@ def test_pyarrow_not_installed_raises():

with pytest.raises(ImportError, match=msg):
ArrowStringArray._from_sequence(["a", None, "b"])


@skip_if_no_pyarrow
@pytest.mark.parametrize("multiple_chunks", [False, True])
@pytest.mark.parametrize(
    "key, value, expected",
    [
        (-1, "XX", ["a", "b", "c", "d", "XX"]),
        (1, "XX", ["a", "XX", "c", "d", "e"]),
        (1, None, ["a", None, "c", "d", "e"]),
        (1, pd.NA, ["a", None, "c", "d", "e"]),
        ([1, 3], "XX", ["a", "XX", "c", "XX", "e"]),
        ([1, 3], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
        ([1, 3], ["XX", None], ["a", "XX", "c", None, "e"]),
        ([1, 3], ["XX", pd.NA], ["a", "XX", "c", None, "e"]),
        ([0, -1], ["XX", "YY"], ["XX", "b", "c", "d", "YY"]),
        ([-1, 0], ["XX", "YY"], ["YY", "b", "c", "d", "XX"]),
        (slice(3, None), "XX", ["a", "b", "c", "XX", "XX"]),
        (slice(2, 4), ["XX", "YY"], ["a", "b", "XX", "YY", "e"]),
        (slice(3, 1, -1), ["XX", "YY"], ["a", "b", "YY", "XX", "e"]),
        (slice(None), "XX", ["XX", "XX", "XX", "XX", "XX"]),
        ([False, True, False, True, False], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
    ],
)
def test_setitem(multiple_chunks, key, value, expected):
    # Exercise __setitem__ against both single- and multi-chunk layouts.
    import pyarrow as pa

    base = pa.array(list("abcde"))
    target = pa.array(expected)

    if multiple_chunks:
        # Split both arrays into a 3 + 2 chunk layout.
        base = pa.chunked_array([base[:3], base[3:]])
        target = pa.chunked_array([target[:3], target[3:]])

    arr = ArrowStringArray(base)
    expected_arr = ArrowStringArray(target)

    arr[key] = value
    tm.assert_equal(arr, expected_arr)
    # Setting values must not change the chunking layout.
    assert arr._data.num_chunks == expected_arr._data.num_chunks


@skip_if_no_pyarrow
def test_setitem_invalid_indexer_raises():
    import pyarrow as pa

    arr = ArrowStringArray(pa.array(list("abcde")))

    # Out-of-bounds scalars and lists, and a wrong-length boolean mask,
    # must all raise IndexError.
    for bad_key in (5, -6, [0, 5], [0, -6], [True, True, False]):
        with pytest.raises(IndexError, match=None):
            arr[bad_key] = "foo"

    # Mismatched indexer/value lengths raise ValueError instead.
    with pytest.raises(ValueError, match=None):
        arr[[0, 1]] = ["foo", "bar", "baz"]