class ArrowStringArray:
    """
    Benchmarks for item assignment on a pyarrow-backed string array.

    Every benchmark is parametrized over ``multiple_chunks``: ``False``
    backs the array with a single pyarrow array, ``True`` backs it with a
    chunked array built from slices of 100 elements.
    """

    params = [False, True]
    param_names = ["multiple_chunks"]

    def setup(self, multiple_chunks):
        """Create ``self.array`` from the output of ``tm.rands_array(3, 10_000)``."""
        try:
            import pyarrow as pa
        except ImportError:
            # NOTE(review): presumably this is how the benchmark runner is
            # told to skip the benchmark when pyarrow is missing - confirm.
            raise NotImplementedError

        words = tm.rands_array(3, 10_000)
        if not multiple_chunks:
            backing = pa.array(words)
        else:
            chunk_len = 100
            pieces = [
                words[lo : lo + chunk_len] for lo in range(0, len(words), chunk_len)
            ]
            backing = pa.chunked_array(pieces)
        self.array = pd.arrays.ArrowStringArray(backing)

    def time_setitem(self, multiple_chunks):
        """Assign a scalar to each of the first 200 positions, one at a time."""
        for pos in range(200):
            self.array[pos] = "foo"

    def time_setitem_list(self, multiple_chunks):
        """Assign a list of values to the first 50 and the last 50 positions."""
        indexer = [*range(0, 50), *range(-50, 0)]
        replacements = ["foo"] * len(indexer)
        self.array[indexer] = replacements

    def time_setitem_slice(self, multiple_chunks):
        """Assign a scalar to every tenth position through a slice."""
        self.array[::10] = "foo"
# Methods of ArrowStringArray (pandas/core/arrays/string_arrow.py); the
# class header lies outside this view.


def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
    """
    Set one or more values in place.

    Parameters
    ----------
    key : int, slice or ndarray
        Positions to assign to. Validated with ``check_array_indexer`` and
        then converted to positional indices.
    value : str, NA-like scalar, or sequence of those
        A scalar is broadcast to every selected position. A sequence must
        have exactly one element per selected position.

    Raises
    ------
    ValueError
        If a value is neither NA nor ``str``, or if the number of values
        does not match the number of selected positions.

    Notes
    -----
    When the same position is selected more than once, the value given
    last wins, which is what assigning the pairs one after another would
    produce.
    """
    key = check_array_indexer(self, key)
    indices = self._key_to_indices(key)

    if is_scalar(value):
        if isna(value):
            value = None
        elif not isinstance(value, str):
            raise ValueError("Scalar must be NA or str")
        value = np.broadcast_to(value, len(indices))
    else:
        # Copy so that normalising NA values never touches the caller's data.
        value = np.array(value, dtype=object, copy=True)
        for i, v in enumerate(value):
            if isna(v):
                value[i] = None
            elif not isinstance(v, str):
                raise ValueError("Scalar must be NA or str")

    if len(indices) != len(value):
        raise ValueError("Length of indexer and values mismatch")

    # The chunk-wise update needs ascending positions. The sort must be
    # stable so that, among duplicates, the original order is preserved.
    order = np.argsort(indices, kind="stable")
    indices = indices[order]
    value = value[order]

    if len(indices) > 1:
        # Keep only the last value of each run of equal positions. Without
        # this, a chunk would receive more replacement values than it has
        # masked slots and the update would fail.
        is_last_of_run = np.append(indices[1:] != indices[:-1], True)
        if not is_last_of_run.all():
            indices = indices[is_last_of_run]
            value = value[is_last_of_run]

    self._data = self._set_via_chunk_iteration(indices=indices, value=value)


def _key_to_indices(self, key: int | slice | np.ndarray) -> npt.NDArray[np.intp]:
    """
    Convert indexing key for self to positional indices.

    Slices, boolean masks, single integers and integer arrays are all
    resolved against ``np.arange(len(self))``, so negative positions come
    back as their non-negative equivalents.

    Raises
    ------
    ValueError
        If a boolean mask does not have the same length as ``self``.
    """
    n = len(self)
    if isinstance(key, slice):
        indices = np.arange(n)[key]
    elif is_bool_dtype(key):
        key = np.asarray(key)
        if len(key) != n:
            raise ValueError("Length of indexer and values mismatch")
        indices = key.nonzero()[0]
    else:
        key_arr = np.array([key]) if is_integer(key) else np.asarray(key)
        indices = np.arange(n)[key_arr]
    return indices


def _set_via_chunk_iteration(
    self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any]
) -> pa.ChunkedArray:
    """
    Loop through the array chunks and set the new values while
    leaving the chunking layout unchanged.

    Parameters
    ----------
    indices : ndarray of intp
        Positions in ``self`` to overwrite, sorted ascending.
    value : ndarray
        Replacement values, aligned with ``indices``.

    Returns
    -------
    pa.ChunkedArray
        One chunk per chunk of ``self._data``, with the requested
        positions replaced.
    """
    chunk_indices = self._within_chunk_indices(indices)
    new_data = []

    for chunk, c_ind in zip(self._data.iterchunks(), chunk_indices):
        # Consume the values that belong to this chunk from the front.
        n = len(c_ind)
        c_value, value = value[:n], value[n:]

        if n == 1:
            # fast path
            chunk = self._set_single_index_in_chunk(chunk, c_ind[0], c_value[0])
        elif n > 0:
            mask = np.zeros(len(chunk), dtype=np.bool_)
            mask[c_ind] = True
            if not pa_version_under5p0:
                if isna(c_value).all():
                    # Every replacement is null.
                    chunk = pc.if_else(mask, None, chunk)
                else:
                    chunk = pc.replace_with_mask(chunk, mask, c_value)
            else:
                # The pyarrow compute functions were added in
                # version 5.0. For prior versions we implement
                # our own by converting to numpy and back.
                chunk = chunk.to_numpy(zero_copy_only=False)
                chunk[mask] = c_value
                chunk = pa.array(chunk, type=pa.string())

        new_data.append(chunk)

    # Pass the type explicitly: without it, building a chunked array from
    # an empty list of chunks fails.
    return pa.chunked_array(new_data, type=self._data.type)


@staticmethod
def _set_single_index_in_chunk(chunk: pa.Array, index: int, value: Any) -> pa.Array:
    """
    Set a single position in a pyarrow array.

    The result is the concatenation of the part before ``index``, a
    one-element string array holding ``value``, and the part after
    ``index``.
    """
    assert is_scalar(value)
    return pa.concat_arrays(
        [
            chunk[:index],
            pa.array([value], type=pa.string()),
            chunk[index + 1 :],
        ]
    )


def _within_chunk_indices(
    self, indices: npt.NDArray[np.intp]
) -> list[npt.NDArray[np.intp]]:
    """
    Convert indices for self into a list of ndarrays each containing
    the indices *within* each chunk of the chunked array.

    ``indices`` must be sorted ascending. The returned list has one entry
    per chunk; chunks that receive no assignment get an empty array.
    """
    # indices must be sorted
    chunk_indices = []
    for start, stop in self._chunk_ranges():
        if len(indices) == 0 or indices[0] >= stop:
            c_ind = np.array([], dtype=np.intp)
        else:
            # Everything strictly below ``stop`` belongs to this chunk.
            n = int(np.searchsorted(indices, stop, side="left"))
            c_ind = indices[:n] - start
            indices = indices[n:]
        chunk_indices.append(c_ind)
    return chunk_indices


def _chunk_ranges(self) -> list[tuple]:
    """
    Return a list of tuples each containing the left (inclusive)
    and right (exclusive) bounds of each chunk.
    """
    lengths = [len(c) for c in self._data.iterchunks()]
    stops = np.cumsum(lengths)
    starts = np.concatenate([[0], stops[:-1]])
    return list(zip(starts, stops))
@skip_if_no_pyarrow
@pytest.mark.parametrize("multiple_chunks", [False, True])
@pytest.mark.parametrize(
    "key, value, expected",
    [
        (-1, "XX", ["a", "b", "c", "d", "XX"]),
        (1, "XX", ["a", "XX", "c", "d", "e"]),
        (1, None, ["a", None, "c", "d", "e"]),
        (1, pd.NA, ["a", None, "c", "d", "e"]),
        ([1, 3], "XX", ["a", "XX", "c", "XX", "e"]),
        ([1, 3], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
        ([1, 3], ["XX", None], ["a", "XX", "c", None, "e"]),
        ([1, 3], ["XX", pd.NA], ["a", "XX", "c", None, "e"]),
        ([0, -1], ["XX", "YY"], ["XX", "b", "c", "d", "YY"]),
        ([-1, 0], ["XX", "YY"], ["YY", "b", "c", "d", "XX"]),
        (slice(3, None), "XX", ["a", "b", "c", "XX", "XX"]),
        (slice(2, 4), ["XX", "YY"], ["a", "b", "XX", "YY", "e"]),
        (slice(3, 1, -1), ["XX", "YY"], ["a", "b", "YY", "XX", "e"]),
        (slice(None), "XX", ["XX", "XX", "XX", "XX", "XX"]),
        ([False, True, False, True, False], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
    ],
)
def test_setitem(multiple_chunks, key, value, expected):
    """Assignment gives the expected values and keeps the number of chunks."""
    import pyarrow as pa

    def build(values):
        # Optionally split into two chunks (3 + 2 elements) before wrapping.
        backing = pa.array(values)
        if multiple_chunks:
            backing = pa.chunked_array([backing[:3], backing[3:]])
        return ArrowStringArray(backing)

    result = build(list("abcde"))
    expected = build(expected)

    result[key] = value
    tm.assert_equal(result, expected)
    assert result._data.num_chunks == expected._data.num_chunks


@skip_if_no_pyarrow
def test_setitem_invalid_indexer_raises():
    """Out-of-bounds or wrongly sized indexers raise; so do mismatched values."""
    import pyarrow as pa

    arr = ArrowStringArray(pa.array(list("abcde")))

    # Out-of-bounds positions and a boolean mask of the wrong length.
    for bad_key in (5, -6, [0, 5], [0, -6], [True, True, False]):
        with pytest.raises(IndexError, match=None):
            arr[bad_key] = "foo"

    # More values than selected positions.
    with pytest.raises(ValueError, match=None):
        arr[[0, 1]] = ["foo", "bar", "baz"]