REF/PERF: ArrowStringArray.__setitem__ (#46400)

lukemanley · web-flow · commit ec3eedd9017b · 2022-03-18T12:17:07.000-04:00
diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
@@ -2,6 +2,8 @@
 
 import pandas as pd
 
+from .pandas_vb_common import tm
+
 
 class BooleanArray:
     def setup(self):
@@ -39,3 +41,32 @@ def time_constructor(self):
 
     def time_from_integer_array(self):
         pd.array(self.values_integer, dtype="Int64")
+
+
+class ArrowStringArray:
+
+    params = [False, True]
+    param_names = ["multiple_chunks"]
+
+    def setup(self, multiple_chunks):
+        try:
+            import pyarrow as pa
+        except ImportError:
+            raise NotImplementedError
+        strings = tm.rands_array(3, 10_000)
+        if multiple_chunks:
+            chunks = [strings[i : i + 100] for i in range(0, len(strings), 100)]
+            self.array = pd.arrays.ArrowStringArray(pa.chunked_array(chunks))
+        else:
+            self.array = pd.arrays.ArrowStringArray(pa.array(strings))
+
+    def time_setitem(self, multiple_chunks):
+        for i in range(200):
+            self.array[i] = "foo"
+
+    def time_setitem_list(self, multiple_chunks):
+        indexer = list(range(0, 50)) + list(range(-50, 0))
+        self.array[indexer] = ["foo"] * len(indexer)
+
+    def time_setitem_slice(self, multiple_chunks):
+        self.array[::10] = "foo"
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -323,6 +323,7 @@ Performance improvements
 - Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
 - Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`)
 - Performance improvement in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when target is a :class:`MultiIndex` (:issue:`46235`)
+- Performance improvement when setting values in a pyarrow-backed string array (:issue:`46400`)
 - Performance improvement in :func:`factorize` (:issue:`46109`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
 
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -22,6 +22,7 @@
     pa_version_under2p0,
     pa_version_under3p0,
     pa_version_under4p0,
+    pa_version_under5p0,
 )
 
 PY39 = sys.version_info >= (3, 9)
@@ -148,4 +149,5 @@ def get_lzma_file():
     "pa_version_under2p0",
     "pa_version_under3p0",
     "pa_version_under4p0",
+    "pa_version_under5p0",
 ]
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -6,7 +6,6 @@
     TYPE_CHECKING,
     Any,
     Union,
-    cast,
     overload,
 )
 
@@ -31,6 +30,7 @@
     pa_version_under2p0,
     pa_version_under3p0,
     pa_version_under4p0,
+    pa_version_under5p0,
 )
 from pandas.util._decorators import doc
 
@@ -365,49 +365,125 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
         None
         """
         key = check_array_indexer(self, key)
+        indices = self._key_to_indices(key)
 
-        if is_integer(key):
-            key = cast(int, key)
-
-            if not is_scalar(value):
-                raise ValueError("Must pass scalars with scalar indexer")
-            elif isna(value):
+        if is_scalar(value):
+            if isna(value):
                 value = None
             elif not isinstance(value, str):
                 raise ValueError("Scalar must be NA or str")
+            value = np.broadcast_to(value, len(indices))
+        else:
+            value = np.array(value, dtype=object, copy=True)
+            for i, v in enumerate(value):
+                if isna(v):
+                    value[i] = None
+                elif not isinstance(v, str):
+                    raise ValueError("Scalar must be NA or str")
+
+        if len(indices) != len(value):
+            raise ValueError("Length of indexer and values mismatch")
+
+        argsort = np.argsort(indices)
+        indices = indices[argsort]
+        value = value[argsort]
+
+        self._data = self._set_via_chunk_iteration(indices=indices, value=value)
+
+    def _key_to_indices(self, key: int | slice | np.ndarray) -> npt.NDArray[np.intp]:
+        """Convert indexing key for self to positional indices."""
+        if isinstance(key, slice):
+            indices = np.arange(len(self))[key]
+        elif is_bool_dtype(key):
+            key = np.asarray(key)
+            if len(key) != len(self):
+                raise ValueError("Length of indexer and values mismatch")
+            indices = key.nonzero()[0]
+        else:
+            key_arr = np.array([key]) if is_integer(key) else np.asarray(key)
+            indices = np.arange(len(self))[key_arr]
+        return indices
 
-            # Slice data and insert in-between
-            new_data = [
-                *self._data[0:key].chunks,
+    def _set_via_chunk_iteration(
+        self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any]
+    ) -> pa.ChunkedArray:
+        """
+        Loop through the array chunks and set the new values while
+        leaving the chunking layout unchanged.
+        """
+
+        chunk_indices = self._within_chunk_indices(indices)
+        new_data = []
+
+        for i, chunk in enumerate(self._data.iterchunks()):
+
+            c_ind = chunk_indices[i]
+            n = len(c_ind)
+            c_value, value = value[:n], value[n:]
+
+            if n == 1:
+                # fast path
+                chunk = self._set_single_index_in_chunk(chunk, c_ind[0], c_value[0])
+            elif n > 0:
+                mask = np.zeros(len(chunk), dtype=np.bool_)
+                mask[c_ind] = True
+                if not pa_version_under5p0:
+                    if c_value is None or isna(np.array(c_value)).all():
+                        chunk = pc.if_else(mask, None, chunk)
+                    else:
+                        chunk = pc.replace_with_mask(chunk, mask, c_value)
+                else:
+                    # The pyarrow compute functions were added in
+                    # version 5.0. For prior versions we implement
+                    # our own by converting to numpy and back.
+                    chunk = chunk.to_numpy(zero_copy_only=False)
+                    chunk[mask] = c_value
+                    chunk = pa.array(chunk, type=pa.string())
+
+            new_data.append(chunk)
+
+        return pa.chunked_array(new_data)
+
+    @staticmethod
+    def _set_single_index_in_chunk(chunk: pa.Array, index: int, value: Any) -> pa.Array:
+        """Set a single position in a pyarrow array."""
+        assert is_scalar(value)
+        return pa.concat_arrays(
+            [
+                chunk[:index],
                 pa.array([value], type=pa.string()),
-                *self._data[(key + 1) :].chunks,
+                chunk[index + 1 :],
             ]
-            self._data = pa.chunked_array(new_data)
-        else:
-            # Convert to integer indices and iteratively assign.
-            # TODO: Make a faster variant of this in Arrow upstream.
-            #       This is probably extremely slow.
-
-            # Convert all possible input key types to an array of integers
-            if isinstance(key, slice):
-                key_array = np.array(range(len(self))[key])
-            elif is_bool_dtype(key):
-                # TODO(ARROW-9430): Directly support setitem(booleans)
-                key_array = np.argwhere(key).flatten()
-            else:
-                # TODO(ARROW-9431): Directly support setitem(integers)
-                key_array = np.asanyarray(key)
+        )
 
-            if is_scalar(value):
-                value = np.broadcast_to(value, len(key_array))
+    def _within_chunk_indices(
+        self, indices: npt.NDArray[np.intp]
+    ) -> list[npt.NDArray[np.intp]]:
+        """
+        Convert indices for self into a list of ndarrays each containing
+        the indices *within* each chunk of the chunked array.
+        """
+        # indices must be sorted
+        chunk_indices = []
+        for start, stop in self._chunk_ranges():
+            if len(indices) == 0 or indices[0] >= stop:
+                c_ind = np.array([], dtype=np.intp)
             else:
-                value = np.asarray(value)
+                n = int(np.searchsorted(indices, stop, side="left"))
+                c_ind = indices[:n] - start
+                indices = indices[n:]
+            chunk_indices.append(c_ind)
+        return chunk_indices
 
-            if len(key_array) != len(value):
-                raise ValueError("Length of indexer and values mismatch")
-
-            for k, v in zip(key_array, value):
-                self[k] = v
+    def _chunk_ranges(self) -> list[tuple]:
+        """
+        Return a list of tuples each containing the left (inclusive)
+        and right (exclusive) bounds of each chunk.
+        """
+        lengths = [len(c) for c in self._data.iterchunks()]
+        stops = np.cumsum(lengths)
+        starts = np.concatenate([[0], stops[:-1]])
+        return list(zip(starts, stops))
 
     def take(
         self,
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -132,3 +132,68 @@ def test_pyarrow_not_installed_raises():
 
     with pytest.raises(ImportError, match=msg):
         ArrowStringArray._from_sequence(["a", None, "b"])
+
+
+@skip_if_no_pyarrow
+@pytest.mark.parametrize("multiple_chunks", [False, True])
+@pytest.mark.parametrize(
+    "key, value, expected",
+    [
+        (-1, "XX", ["a", "b", "c", "d", "XX"]),
+        (1, "XX", ["a", "XX", "c", "d", "e"]),
+        (1, None, ["a", None, "c", "d", "e"]),
+        (1, pd.NA, ["a", None, "c", "d", "e"]),
+        ([1, 3], "XX", ["a", "XX", "c", "XX", "e"]),
+        ([1, 3], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
+        ([1, 3], ["XX", None], ["a", "XX", "c", None, "e"]),
+        ([1, 3], ["XX", pd.NA], ["a", "XX", "c", None, "e"]),
+        ([0, -1], ["XX", "YY"], ["XX", "b", "c", "d", "YY"]),
+        ([-1, 0], ["XX", "YY"], ["YY", "b", "c", "d", "XX"]),
+        (slice(3, None), "XX", ["a", "b", "c", "XX", "XX"]),
+        (slice(2, 4), ["XX", "YY"], ["a", "b", "XX", "YY", "e"]),
+        (slice(3, 1, -1), ["XX", "YY"], ["a", "b", "YY", "XX", "e"]),
+        (slice(None), "XX", ["XX", "XX", "XX", "XX", "XX"]),
+        ([False, True, False, True, False], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
+    ],
+)
+def test_setitem(multiple_chunks, key, value, expected):
+    import pyarrow as pa
+
+    result = pa.array(list("abcde"))
+    expected = pa.array(expected)
+
+    if multiple_chunks:
+        result = pa.chunked_array([result[:3], result[3:]])
+        expected = pa.chunked_array([expected[:3], expected[3:]])
+
+    result = ArrowStringArray(result)
+    expected = ArrowStringArray(expected)
+
+    result[key] = value
+    tm.assert_equal(result, expected)
+    assert result._data.num_chunks == expected._data.num_chunks
+
+
+@skip_if_no_pyarrow
+def test_setitem_invalid_indexer_raises():
+    import pyarrow as pa
+
+    arr = ArrowStringArray(pa.array(list("abcde")))
+
+    with pytest.raises(IndexError, match=None):
+        arr[5] = "foo"
+
+    with pytest.raises(IndexError, match=None):
+        arr[-6] = "foo"
+
+    with pytest.raises(IndexError, match=None):
+        arr[[0, 5]] = "foo"
+
+    with pytest.raises(IndexError, match=None):
+        arr[[0, -6]] = "foo"
+
+    with pytest.raises(IndexError, match=None):
+        arr[[True, True, False]] = "foo"
+
+    with pytest.raises(ValueError, match=None):
+        arr[[0, 1]] = ["foo", "bar", "baz"]

Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@`
`22`	`22`	`pa_version_under2p0,`
`23`	`23`	`pa_version_under3p0,`
`24`	`24`	`pa_version_under4p0,`
	`25`	`+ pa_version_under5p0,`
`25`	`26`	`)`
`26`	`27`
`27`	`28`	`PY39 = sys.version_info >= (3, 9)`
`@@ -148,4 +149,5 @@ def get_lzma_file():`
`148`	`149`	`"pa_version_under2p0",`
`149`	`150`	`"pa_version_under3p0",`
`150`	`151`	`"pa_version_under4p0",`
	`152`	`+ "pa_version_under5p0",`
`151`	`153`	`]`