Skip to content

Commit 601f227

Browse files
authored
REF/PERF: ArrowExtensionArray.__setitem__ (#50632)
* REF/PERF: ArrowExtensionArray.__setitem__
* update asv
* whatsnew
* fixes
* fix min versions
* fix min versions
* more min version fixes
1 parent 3d89931 commit 601f227

File tree

6 files changed

+140
-143
lines changed

6 files changed

+140
-143
lines changed

asv_bench/benchmarks/array.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def time_setitem(self, multiple_chunks):
9393
self.array[i] = "foo"
9494

9595
def time_setitem_list(self, multiple_chunks):
96-
indexer = list(range(0, 50)) + list(range(-50, 0))
96+
indexer = list(range(0, 50)) + list(range(-1000, 0, 50))
9797
self.array[indexer] = ["foo"] * len(indexer)
9898

9999
def time_setitem_slice(self, multiple_chunks):

doc/source/whatsnew/v2.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -850,7 +850,7 @@ Performance improvements
850850
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
851851
- Performance improvement in :meth:`~arrays.IntervalArray.from_tuples` (:issue:`50620`)
852852
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
853-
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` when key is a null slice (:issue:`50248`)
853+
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` (:issue:`50248`, :issue:`50632`)
854854
- Performance improvement in :class:`~arrays.ArrowExtensionArray` comparison methods when array contains NA (:issue:`50524`)
855855
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`)
856856
- Performance improvement when parsing strings to :class:`BooleanDtype` (:issue:`50613`)

pandas/core/arrays/arrow/array.py

+133-135
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,15 @@
1919
Iterator,
2020
NpDtype,
2121
PositionalIndexer,
22+
Scalar,
2223
SortKind,
2324
TakeIndexer,
2425
npt,
2526
)
2627
from pandas.compat import (
2728
pa_version_under6p0,
2829
pa_version_under7p0,
30+
pa_version_under8p0,
2931
pa_version_under9p0,
3032
)
3133
from pandas.util._decorators import doc
@@ -36,6 +38,7 @@
3638
is_bool_dtype,
3739
is_integer,
3840
is_integer_dtype,
41+
is_list_like,
3942
is_object_dtype,
4043
is_scalar,
4144
)
@@ -1056,76 +1059,56 @@ def __setitem__(self, key, value) -> None:
10561059
key = check_array_indexer(self, key)
10571060
value = self._maybe_convert_setitem_value(value)
10581061

1059-
# fast path (GH50248)
10601062
if com.is_null_slice(key):
1061-
if is_scalar(value):
1062-
fill_value = pa.scalar(value, type=self._data.type, from_pandas=True)
1063-
try:
1064-
self._data = pc.if_else(True, fill_value, self._data)
1065-
return
1066-
except pa.ArrowNotImplementedError:
1067-
# ArrowNotImplementedError: Function 'if_else' has no kernel
1068-
# matching input types (bool, duration[ns], duration[ns])
1069-
# TODO: remove try/except wrapper if/when pyarrow implements
1070-
# a kernel for duration types.
1071-
pass
1072-
elif len(value) == len(self):
1073-
if isinstance(value, type(self)) and value.dtype == self.dtype:
1074-
self._data = value._data
1075-
else:
1076-
arr = pa.array(value, type=self._data.type, from_pandas=True)
1077-
self._data = pa.chunked_array([arr])
1078-
return
1079-
1080-
indices = self._indexing_key_to_indices(key)
1081-
argsort = np.argsort(indices)
1082-
indices = indices[argsort]
1083-
1084-
if is_scalar(value):
1085-
value = np.broadcast_to(value, len(self))
1086-
elif len(indices) != len(value):
1087-
raise ValueError("Length of indexer and values mismatch")
1088-
else:
1089-
value = np.asarray(value)[argsort]
1063+
# fast path (GH50248)
1064+
data = self._if_else(True, value, self._data)
10901065

1091-
self._data = self._set_via_chunk_iteration(indices=indices, value=value)
1066+
elif is_integer(key):
1067+
# fast path
1068+
key = cast(int, key)
1069+
n = len(self)
1070+
if key < 0:
1071+
key += n
1072+
if not 0 <= key < n:
1073+
raise IndexError(
1074+
f"index {key} is out of bounds for axis 0 with size {n}"
1075+
)
1076+
if is_list_like(value):
1077+
raise ValueError("Length of indexer and values mismatch")
1078+
elif isinstance(value, pa.Scalar):
1079+
value = value.as_py()
1080+
chunks = [
1081+
*self._data[:key].chunks,
1082+
pa.array([value], type=self._data.type, from_pandas=True),
1083+
*self._data[key + 1 :].chunks,
1084+
]
1085+
data = pa.chunked_array(chunks).combine_chunks()
10921086

1093-
def _indexing_key_to_indices(
1094-
self, key: int | slice | np.ndarray
1095-
) -> npt.NDArray[np.intp]:
1096-
"""
1097-
Convert indexing key for self into positional indices.
1087+
elif is_bool_dtype(key):
1088+
key = np.asarray(key, dtype=np.bool_)
1089+
data = self._replace_with_mask(self._data, key, value)
10981090

1099-
Parameters
1100-
----------
1101-
key : int | slice | np.ndarray
1091+
elif is_scalar(value) or isinstance(value, pa.Scalar):
1092+
mask = np.zeros(len(self), dtype=np.bool_)
1093+
mask[key] = True
1094+
data = self._if_else(mask, value, self._data)
11021095

1103-
Returns
1104-
-------
1105-
npt.NDArray[np.intp]
1106-
"""
1107-
n = len(self)
1108-
if isinstance(key, slice):
1109-
indices = np.arange(n)[key]
1110-
elif is_integer(key):
1111-
# error: Invalid index type "List[Union[int, ndarray[Any, Any]]]"
1112-
# for "ndarray[Any, dtype[signedinteger[Any]]]"; expected type
1113-
# "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
1114-
# integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union
1115-
# [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]
1116-
# , Tuple[Union[SupportsIndex, _SupportsArray[dtype[Union[bool_
1117-
# , integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union
1118-
# [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]], ...]]"
1119-
indices = np.arange(n)[[key]] # type: ignore[index]
1120-
elif is_bool_dtype(key):
1121-
key = np.asarray(key)
1122-
if len(key) != n:
1123-
raise ValueError("Length of indexer and values mismatch")
1124-
indices = key.nonzero()[0]
11251096
else:
1126-
key = np.asarray(key)
1127-
indices = np.arange(n)[key]
1128-
return indices
1097+
indices = np.arange(len(self))[key]
1098+
if len(indices) != len(value):
1099+
raise ValueError("Length of indexer and values mismatch")
1100+
if len(indices) == 0:
1101+
return
1102+
argsort = np.argsort(indices)
1103+
indices = indices[argsort]
1104+
value = value.take(argsort)
1105+
mask = np.zeros(len(self), dtype=np.bool_)
1106+
mask[indices] = True
1107+
data = self._replace_with_mask(self._data, mask, value)
1108+
1109+
if isinstance(data, pa.Array):
1110+
data = pa.chunked_array([data])
1111+
self._data = data
11291112

11301113
def _rank(
11311114
self,
@@ -1241,95 +1224,110 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra
12411224

12421225
def _maybe_convert_setitem_value(self, value):
12431226
"""Maybe convert value to be pyarrow compatible."""
1244-
# TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value
1227+
if value is None:
1228+
return value
1229+
if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
1230+
return value
1231+
if is_list_like(value):
1232+
pa_box = pa.array
1233+
else:
1234+
pa_box = pa.scalar
1235+
try:
1236+
value = pa_box(value, type=self._data.type, from_pandas=True)
1237+
except pa.ArrowTypeError as err:
1238+
msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
1239+
raise TypeError(msg) from err
12451240
return value
12461241

1247-
def _set_via_chunk_iteration(
1248-
self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any]
1249-
) -> pa.ChunkedArray:
1242+
@classmethod
1243+
def _if_else(
1244+
cls,
1245+
cond: npt.NDArray[np.bool_] | bool,
1246+
left: ArrayLike | Scalar,
1247+
right: ArrayLike | Scalar,
1248+
):
12501249
"""
1251-
Loop through the array chunks and set the new values while
1252-
leaving the chunking layout unchanged.
1250+
Choose values based on a condition.
1251+
1252+
Analogous to pyarrow.compute.if_else, with logic
1253+
to fallback to numpy for unsupported types.
12531254
12541255
Parameters
12551256
----------
1256-
indices : npt.NDArray[np.intp]
1257-
Position indices for the underlying ChunkedArray.
1258-
1259-
value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
1260-
value or values to be set of ``key``.
1257+
cond : npt.NDArray[np.bool_] or bool
1258+
left : ArrayLike | Scalar
1259+
right : ArrayLike | Scalar
12611260
1262-
Notes
1263-
-----
1264-
Assumes that indices is sorted. Caller is responsible for sorting.
1261+
Returns
1262+
-------
1263+
pa.Array
12651264
"""
1266-
new_data = []
1267-
stop = 0
1268-
for chunk in self._data.iterchunks():
1269-
start, stop = stop, stop + len(chunk)
1270-
if len(indices) == 0 or stop <= indices[0]:
1271-
new_data.append(chunk)
1265+
try:
1266+
return pc.if_else(cond, left, right)
1267+
except pa.ArrowNotImplementedError:
1268+
pass
1269+
1270+
def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
1271+
if isinstance(value, (pa.Array, pa.ChunkedArray)):
1272+
pa_type = value.type
1273+
elif isinstance(value, pa.Scalar):
1274+
pa_type = value.type
1275+
value = value.as_py()
12721276
else:
1273-
n = int(np.searchsorted(indices, stop, side="left"))
1274-
c_ind = indices[:n] - start
1275-
indices = indices[n:]
1276-
n = len(c_ind)
1277-
c_value, value = value[:n], value[n:]
1278-
new_data.append(self._replace_with_indices(chunk, c_ind, c_value))
1279-
return pa.chunked_array(new_data)
1277+
pa_type = None
1278+
return np.array(value, dtype=object), pa_type
1279+
1280+
left, left_type = _to_numpy_and_type(left)
1281+
right, right_type = _to_numpy_and_type(right)
1282+
pa_type = left_type or right_type
1283+
result = np.where(cond, left, right)
1284+
return pa.array(result, type=pa_type, from_pandas=True)
12801285

12811286
@classmethod
1282-
def _replace_with_indices(
1287+
def _replace_with_mask(
12831288
cls,
1284-
chunk: pa.Array,
1285-
indices: npt.NDArray[np.intp],
1286-
value: npt.NDArray[Any],
1287-
) -> pa.Array:
1289+
values: pa.Array | pa.ChunkedArray,
1290+
mask: npt.NDArray[np.bool_] | bool,
1291+
replacements: ArrayLike | Scalar,
1292+
):
12881293
"""
1289-
Replace items selected with a set of positional indices.
1294+
Replace items selected with a mask.
12901295
1291-
Analogous to pyarrow.compute.replace_with_mask, except that replacement
1292-
positions are identified via indices rather than a mask.
1296+
Analogous to pyarrow.compute.replace_with_mask, with logic
1297+
to fallback to numpy for unsupported types.
12931298
12941299
Parameters
12951300
----------
1296-
chunk : pa.Array
1297-
indices : npt.NDArray[np.intp]
1298-
value : npt.NDArray[Any]
1299-
Replacement value(s).
1301+
values : pa.Array or pa.ChunkedArray
1302+
mask : npt.NDArray[np.bool_] or bool
1303+
replacements : ArrayLike or Scalar
1304+
Replacement value(s)
13001305
13011306
Returns
13021307
-------
1303-
pa.Array
1308+
pa.Array or pa.ChunkedArray
13041309
"""
1305-
n = len(indices)
1306-
1307-
if n == 0:
1308-
return chunk
1309-
1310-
start, stop = indices[[0, -1]]
1311-
1312-
if (stop - start) == (n - 1):
1313-
# fast path for a contiguous set of indices
1314-
arrays = [
1315-
chunk[:start],
1316-
pa.array(value, type=chunk.type, from_pandas=True),
1317-
chunk[stop + 1 :],
1318-
]
1319-
arrays = [arr for arr in arrays if len(arr)]
1320-
if len(arrays) == 1:
1321-
return arrays[0]
1322-
return pa.concat_arrays(arrays)
1323-
1324-
mask = np.zeros(len(chunk), dtype=np.bool_)
1325-
mask[indices] = True
1326-
1327-
if pa_version_under6p0:
1328-
arr = chunk.to_numpy(zero_copy_only=False)
1329-
arr[mask] = value
1330-
return pa.array(arr, type=chunk.type)
1331-
1332-
if isna(value).all():
1333-
return pc.if_else(mask, None, chunk)
1334-
1335-
return pc.replace_with_mask(chunk, mask, value)
1310+
if isinstance(replacements, pa.ChunkedArray):
1311+
# replacements must be array or scalar, not ChunkedArray
1312+
replacements = replacements.combine_chunks()
1313+
if pa_version_under8p0:
1314+
# pc.replace_with_mask seems to be a bit unreliable for versions < 8.0:
1315+
# version <= 7: segfaults with various types
1316+
# version <= 6: fails to replace nulls
1317+
if isinstance(replacements, pa.Array):
1318+
indices = np.full(len(values), None)
1319+
indices[mask] = np.arange(len(replacements))
1320+
indices = pa.array(indices, type=pa.int64())
1321+
replacements = replacements.take(indices)
1322+
return cls._if_else(mask, replacements, values)
1323+
try:
1324+
return pc.replace_with_mask(values, mask, replacements)
1325+
except pa.ArrowNotImplementedError:
1326+
pass
1327+
if isinstance(replacements, pa.Array):
1328+
replacements = np.array(replacements, dtype=object)
1329+
elif isinstance(replacements, pa.Scalar):
1330+
replacements = replacements.as_py()
1331+
result = np.array(values, dtype=object)
1332+
result[mask] = replacements
1333+
return pa.array(result, type=values.type, from_pandas=True)

pandas/core/arrays/string_arrow.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ def _maybe_convert_setitem_value(self, value):
170170
for v in value:
171171
if not (v is None or isinstance(v, str)):
172172
raise ValueError("Scalar must be NA or str")
173-
return value
173+
return super()._maybe_convert_setitem_value(value)
174174

175175
def isin(self, values) -> npt.NDArray[np.bool_]:
176176
value_set = [

pandas/tests/arrays/string_/test_string_arrow.py

-1
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,6 @@ def test_setitem(multiple_chunks, key, value, expected):
172172

173173
result[key] = value
174174
tm.assert_equal(result, expected)
175-
assert result._data.num_chunks == expected._data.num_chunks
176175

177176

178177
@skip_if_no_pyarrow

pandas/tests/extension/test_arrow.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -1526,8 +1526,8 @@ def test_setitem_invalid_dtype(data):
15261526
pa_type = data._data.type
15271527
if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
15281528
fill_value = 123
1529-
err = pa.ArrowTypeError
1530-
msg = "Expected bytes"
1529+
err = TypeError
1530+
msg = "Invalid value '123' for dtype"
15311531
elif (
15321532
pa.types.is_integer(pa_type)
15331533
or pa.types.is_floating(pa_type)
@@ -1538,8 +1538,8 @@ def test_setitem_invalid_dtype(data):
15381538
msg = "Could not convert"
15391539
else:
15401540
fill_value = "foo"
1541-
err = pa.ArrowTypeError
1542-
msg = "cannot be converted"
1541+
err = TypeError
1542+
msg = "Invalid value 'foo' for dtype"
15431543
with pytest.raises(err, match=msg):
15441544
data[:] = fill_value
15451545

0 commit comments

Comments (0)