From e379a2201e16fe39754af854643ba621d8903344 Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Wed, 16 Mar 2022 23:32:56 -0400
Subject: [PATCH 1/6] ArrowStringArray.__setitem__

---
 asv_bench/benchmarks/array.py                 | 28 ++++
 pandas/compat/__init__.py                     |  2 +
 pandas/core/arrays/string_arrow.py            | 141 +++++++++++++-----
 .../tests/arrays/string_/test_string_arrow.py | 65 ++++++++
 4 files changed, 202 insertions(+), 34 deletions(-)

diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index 103df0fd94847..f82928a8fced9 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -1,7 +1,10 @@
 import numpy as np
+import pyarrow as pa
 
 import pandas as pd
 
+from .pandas_vb_common import tm
+
 
 class BooleanArray:
     def setup(self):
@@ -39,3 +42,28 @@ def time_constructor(self):
 
     def time_from_integer_array(self):
         pd.array(self.values_integer, dtype="Int64")
+
+
+class ArrowStringArray:
+
+    params = [False, True]
+    param_names = ["multiple_chunks"]
+
+    def setup(self, multiple_chunks):
+        strings = tm.rands_array(3, 10_000)
+        if multiple_chunks:
+            chunks = [strings[i : i + 100] for i in range(0, len(strings), 100)]
+            self.array = pd.arrays.ArrowStringArray(pa.chunked_array(chunks))
+        else:
+            self.array = pd.arrays.ArrowStringArray(pa.array(strings))
+
+    def time_setitem(self, multiple_chunks):
+        for i in range(200):
+            self.array[i] = "foo"
+
+    def time_setitem_list(self, multiple_chunks):
+        indexer = list(range(0, 50)) + list(range(-50, 0))
+        self.array[indexer] = ["foo"] * len(indexer)
+
+    def time_setitem_slice(self, multiple_chunks):
+        self.array[::10] = "foo"
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index d0d7d6a1b8da6..5078d87bc91c7 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -22,6 +22,7 @@
     pa_version_under2p0,
     pa_version_under3p0,
     pa_version_under4p0,
+    pa_version_under5p0,
 )
 
 PY39 = sys.version_info >= (3, 9)
@@ -148,4 +149,5 @@ def get_lzma_file():
     "pa_version_under2p0",
     "pa_version_under3p0",
     "pa_version_under4p0",
+    "pa_version_under5p0",
 ]
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index e39ebd3afd2ff..2f2609b95c900 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -6,7 +6,6 @@
     TYPE_CHECKING,
     Any,
     Union,
-    cast,
     overload,
 )
 
@@ -31,6 +30,7 @@
     pa_version_under2p0,
     pa_version_under3p0,
     pa_version_under4p0,
+    pa_version_under5p0,
 )
 from pandas.util._decorators import doc
@@ -362,49 +362,122 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
         None
         """
         key = check_array_indexer(self, key)
+        value_is_scalar = is_scalar(value)
 
-        if is_integer(key):
-            key = cast(int, key)
-
-            if not is_scalar(value):
-                raise ValueError("Must pass scalars with scalar indexer")
-            elif isna(value):
+        # NA -> None
+        if value_is_scalar:
+            if isna(value):
                 value = None
             elif not isinstance(value, str):
                 raise ValueError("Scalar must be NA or str")
-
-            # Slice data and insert in-between
-            new_data = [
-                *self._data[0:key].chunks,
-                pa.array([value], type=pa.string()),
-                *self._data[(key + 1) :].chunks,
-            ]
-            self._data = pa.chunked_array(new_data)
         else:
-            # Convert to integer indices and iteratively assign.
-            # TODO: Make a faster variant of this in Arrow upstream.
-            # This is probably extremely slow.
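
For context on this removal: pyarrow arrays are immutable, so the fallback
below rebuilt the entire ChunkedArray once per assigned element. A
standalone sketch of that cost, with made-up sizes, mirroring the removed
slicing pattern:

    import pyarrow as pa

    arr = pa.chunked_array([pa.array([str(i) for i in range(10_000)])])
    for i in range(200):
        # each scalar assignment re-slices and re-concatenates all
        # 10_000 elements, which is what time_setitem above measures
        arr = pa.chunked_array(
            [*arr[:i].chunks, pa.array(["foo"]), *arr[i + 1 :].chunks]
        )
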
-
-            # Convert all possible input key types to an array of integers
-            if isinstance(key, slice):
-                key_array = np.array(range(len(self))[key])
-            elif is_bool_dtype(key):
-                # TODO(ARROW-9430): Directly support setitem(booleans)
-                key_array = np.argwhere(key).flatten()
-            else:
-                # TODO(ARROW-9431): Directly support setitem(integers)
-                key_array = np.asanyarray(key)
+            value = np.asarray(value)
+            value[isna(value)] = None
+
+        # reorder values to align with the mask positions
+        if is_bool_dtype(key):
+            pass
+        elif isinstance(key, slice):
+            if not value_is_scalar and key.step is not None and key.step < 0:
+                value = value[::-1]
+        else:
+            if not value_is_scalar:
+                key = np.asarray(key)
+                if len(key) != len(value):
+                    raise ValueError("Length of indexer and values mismatch")
+
+            if np.any(key < -len(self)):
+                min_key = np.asarray(key).min()
+                raise IndexError(
+                    f"index {min_key} is out of bounds for array of length {len(self)}"
+                )
+            if np.any(key >= len(self)):
+                max_key = np.asarray(key).max()
+                raise IndexError(
+                    f"index {max_key} is out of bounds for array of length {len(self)}"
+                )
 
-            if is_scalar(value):
-                value = np.broadcast_to(value, len(key_array))
-            else:
-                value = np.asarray(value)
+            # convert negative indices to positive before sorting
+            if is_integer(key):
+                if key < 0:
+                    key += len(self)
+            else:
+                key[key < 0] += len(self)
+                if not value_is_scalar:
+                    value = value[np.argsort(key)]
+
+        # fast path
+        if is_integer(key) and value_is_scalar and self._data.num_chunks == 1:
+            chunk = pa.concat_arrays(
+                [
+                    self._data.chunks[0][:key],
+                    pa.array([value], type=pa.string()),
+                    self._data.chunks[0][key + 1 :],
+                ]
+            )
+            self._data = pa.chunked_array([chunk])
+            return
 
-            if len(key_array) != len(value):
+        # create mask for positions to set
+        if is_bool_dtype(key):
+            mask = key
+        else:
+            mask = np.zeros(len(self), dtype=np.bool_)
+            mask[key] = True
+
+        if not value_is_scalar:
+            if len(value) != np.sum(mask):
                 raise ValueError("Length of indexer and values mismatch")
 
-            for k, v in zip(key_array, value):
-                self[k] = v
+        indices = mask.nonzero()[0]
+
+        # loop through the array chunks and set the new values while
+        # leaving the chunking layout unchanged
+        start = stop = 0
+        new_data = []
+
+        for chunk in self._data.iterchunks():
+            start, stop = stop, stop + len(chunk)
+
+            if len(indices) == 0 or indices[0] >= stop:
+                new_data.append(chunk)
+                continue
+
+            n = np.searchsorted(indices, np.intp(stop), side="left")
+            c_indices, indices = indices[:n], indices[n:]
+
+            if value_is_scalar:
+                c_value = value
+            else:
+                c_value, value = value[:n], value[n:]
+
+            if n == 1:
+                # fast path
+                idx = c_indices[0] - start
+                v = [c_value] if value_is_scalar else c_value
+                chunk = pa.concat_arrays(
+                    [
+                        chunk[:idx],
+                        pa.array(v, type=pa.string()),
+                        chunk[idx + 1 :],
+                    ]
+                )
+
+            elif n > 0:
+                submask = mask[start:stop]
+                if not pa_version_under5p0:
+                    chunk = pc.replace_with_mask(chunk, submask, c_value)
+                else:
+                    # The replace_with_mask compute function was added in
+                    # version 5.0. For prior versions we implement our own
+                    # by converting to numpy and back.
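+                    # Both branches fill the True slots of submask, in
+                    # order, with the replacement values. A sketch with
+                    # made-up values (the kernel call assumes pyarrow
+                    # >= 5.0):
+                    #
+                    #   chunk = pa.array(["a", "b", "c", "d"])
+                    #   submask = np.array([False, True, False, True])
+                    #   pc.replace_with_mask(
+                    #       chunk, submask, pa.array(["x", "y"])
+                    #   )
+                    #   -> ["a", "x", "c", "y"]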
+                    chunk = chunk.to_numpy(zero_copy_only=False)
+                    chunk[submask] = c_value
+                    chunk = pa.array(chunk, type=pa.string())
+
+            new_data.append(chunk)
+
+        self._data = pa.chunked_array(new_data)
 
     def take(
         self,
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index 265afa89d6530..de1b7a9c603af 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -132,3 +132,68 @@ def test_pyarrow_not_installed_raises():
 
     with pytest.raises(ImportError, match=msg):
         ArrowStringArray._from_sequence(["a", None, "b"])
+
+
+@skip_if_no_pyarrow
+@pytest.mark.parametrize("multiple_chunks", [False, True])
+@pytest.mark.parametrize(
+    "key, value, expected",
+    [
+        (-1, "XX", ["a", "b", "c", "d", "XX"]),
+        (1, "XX", ["a", "XX", "c", "d", "e"]),
+        (1, None, ["a", None, "c", "d", "e"]),
+        (1, pd.NA, ["a", None, "c", "d", "e"]),
+        ([1, 3], "XX", ["a", "XX", "c", "XX", "e"]),
+        ([1, 3], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
+        ([1, 3], ["XX", None], ["a", "XX", "c", None, "e"]),
+        ([1, 3], ["XX", pd.NA], ["a", "XX", "c", None, "e"]),
+        ([0, -1], ["XX", "YY"], ["XX", "b", "c", "d", "YY"]),
+        ([-1, 0], ["XX", "YY"], ["YY", "b", "c", "d", "XX"]),
+        (slice(3, None), "XX", ["a", "b", "c", "XX", "XX"]),
+        (slice(2, 4), ["XX", "YY"], ["a", "b", "XX", "YY", "e"]),
+        (slice(3, 1, -1), ["XX", "YY"], ["a", "b", "YY", "XX", "e"]),
+        (slice(None), "XX", ["XX", "XX", "XX", "XX", "XX"]),
+        ([False, True, False, True, False], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
+    ],
+)
+def test_setitem(multiple_chunks, key, value, expected):
+    import pyarrow as pa
+
+    result = pa.array(list("abcde"))
+    expected = pa.array(expected)
+
+    if multiple_chunks:
+        result = pa.chunked_array([result[:3], result[3:]])
+        expected = pa.chunked_array([expected[:3], expected[3:]])
+
+    result = ArrowStringArray(result)
+    expected = ArrowStringArray(expected)
+
+    result[key] = value
+    tm.assert_equal(result, expected)
+    assert result._data.num_chunks == expected._data.num_chunks
+
+
+@skip_if_no_pyarrow
+def test_setitem_invalid_indexer_raises():
+    import pyarrow as pa
+
+    arr = ArrowStringArray(pa.array(list("abcde")))
+
+    with pytest.raises(IndexError, match=None):
+        arr[5] = "foo"
+
+    with pytest.raises(IndexError, match=None):
+        arr[-6] = "foo"
+
+    with pytest.raises(IndexError, match=None):
+        arr[[0, 5]] = "foo"
+
+    with pytest.raises(IndexError, match=None):
+        arr[[0, -6]] = "foo"
+
+    with pytest.raises(IndexError, match=None):
+        arr[[True, True, False]] = "foo"
+
+    with pytest.raises(ValueError, match=None):
+        arr[[0, 1]] = ["foo", "bar", "baz"]

From 0e35f6a9e75596a67e03f4133714f97267faa08a Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Thu, 17 Mar 2022 11:00:26 -0400
Subject: [PATCH 2/6] fixes

---
 asv_bench/benchmarks/array.py      |  5 +++-
 pandas/core/arrays/string_arrow.py | 39 ++++++++++++++++++++++--------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index f82928a8fced9..b58200911749e 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -1,5 +1,4 @@
 import numpy as np
-import pyarrow as pa
 
 import pandas as pd
 
@@ -50,6 +49,10 @@ class ArrowStringArray:
     param_names = ["multiple_chunks"]
 
     def setup(self, multiple_chunks):
+        try:
+            import pyarrow as pa
+        except ImportError:
+            raise NotImplementedError
         strings = tm.rands_array(3, 10_000)
         if multiple_chunks:
             chunks = [strings[i : i + 100] for i in range(0, len(strings), 100)]
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 2f2609b95c900..a6ccd9e6d47e0 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -40,6 +40,7 @@
     is_dtype_equal,
     is_integer,
     is_integer_dtype,
+    is_list_like,
     is_object_dtype,
     is_scalar,
     is_string_dtype,
@@ -362,6 +363,12 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
         None
         """
         key = check_array_indexer(self, key)
+
+        if is_list_like(key):
+            key = np.asarray(key)
+            if len(key) == 1:
+                key = key[0]
+
         value_is_scalar = is_scalar(value)
 
         # NA -> None
@@ -371,8 +378,12 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
             elif not isinstance(value, str):
                 raise ValueError("Scalar must be NA or str")
         else:
-            value = np.asarray(value)
-            value[isna(value)] = None
+            value = np.asarray(value, dtype=object)
+            for i, v in enumerate(value):
+                if isna(v):
+                    value[i] = None
+                elif not isinstance(v, str):
+                    raise ValueError("Scalar must be NA or str")
 
         # reorder values to align with the mask positions
         if is_bool_dtype(key):
@@ -382,6 +393,8 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
                 value = value[::-1]
         else:
             if not value_is_scalar:
+                if is_scalar(key):
+                    raise ValueError("Length of indexer and values mismatch")
                 key = np.asarray(key)
                 if len(key) != len(value):
                     raise ValueError("Length of indexer and values mismatch")
@@ -402,25 +415,28 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
             if key < 0:
                 key += len(self)
         else:
+            key = np.asarray(key)
             key[key < 0] += len(self)
             if not value_is_scalar:
                 value = value[np.argsort(key)]
 
         # fast path
         if is_integer(key) and value_is_scalar and self._data.num_chunks == 1:
+            idx = int(key)  # type: ignore[arg-type]
             chunk = pa.concat_arrays(
                 [
-                    self._data.chunks[0][:key],
+                    self._data.chunks[0][:idx],
                     pa.array([value], type=pa.string()),
-                    self._data.chunks[0][key + 1 :],
+                    self._data.chunks[0][idx + 1 :],
                 ]
             )
             self._data = pa.chunked_array([chunk])
             return
 
         # create mask for positions to set
+        mask: npt.NDArray[np.bool_]
         if is_bool_dtype(key):
-            mask = key
+            mask = key  # type: ignore[assignment]
         else:
             mask = np.zeros(len(self), dtype=np.bool_)
             mask[key] = True
@@ -443,7 +459,7 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
                 new_data.append(chunk)
                 continue
 
-            n = np.searchsorted(indices, np.intp(stop), side="left")
+            n = int(np.searchsorted(indices, stop, side="left"))
             c_indices, indices = indices[:n], indices[n:]
 
             if value_is_scalar:
@@ -466,11 +482,14 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
             elif n > 0:
                 submask = mask[start:stop]
                 if not pa_version_under5p0:
-                    chunk = pc.replace_with_mask(chunk, submask, c_value)
+                    if c_value is None:
+                        chunk = pc.if_else(submask, c_value, chunk)
+                    else:
+                        chunk = pc.replace_with_mask(chunk, submask, c_value)
                 else:
-                    # The replace_with_mask compute function was added in
-                    # version 5.0. For prior versions we implement our own
-                    # by converting to numpy and back.
+                    # The pyarrow compute functions were added in
+                    # version 5.0. For prior versions we implement
+                    # our own by converting to numpy and back.
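+                    # The if_else branch above routes a None replacement
+                    # through a conditional select, nulling out the masked
+                    # slots rather than calling replace_with_mask with a
+                    # null scalar. A sketch with made-up values (assumes
+                    # pyarrow >= 5.0):
+                    #
+                    #   chunk = pa.array(["a", "b", "c"])
+                    #   submask = pa.array([False, True, False])
+                    #   pc.if_else(submask, None, chunk)
+                    #   -> ["a", null, "c"]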
                     chunk = chunk.to_numpy(zero_copy_only=False)
                     chunk[submask] = c_value
                     chunk = pa.array(chunk, type=pa.string())

From f2920540215ef90d1eae57fae4a050184bef8891 Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Thu, 17 Mar 2022 12:43:52 -0400
Subject: [PATCH 3/6] whatsnew

---
 doc/source/whatsnew/v1.5.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 089ba62e461d1..a16a615e70496 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -315,6 +315,7 @@ Performance improvements
 - Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
 - Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`)
 - Performance improvement in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when target is a :class:`MultiIndex` (:issue:`46235`)
+- Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
 - Performance improvement in :func:`factorize` (:issue:`46109`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)

From 773f375d62e2e2989d36388f349ced6d9252c33e Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Thu, 17 Mar 2022 13:00:09 -0400
Subject: [PATCH 4/6] fix test

---
 pandas/core/arrays/string_arrow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index a6ccd9e6d47e0..a3d26d63ff7a5 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -482,8 +482,8 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
             elif n > 0:
                 submask = mask[start:stop]
                 if not pa_version_under5p0:
-                    if c_value is None:
-                        chunk = pc.if_else(submask, c_value, chunk)
+                    if c_value is None or isna(np.array(c_value)).all():
+                        chunk = pc.if_else(submask, None, chunk)
                     else:
                         chunk = pc.replace_with_mask(chunk, submask, c_value)
                 else:

From f44bcbbf6ebbc60cf450335737626aba52363020 Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Fri, 18 Mar 2022 00:54:55 -0400
Subject: [PATCH 5/6] refactor

---
 pandas/core/arrays/string_arrow.py | 180 +++++++++++++----------------
 1 file changed, 82 insertions(+), 98 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index a3d26d63ff7a5..e64b1135c33b3 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -40,7 +40,6 @@
     is_dtype_equal,
     is_integer,
     is_integer_dtype,
-    is_list_like,
     is_object_dtype,
     is_scalar,
     is_string_dtype,
@@ -363,140 +362,125 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
         None
         """
         key = check_array_indexer(self, key)
+        indices = self._key_to_indices(key)
 
-        if is_list_like(key):
-            key = np.asarray(key)
-            if len(key) == 1:
-                key = key[0]
-
-        value_is_scalar = is_scalar(value)
-
-        # NA -> None
-        if value_is_scalar:
+        if is_scalar(value):
             if isna(value):
                 value = None
             elif not isinstance(value, str):
                 raise ValueError("Scalar must be NA or str")
+            value = np.broadcast_to(value, len(indices))
         else:
-            value = np.asarray(value, dtype=object)
+            value = np.array(value, dtype=object, copy=True)
             for i, v in enumerate(value):
                 if isna(v):
                     value[i] = None
                 elif not isinstance(v, str):
                     raise ValueError("Scalar must be NA or str")
 
-        # reorder values to align with the mask positions
-        if is_bool_dtype(key):
-            pass
-        elif isinstance(key, slice):
-            if not value_is_scalar and key.step is not None and key.step < 0:
-                value = value[::-1]
-        else:
-            if not value_is_scalar:
-                if is_scalar(key):
-                    raise ValueError("Length of indexer and values mismatch")
-                key = np.asarray(key)
-                if len(key) != len(value):
-                    raise ValueError("Length of indexer and values mismatch")
-
-            if np.any(key < -len(self)):
-                min_key = np.asarray(key).min()
-                raise IndexError(
-                    f"index {min_key} is out of bounds for array of length {len(self)}"
-                )
-            if np.any(key >= len(self)):
-                max_key = np.asarray(key).max()
-                raise IndexError(
-                    f"index {max_key} is out of bounds for array of length {len(self)}"
-                )
+        if len(indices) != len(value):
+            raise ValueError("Length of indexer and values mismatch")
 
-            # convert negative indices to positive before sorting
-            if is_integer(key):
-                if key < 0:
-                    key += len(self)
-            else:
-                key = np.asarray(key)
-                key[key < 0] += len(self)
-                if not value_is_scalar:
-                    value = value[np.argsort(key)]
+        argsort = np.argsort(indices)
+        indices = indices[argsort]
+        value = value[argsort]
 
-        # fast path
-        if is_integer(key) and value_is_scalar and self._data.num_chunks == 1:
-            idx = int(key)  # type: ignore[arg-type]
-            chunk = pa.concat_arrays(
-                [
-                    self._data.chunks[0][:idx],
-                    pa.array([value], type=pa.string()),
-                    self._data.chunks[0][idx + 1 :],
-                ]
-            )
-            self._data = pa.chunked_array([chunk])
-            return
+        self._data = self._set_via_chunk_iteration(indices=indices, value=value)
 
-        # create mask for positions to set
-        mask: npt.NDArray[np.bool_]
-        if is_bool_dtype(key):
-            mask = key  # type: ignore[assignment]
-        else:
-            mask = np.zeros(len(self), dtype=np.bool_)
-            mask[key] = True
+    def _key_to_indices(self, key: int | slice | np.ndarray) -> npt.NDArray[np.intp]:
+        """Convert indexing key for self to positional indices."""
+        if isinstance(key, slice):
+            indices = np.arange(len(self))[key]
+        elif is_bool_dtype(key):
+            key = np.asarray(key)
+            if len(key) != len(self):
+                raise ValueError("Length of indexer and values mismatch")
+            indices = key.nonzero()[0]
+        else:
+            key_arr = np.array([key]) if is_integer(key) else np.asarray(key)
+            indices = np.arange(len(self))[key_arr]
+        return indices
 
-        if not value_is_scalar:
-            if len(value) != np.sum(mask):
-                raise ValueError("Length of indexer and values mismatch")
-
-        indices = mask.nonzero()[0]
-
-        # loop through the array chunks and set the new values while
-        # leaving the chunking layout unchanged
+    def _set_via_chunk_iteration(
+        self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any]
+    ) -> pa.ChunkedArray:
+        """
+        Loop through the array chunks and set the new values while
+        leaving the chunking layout unchanged.
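+
+        For example (made-up data; the preserved layout is what the
+        tests assert):
+
+        >>> arr = ArrowStringArray(pa.chunked_array([["a", "b"], ["c", "d"]]))
+        >>> arr[[0, 3]] = ["x", "y"]
+        >>> arr._data.num_chunks
+        2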
+ """ - # loop through the array chunks and set the new values while - # leaving the chunking layout unchanged - start = stop = 0 + chunk_indices = self._within_chunk_indices(indices) new_data = [] - for chunk in self._data.iterchunks(): - start, stop = stop, stop + len(chunk) + for i, chunk in enumerate(self._data.iterchunks()): - if len(indices) == 0 or indices[0] >= stop: - new_data.append(chunk) - continue - - n = int(np.searchsorted(indices, stop, side="left")) - c_indices, indices = indices[:n], indices[n:] - - if value_is_scalar: - c_value = value - else: - c_value, value = value[:n], value[n:] + c_ind = chunk_indices[i] + n = len(c_ind) + c_value, value = value[:n], value[n:] if n == 1: # fast path - idx = c_indices[0] - start - v = [c_value] if value_is_scalar else c_value - chunk = pa.concat_arrays( - [ - chunk[:idx], - pa.array(v, type=pa.string()), - chunk[idx + 1 :], - ] - ) - + chunk = self._set_single_index_in_chunk(chunk, c_ind[0], c_value[0]) elif n > 0: - submask = mask[start:stop] + mask = np.zeros(len(chunk), dtype=np.bool_) + mask[c_ind] = True if not pa_version_under5p0: if c_value is None or isna(np.array(c_value)).all(): - chunk = pc.if_else(submask, None, chunk) + chunk = pc.if_else(mask, None, chunk) else: - chunk = pc.replace_with_mask(chunk, submask, c_value) + chunk = pc.replace_with_mask(chunk, mask, c_value) else: # The pyarrow compute functions were added in # version 5.0. For prior versions we implement # our own by converting to numpy and back. chunk = chunk.to_numpy(zero_copy_only=False) - chunk[submask] = c_value + chunk[mask] = c_value chunk = pa.array(chunk, type=pa.string()) new_data.append(chunk) - self._data = pa.chunked_array(new_data) + return pa.chunked_array(new_data) + + @staticmethod + def _set_single_index_in_chunk(chunk: pa.Array, index: int, value: Any) -> pa.Array: + """Set a single position in a pyarrow array.""" + assert is_scalar(value) + return pa.concat_arrays( + [ + chunk[:index], + pa.array([value], type=pa.string()), + chunk[index + 1 :], + ] + ) + + def _within_chunk_indices( + self, indices: npt.NDArray[np.intp] + ) -> list[npt.NDArray[np.intp]]: + """ + Convert a list of indices for self into a list of tuples each containing + the indices within each chunk of the chunked array. + """ + # indices must be sorted + chunk_indices = [] + for start, stop in self._chunk_ranges(): + if len(indices) == 0 or indices[0] >= stop: + c_ind = np.array([], dtype=np.intp) + else: + n = int(np.searchsorted(indices, stop, side="left")) + c_ind = indices[:n] - start + indices = indices[n:] + chunk_indices.append(c_ind) + return chunk_indices + + def _chunk_ranges(self) -> list[tuple]: + """ + Return a list of tuples each containing the left (inclusive) + and right (exclusive) bounds of each chunk. 
+ """ + lengths = [len(c) for c in self._data.iterchunks()] + stops = np.cumsum(lengths) + starts = np.concatenate([[0], stops[:-1]]) + return list(zip(starts, stops)) def take( self, From 76a25a98f775febcce55c38afc88dce3e4ec5204 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 18 Mar 2022 01:06:24 -0400 Subject: [PATCH 6/6] fix docstring --- pandas/core/arrays/string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e64b1135c33b3..9b582a4e911a8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -457,8 +457,8 @@ def _within_chunk_indices( self, indices: npt.NDArray[np.intp] ) -> list[npt.NDArray[np.intp]]: """ - Convert a list of indices for self into a list of tuples each containing - the indices within each chunk of the chunked array. + Convert indices for self into a list of ndarrays each containing + the indices *within* each chunk of the chunked array. """ # indices must be sorted chunk_indices = []