From e379a2201e16fe39754af854643ba621d8903344 Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Wed, 16 Mar 2022 23:32:56 -0400
Subject: [PATCH 1/6] ArrowStringArray.__setitem__

---
 asv_bench/benchmarks/array.py                 | 28 ++++
 pandas/compat/__init__.py                     |  2 +
 pandas/core/arrays/string_arrow.py            | 141 +++++++++++++-----
 .../tests/arrays/string_/test_string_arrow.py | 65 ++++++++
 4 files changed, 202 insertions(+), 34 deletions(-)

diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index 103df0fd94847..f82928a8fced9 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -1,7 +1,10 @@
 import numpy as np
+import pyarrow as pa
 
 import pandas as pd
 
+from .pandas_vb_common import tm
+
 
 class BooleanArray:
     def setup(self):
@@ -39,3 +42,28 @@ def time_constructor(self):
 
     def time_from_integer_array(self):
         pd.array(self.values_integer, dtype="Int64")
+
+
+class ArrowStringArray:
+
+    params = [False, True]
+    param_names = ["multiple_chunks"]
+
+    def setup(self, multiple_chunks):
+        strings = tm.rands_array(3, 10_000)
+        if multiple_chunks:
+            chunks = [strings[i : i + 100] for i in range(0, len(strings), 100)]
+            self.array = pd.arrays.ArrowStringArray(pa.chunked_array(chunks))
+        else:
+            self.array = pd.arrays.ArrowStringArray(pa.array(strings))
+
+    def time_setitem(self, multiple_chunks):
+        for i in range(200):
+            self.array[i] = "foo"
+
+    def time_setitem_list(self, multiple_chunks):
+        indexer = list(range(0, 50)) + list(range(-50, 0))
+        self.array[indexer] = ["foo"] * len(indexer)
+
+    def time_setitem_slice(self, multiple_chunks):
+        self.array[::10] = "foo"
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index d0d7d6a1b8da6..5078d87bc91c7 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -22,6 +22,7 @@
     pa_version_under2p0,
     pa_version_under3p0,
     pa_version_under4p0,
+    pa_version_under5p0,
 )
 
 PY39 = sys.version_info >= (3, 9)
@@ -148,4 +149,5 @@ def get_lzma_file():
     "pa_version_under2p0",
     "pa_version_under3p0",
     "pa_version_under4p0",
+    "pa_version_under5p0",
 ]
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index e39ebd3afd2ff..2f2609b95c900 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -6,7 +6,6 @@
     TYPE_CHECKING,
     Any,
     Union,
-    cast,
     overload,
 )
 
@@ -31,6 +30,7 @@
     pa_version_under2p0,
     pa_version_under3p0,
     pa_version_under4p0,
+    pa_version_under5p0,
 )
 from pandas.util._decorators import doc
@@ -362,49 +362,122 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
         None
         """
         key = check_array_indexer(self, key)
+        value_is_scalar = is_scalar(value)
 
-        if is_integer(key):
-            key = cast(int, key)
-
-            if not is_scalar(value):
-                raise ValueError("Must pass scalars with scalar indexer")
-            elif isna(value):
+        # NA -> None
+        if value_is_scalar:
+            if isna(value):
                 value = None
             elif not isinstance(value, str):
                 raise ValueError("Scalar must be NA or str")
-
-            # Slice data and insert in-between
-            new_data = [
-                *self._data[0:key].chunks,
-                pa.array([value], type=pa.string()),
-                *self._data[(key + 1) :].chunks,
-            ]
-            self._data = pa.chunked_array(new_data)
         else:
-            # Convert to integer indices and iteratively assign.
-            # TODO: Make a faster variant of this in Arrow upstream.
-            # This is probably extremely slow.
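
For context on this removal: pyarrow arrays are immutable, so the fallback
below rebuilt the entire ChunkedArray once per assigned element. A
standalone sketch of that cost, with made-up sizes, mirroring the removed
slicing pattern:

    import pyarrow as pa

    arr = pa.chunked_array([pa.array([str(i) for i in range(10_000)])])
    for i in range(200):
        # each scalar assignment re-slices and re-concatenates all
        # 10_000 elements, which is what time_setitem above measures
        arr = pa.chunked_array(
            [*arr[:i].chunks, pa.array(["foo"]), *arr[i + 1 :].chunks]
        )
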
-
-            # Convert all possible input key types to an array of integers
-            if isinstance(key, slice):
-                key_array = np.array(range(len(self))[key])
-            elif is_bool_dtype(key):
-                # TODO(ARROW-9430): Directly support setitem(booleans)
-                key_array = np.argwhere(key).flatten()
-            else:
-                # TODO(ARROW-9431): Directly support setitem(integers)
-                key_array = np.asanyarray(key)
+            value = np.asarray(value)
+            value[isna(value)] = None
+
+        # reorder values to align with the mask positions
+        if is_bool_dtype(key):
+            pass
+        elif isinstance(key, slice):
+            if not value_is_scalar and key.step is not None and key.step < 0:
+                value = value[::-1]
+        else:
+            if not value_is_scalar:
+                key = np.asarray(key)
+                if len(key) != len(value):
+                    raise ValueError("Length of indexer and values mismatch")
+
+            if np.any(key < -len(self)):
+                min_key = np.asarray(key).min()
+                raise IndexError(
+                    f"index {min_key} is out of bounds for array of length {len(self)}"
+                )
+            if np.any(key >= len(self)):
+                max_key = np.asarray(key).max()
+                raise IndexError(
+                    f"index {max_key} is out of bounds for array of length {len(self)}"
+                )
 
-            if is_scalar(value):
-                value = np.broadcast_to(value, len(key_array))
-            else:
-                value = np.asarray(value)
+            # convert negative indices to positive before sorting
+            if is_integer(key):
+                if key < 0:
+                    key += len(self)
+            else:
+                key[key < 0] += len(self)
+                if not value_is_scalar:
+                    value = value[np.argsort(key)]
+
+        # fast path
+        if is_integer(key) and value_is_scalar and self._data.num_chunks == 1:
+            chunk = pa.concat_arrays(
+                [
+                    self._data.chunks[0][:key],
+                    pa.array([value], type=pa.string()),
+                    self._data.chunks[0][key + 1 :],
+                ]
+            )
+            self._data = pa.chunked_array([chunk])
+            return
 
-            if len(key_array) != len(value):
+        # create mask for positions to set
+        if is_bool_dtype(key):
+            mask = key
+        else:
+            mask = np.zeros(len(self), dtype=np.bool_)
+            mask[key] = True
+
+        if not value_is_scalar:
+            if len(value) != np.sum(mask):
                 raise ValueError("Length of indexer and values mismatch")
 
-            for k, v in zip(key_array, value):
-                self[k] = v
+        indices = mask.nonzero()[0]
+
+        # loop through the array chunks and set the new values while
+        # leaving the chunking layout unchanged
+        start = stop = 0
+        new_data = []
+
+        for chunk in self._data.iterchunks():
+            start, stop = stop, stop + len(chunk)
+
+            if len(indices) == 0 or indices[0] >= stop:
+                new_data.append(chunk)
+                continue
+
+            n = np.searchsorted(indices, np.intp(stop), side="left")
+            c_indices, indices = indices[:n], indices[n:]
+
+            if value_is_scalar:
+                c_value = value
+            else:
+                c_value, value = value[:n], value[n:]
+
+            if n == 1:
+                # fast path
+                idx = c_indices[0] - start
+                v = [c_value] if value_is_scalar else c_value
+                chunk = pa.concat_arrays(
+                    [
+                        chunk[:idx],
+                        pa.array(v, type=pa.string()),
+                        chunk[idx + 1 :],
+                    ]
+                )
+
+            elif n > 0:
+                submask = mask[start:stop]
+                if not pa_version_under5p0:
+                    chunk = pc.replace_with_mask(chunk, submask, c_value)
+                else:
+                    # The replace_with_mask compute function was added in
+                    # version 5.0. For prior versions we implement our own
+                    # by converting to numpy and back.
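+                    # Both branches fill the True slots of submask, in
+                    # order, with the replacement values. A sketch with
+                    # made-up values (the kernel call assumes pyarrow
+                    # >= 5.0):
+                    #
+                    #   chunk = pa.array(["a", "b", "c", "d"])
+                    #   submask = np.array([False, True, False, True])
+                    #   pc.replace_with_mask(
+                    #       chunk, submask, pa.array(["x", "y"])
+                    #   )
+                    #   -> ["a", "x", "c", "y"]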
+                    chunk = chunk.to_numpy(zero_copy_only=False)
+                    chunk[submask] = c_value
+                    chunk = pa.array(chunk, type=pa.string())
+
+            new_data.append(chunk)
+
+        self._data = pa.chunked_array(new_data)
 
     def take(
         self,
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index 265afa89d6530..de1b7a9c603af 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -132,3 +132,68 @@ def test_pyarrow_not_installed_raises():
 
     with pytest.raises(ImportError, match=msg):
         ArrowStringArray._from_sequence(["a", None, "b"])
+
+
+@skip_if_no_pyarrow
+@pytest.mark.parametrize("multiple_chunks", [False, True])
+@pytest.mark.parametrize(
+    "key, value, expected",
+    [
+        (-1, "XX", ["a", "b", "c", "d", "XX"]),
+        (1, "XX", ["a", "XX", "c", "d", "e"]),
+        (1, None, ["a", None, "c", "d", "e"]),
+        (1, pd.NA, ["a", None, "c", "d", "e"]),
+        ([1, 3], "XX", ["a", "XX", "c", "XX", "e"]),
+        ([1, 3], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
+        ([1, 3], ["XX", None], ["a", "XX", "c", None, "e"]),
+        ([1, 3], ["XX", pd.NA], ["a", "XX", "c", None, "e"]),
+        ([0, -1], ["XX", "YY"], ["XX", "b", "c", "d", "YY"]),
+        ([-1, 0], ["XX", "YY"], ["YY", "b", "c", "d", "XX"]),
+        (slice(3, None), "XX", ["a", "b", "c", "XX", "XX"]),
+        (slice(2, 4), ["XX", "YY"], ["a", "b", "XX", "YY", "e"]),
+        (slice(3, 1, -1), ["XX", "YY"], ["a", "b", "YY", "XX", "e"]),
+        (slice(None), "XX", ["XX", "XX", "XX", "XX", "XX"]),
+        ([False, True, False, True, False], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
+    ],
+)
+def test_setitem(multiple_chunks, key, value, expected):
+    import pyarrow as pa
+
+    result = pa.array(list("abcde"))
+    expected = pa.array(expected)
+
+    if multiple_chunks:
+        result = pa.chunked_array([result[:3], result[3:]])
+        expected = pa.chunked_array([expected[:3], expected[3:]])
+
+    result = ArrowStringArray(result)
+    expected = ArrowStringArray(expected)
+
+    result[key] = value
+    tm.assert_equal(result, expected)
+    assert result._data.num_chunks == expected._data.num_chunks
+
+
+@skip_if_no_pyarrow
+def test_setitem_invalid_indexer_raises():
+    import pyarrow as pa
+
+    arr = ArrowStringArray(pa.array(list("abcde")))
+
+    with pytest.raises(IndexError, match=None):
+        arr[5] = "foo"
+
+    with pytest.raises(IndexError, match=None):
+        arr[-6] = "foo"
+
+    with pytest.raises(IndexError, match=None):
+        arr[[0, 5]] = "foo"
+
+    with pytest.raises(IndexError, match=None):
+        arr[[0, -6]] = "foo"
+
+    with pytest.raises(IndexError, match=None):
+        arr[[True, True, False]] = "foo"
+
+    with pytest.raises(ValueError, match=None):
+        arr[[0, 1]] = ["foo", "bar", "baz"]

From 0e35f6a9e75596a67e03f4133714f97267faa08a Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Thu, 17 Mar 2022 11:00:26 -0400
Subject: [PATCH 2/6] fixes

---
 asv_bench/benchmarks/array.py      |  5 +++-
 pandas/core/arrays/string_arrow.py | 39 ++++++++++++++++++++++--------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index f82928a8fced9..b58200911749e 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -1,5 +1,4 @@
 import numpy as np
-import pyarrow as pa
 
 import pandas as pd
 
@@ -50,6 +49,10 @@ class ArrowStringArray:
     param_names = ["multiple_chunks"]
 
     def setup(self, multiple_chunks):
+        try:
+            import pyarrow as pa
+        except ImportError:
+            raise NotImplementedError
         strings = tm.rands_array(3, 10_000)
         if multiple_chunks:
             chunks = [strings[i : i + 100] for i in range(0, len(strings), 100)]
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 2f2609b95c900..a6ccd9e6d47e0 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -40,6 +40,7 @@
     is_dtype_equal,
     is_integer,
     is_integer_dtype,
+    is_list_like,
     is_object_dtype,
     is_scalar,
     is_string_dtype,
@@ -362,6 +363,12 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
         None
         """
         key = check_array_indexer(self, key)
+
+        if is_list_like(key):
+            key = np.asarray(key)
+            if len(key) == 1:
+                key = key[0]
+
         value_is_scalar = is_scalar(value)
 
         # NA -> None
@@ -371,8 +378,12 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
             elif not isinstance(value, str):
                 raise ValueError("Scalar must be NA or str")
         else:
-            value = np.asarray(value)
-            value[isna(value)] = None
+            value = np.asarray(value, dtype=object)
+            for i, v in enumerate(value):
+                if isna(v):
+                    value[i] = None
+                elif not isinstance(v, str):
+                    raise ValueError("Scalar must be NA or str")
 
         # reorder values to align with the mask positions
         if is_bool_dtype(key):
@@ -382,6 +393,8 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
                 value = value[::-1]
         else:
             if not value_is_scalar:
+                if is_scalar(key):
+                    raise ValueError("Length of indexer and values mismatch")
                 key = np.asarray(key)
                 if len(key) != len(value):
                     raise ValueError("Length of indexer and values mismatch")
@@ -402,25 +415,28 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
             if key < 0:
                 key += len(self)
         else:
+            key = np.asarray(key)
             key[key < 0] += len(self)
             if not value_is_scalar:
                 value = value[np.argsort(key)]
 
         # fast path
         if is_integer(key) and value_is_scalar and self._data.num_chunks == 1:
+            idx = int(key)  # type: ignore[arg-type]
             chunk = pa.concat_arrays(
                 [
-                    self._data.chunks[0][:key],
+                    self._data.chunks[0][:idx],
                     pa.array([value], type=pa.string()),
-                    self._data.chunks[0][key + 1 :],
+                    self._data.chunks[0][idx + 1 :],
                 ]
             )
             self._data = pa.chunked_array([chunk])
             return
 
         # create mask for positions to set
+        mask: npt.NDArray[np.bool_]
         if is_bool_dtype(key):
-            mask = key
+            mask = key  # type: ignore[assignment]
         else:
             mask = np.zeros(len(self), dtype=np.bool_)
             mask[key] = True
@@ -443,7 +459,7 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
                 new_data.append(chunk)
                 continue
 
-            n = np.searchsorted(indices, np.intp(stop), side="left")
+            n = int(np.searchsorted(indices, stop, side="left"))
             c_indices, indices = indices[:n], indices[n:]
 
             if value_is_scalar:
@@ -466,11 +482,14 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
             elif n > 0:
                 submask = mask[start:stop]
                 if not pa_version_under5p0:
-                    chunk = pc.replace_with_mask(chunk, submask, c_value)
+                    if c_value is None:
+                        chunk = pc.if_else(submask, c_value, chunk)
+                    else:
+                        chunk = pc.replace_with_mask(chunk, submask, c_value)
                 else:
-                    # The replace_with_mask compute function was added in
-                    # version 5.0. For prior versions we implement our own
-                    # by converting to numpy and back.
+                    # The pyarrow compute functions were added in
+                    # version 5.0. For prior versions we implement
+                    # our own by converting to numpy and back.
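+                    # The if_else branch above routes a None replacement
+                    # through a conditional select, nulling out the masked
+                    # slots rather than calling replace_with_mask with a
+                    # null scalar. A sketch with made-up values (assumes
+                    # pyarrow >= 5.0):
+                    #
+                    #   chunk = pa.array(["a", "b", "c"])
+                    #   submask = pa.array([False, True, False])
+                    #   pc.if_else(submask, None, chunk)
+                    #   -> ["a", null, "c"]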
                     chunk = chunk.to_numpy(zero_copy_only=False)
                     chunk[submask] = c_value
                     chunk = pa.array(chunk, type=pa.string())

From f2920540215ef90d1eae57fae4a050184bef8891 Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Thu, 17 Mar 2022 12:43:52 -0400
Subject: [PATCH 3/6] whatsnew

---
 doc/source/whatsnew/v1.5.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 089ba62e461d1..a16a615e70496 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -315,6 +315,7 @@ Performance improvements
 - Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
 - Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`)
 - Performance improvement in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when target is a :class:`MultiIndex` (:issue:`46235`)
+- Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
 - Performance improvement in :func:`factorize` (:issue:`46109`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)

From 773f375d62e2e2989d36388f349ced6d9252c33e Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Thu, 17 Mar 2022 13:00:09 -0400
Subject: [PATCH 4/6] fix test

---
 pandas/core/arrays/string_arrow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index a6ccd9e6d47e0..a3d26d63ff7a5 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -482,8 +482,8 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
             elif n > 0:
                 submask = mask[start:stop]
                 if not pa_version_under5p0:
-                    if c_value is None:
-                        chunk = pc.if_else(submask, c_value, chunk)
+                    if c_value is None or isna(np.array(c_value)).all():
+                        chunk = pc.if_else(submask, None, chunk)
                     else:
                         chunk = pc.replace_with_mask(chunk, submask, c_value)
                 else:

From f44bcbbf6ebbc60cf450335737626aba52363020 Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Fri, 18 Mar 2022 00:54:55 -0400
Subject: [PATCH 5/6] refactor

---
 pandas/core/arrays/string_arrow.py | 180 +++++++++++++----------------
 1 file changed, 82 insertions(+), 98 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index a3d26d63ff7a5..e64b1135c33b3 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -40,7 +40,6 @@
     is_dtype_equal,
     is_integer,
     is_integer_dtype,
-    is_list_like,
     is_object_dtype,
     is_scalar,
     is_string_dtype,
@@ -363,140 +362,125 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
         None
         """
         key = check_array_indexer(self, key)
+        indices = self._key_to_indices(key)
 
-        if is_list_like(key):
-            key = np.asarray(key)
-            if len(key) == 1:
-                key = key[0]
-
-        value_is_scalar = is_scalar(value)
-
-        # NA -> None
-        if value_is_scalar:
+        if is_scalar(value):
             if isna(value):
                 value = None
             elif not isinstance(value, str):
                 raise ValueError("Scalar must be NA or str")
+            value = np.broadcast_to(value, len(indices))
         else:
-            value = np.asarray(value, dtype=object)
+            value = np.array(value, dtype=object, copy=True)
             for i, v in enumerate(value):
                 if isna(v):
                     value[i] = None
                 elif not isinstance(v, str):
                     raise ValueError("Scalar must be NA or str")
 
-        # reorder values to align with the mask positions
-        if is_bool_dtype(key):
-            pass
-        elif isinstance(key, slice):
-            if not value_is_scalar and key.step is not None and key.step < 0:
-                value = value[::-1]
-        else:
-            if not value_is_scalar:
-                if is_scalar(key):
-                    raise ValueError("Length of indexer and values mismatch")
-                key = np.asarray(key)
-                if len(key) != len(value):
-                    raise ValueError("Length of indexer and values mismatch")
-
-            if np.any(key < -len(self)):
-                min_key = np.asarray(key).min()
-                raise IndexError(
-                    f"index {min_key} is out of bounds for array of length {len(self)}"
-                )
-            if np.any(key >= len(self)):
-                max_key = np.asarray(key).max()
-                raise IndexError(
-                    f"index {max_key} is out of bounds for array of length {len(self)}"
-                )
+        if len(indices) != len(value):
+            raise ValueError("Length of indexer and values mismatch")
 
-            # convert negative indices to positive before sorting
-            if is_integer(key):
-                if key < 0:
-                    key += len(self)
-            else:
-                key = np.asarray(key)
-                key[key < 0] += len(self)
-                if not value_is_scalar:
-                    value = value[np.argsort(key)]
+        argsort = np.argsort(indices)
+        indices = indices[argsort]
+        value = value[argsort]
 
-        # fast path
-        if is_integer(key) and value_is_scalar and self._data.num_chunks == 1:
-            idx = int(key)  # type: ignore[arg-type]
-            chunk = pa.concat_arrays(
-                [
-                    self._data.chunks[0][:idx],
-                    pa.array([value], type=pa.string()),
-                    self._data.chunks[0][idx + 1 :],
-                ]
-            )
-            self._data = pa.chunked_array([chunk])
-            return
+        self._data = self._set_via_chunk_iteration(indices=indices, value=value)
 
-        # create mask for positions to set
-        mask: npt.NDArray[np.bool_]
-        if is_bool_dtype(key):
-            mask = key  # type: ignore[assignment]
-        else:
-            mask = np.zeros(len(self), dtype=np.bool_)
-            mask[key] = True
+    def _key_to_indices(self, key: int | slice | np.ndarray) -> npt.NDArray[np.intp]:
+        """Convert indexing key for self to positional indices."""
+        if isinstance(key, slice):
+            indices = np.arange(len(self))[key]
+        elif is_bool_dtype(key):
+            key = np.asarray(key)
+            if len(key) != len(self):
+                raise ValueError("Length of indexer and values mismatch")
+            indices = key.nonzero()[0]
+        else:
+            key_arr = np.array([key]) if is_integer(key) else np.asarray(key)
+            indices = np.arange(len(self))[key_arr]
+        return indices
 
-        if not value_is_scalar:
-            if len(value) != np.sum(mask):
-                raise ValueError("Length of indexer and values mismatch")
-
-        indices = mask.nonzero()[0]
-
-        # loop through the array chunks and set the new values while
-        # leaving the chunking layout unchanged
+    def _set_via_chunk_iteration(
+        self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any]
+    ) -> pa.ChunkedArray:
+        """
+        Loop through the array chunks and set the new values while
+        leaving the chunking layout unchanged.
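+
+        For example (made-up data; the preserved layout is what the
+        tests assert):
+
+        >>> arr = ArrowStringArray(pa.chunked_array([["a", "b"], ["c", "d"]]))
+        >>> arr[[0, 3]] = ["x", "y"]
+        >>> arr._data.num_chunks
+        2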
+ """ - # loop through the array chunks and set the new values while - # leaving the chunking layout unchanged - start = stop = 0 + chunk_indices = self._within_chunk_indices(indices) new_data = [] - for chunk in self._data.iterchunks(): - start, stop = stop, stop + len(chunk) + for i, chunk in enumerate(self._data.iterchunks()): - if len(indices) == 0 or indices[0] >= stop: - new_data.append(chunk) - continue - - n = int(np.searchsorted(indices, stop, side="left")) - c_indices, indices = indices[:n], indices[n:] - - if value_is_scalar: - c_value = value - else: - c_value, value = value[:n], value[n:] + c_ind = chunk_indices[i] + n = len(c_ind) + c_value, value = value[:n], value[n:] if n == 1: # fast path - idx = c_indices[0] - start - v = [c_value] if value_is_scalar else c_value - chunk = pa.concat_arrays( - [ - chunk[:idx], - pa.array(v, type=pa.string()), - chunk[idx + 1 :], - ] - ) - + chunk = self._set_single_index_in_chunk(chunk, c_ind[0], c_value[0]) elif n > 0: - submask = mask[start:stop] + mask = np.zeros(len(chunk), dtype=np.bool_) + mask[c_ind] = True if not pa_version_under5p0: if c_value is None or isna(np.array(c_value)).all(): - chunk = pc.if_else(submask, None, chunk) + chunk = pc.if_else(mask, None, chunk) else: - chunk = pc.replace_with_mask(chunk, submask, c_value) + chunk = pc.replace_with_mask(chunk, mask, c_value) else: # The pyarrow compute functions were added in # version 5.0. For prior versions we implement # our own by converting to numpy and back. chunk = chunk.to_numpy(zero_copy_only=False) - chunk[submask] = c_value + chunk[mask] = c_value chunk = pa.array(chunk, type=pa.string()) new_data.append(chunk) - self._data = pa.chunked_array(new_data) + return pa.chunked_array(new_data) + + @staticmethod + def _set_single_index_in_chunk(chunk: pa.Array, index: int, value: Any) -> pa.Array: + """Set a single position in a pyarrow array.""" + assert is_scalar(value) + return pa.concat_arrays( + [ + chunk[:index], + pa.array([value], type=pa.string()), + chunk[index + 1 :], + ] + ) + + def _within_chunk_indices( + self, indices: npt.NDArray[np.intp] + ) -> list[npt.NDArray[np.intp]]: + """ + Convert a list of indices for self into a list of tuples each containing + the indices within each chunk of the chunked array. + """ + # indices must be sorted + chunk_indices = [] + for start, stop in self._chunk_ranges(): + if len(indices) == 0 or indices[0] >= stop: + c_ind = np.array([], dtype=np.intp) + else: + n = int(np.searchsorted(indices, stop, side="left")) + c_ind = indices[:n] - start + indices = indices[n:] + chunk_indices.append(c_ind) + return chunk_indices + + def _chunk_ranges(self) -> list[tuple]: + """ + Return a list of tuples each containing the left (inclusive) + and right (exclusive) bounds of each chunk. 
+ """ + lengths = [len(c) for c in self._data.iterchunks()] + stops = np.cumsum(lengths) + starts = np.concatenate([[0], stops[:-1]]) + return list(zip(starts, stops)) def take( self, From 76a25a98f775febcce55c38afc88dce3e4ec5204 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 18 Mar 2022 01:06:24 -0400 Subject: [PATCH 6/6] fix docstring --- pandas/core/arrays/string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e64b1135c33b3..9b582a4e911a8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -457,8 +457,8 @@ def _within_chunk_indices( self, indices: npt.NDArray[np.intp] ) -> list[npt.NDArray[np.intp]]: """ - Convert a list of indices for self into a list of tuples each containing - the indices within each chunk of the chunked array. + Convert indices for self into a list of ndarrays each containing + the indices *within* each chunk of the chunked array. """ # indices must be sorted chunk_indices = []