From ff6e271961434063b277e723e918cc7f3cbc5dea Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 4 Nov 2023 22:47:43 -0400 Subject: [PATCH 1/8] BUG: Index.getitem returning wrong result with negative step for arrow --- doc/source/whatsnew/v2.1.3.rst | 2 +- pandas/core/arrays/arrow/array.py | 7 +++++++ pandas/tests/indexes/object/test_indexing.py | 14 +++++++++++--- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 3b1cd1c152baa..f4c32b6ecc056 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -22,7 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) -- +- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) .. --------------------------------------------------------------------------- .. _whatsnew_213.other: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4bcc03643dac8..43927f554a875 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -553,6 +553,13 @@ def __getitem__(self, item: PositionalIndexer): ) # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. + if isinstance(item, slice): + if item.start == item.stop: + pass + elif item.start == -len(self) - 1: + item = slice(None, item.stop, item.step) + elif item.stop == -len(self) - 1: + item = slice(item.start, None, item.step) value = self._pa_array[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 87d3afc77d556..7a6e93a4605d7 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.missing import is_matching_na +import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -144,6 +145,13 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: + @pytest.mark.parametrize( + "dtype", + [ + "object", + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + ], + ) @pytest.mark.parametrize( "in_slice,expected", [ @@ -167,12 +175,12 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected): - index = Index(list("bcdxy")) + def test_slice_locs_negative_step(self, in_slice, expected, dtype): + index = Index(list("bcdxy"), dtype=dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected)) + expected = Index(list(expected), dtype=dtype) tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): From 7684911a70609559f7efcb84c73662ba0579a169 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 5 Nov 2023 09:18:25 -0500 Subject: [PATCH 2/8] Update --- pandas/core/arrays/arrow/array.py | 5 +++-- pandas/tests/indexes/object/test_indexing.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 43927f554a875..e073b8c20acbe 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,10 +556,11 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.start == -len(self) - 1: + elif item.start <= -len(self) - 1: item = slice(None, item.stop, item.step) - elif item.stop == -len(self) - 1: + elif item.stop <= -len(self) - 1: item = slice(item.start, None, item.step) + value = self._pa_array[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 7a6e93a4605d7..93d46ebdd0b51 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -183,6 +183,17 @@ def test_slice_locs_negative_step(self, in_slice, expected, dtype): expected = Index(list(expected), dtype=dtype) tm.assert_index_equal(result, expected) + @td.skip_if_no("pyarrow") + def test_slice_locs_negative_step_oob(self): + index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") + + result = index[-10:5:1] + tm.assert_index_equal(result, index) + + result = index[4:-10:-1] + expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") + tm.assert_index_equal(result, expected) + def test_slice_locs_dup(self): index = Index(["a", "a", "b", "c", "d", "d"]) assert index.slice_locs("a", "d") == (0, 6) From 7474cb2b70e548a6000dd797c8f5c524b08624cb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 5 Nov 2023 09:21:00 -0500 Subject: [PATCH 3/8] Update --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e073b8c20acbe..87d355ef79142 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,9 +556,9 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.start <= -len(self) - 1: + elif item.start < -len(self): item = slice(None, item.stop, item.step) - elif item.stop <= -len(self) - 1: + elif item.stop < -len(self): item = slice(item.start, None, item.step) value = self._pa_array[item] From b27b0f82b85e20d0c3ae724acbece252999a8b26 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 5 Nov 2023 09:26:59 -0500 Subject: [PATCH 4/8] Fix --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 87d355ef79142..9a6b91f21d90f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,9 +556,9 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.start < -len(self): + elif item.start is not None and item.start < -len(self): item = slice(None, item.stop, item.step) - elif item.stop < -len(self): + elif item.stop is not None and item.stop < -len(self): item = slice(item.start, None, item.step) value = self._pa_array[item] From 659577e6c43dfe5125b865f707ec537f66b89cd3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 5 Nov 2023 10:13:53 -0500 Subject: [PATCH 5/8] Update array.py --- pandas/core/arrays/arrow/array.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 9a6b91f21d90f..7e5452a21af9a 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,8 +556,6 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.start is not None and item.start < -len(self): - item = slice(None, item.stop, item.step) elif item.stop is not None and item.stop < -len(self): item = slice(item.start, None, item.step) From 86fe4f129a648fa2f765e9f3587d8829eedbc7ff Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Nov 2023 00:35:46 +0100 Subject: [PATCH 6/8] Fix --- pandas/core/arrays/arrow/array.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7e5452a21af9a..820a3856d48fc 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,7 +556,12 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.stop is not None and item.stop < -len(self): + elif ( + item.stop is not None + and item.stop < -len(self) + and item.step is not None + and item.step < 0 + ): item = slice(item.start, None, item.step) value = self._pa_array[item] From 242552937fe6d8e12c2af90e2d55b91a9d9bdb5c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 17 Nov 2023 19:35:18 +0100 Subject: [PATCH 7/8] Add gh ref --- pandas/core/arrays/arrow/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 820a3856d48fc..d162b66e5d369 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -554,6 +554,7 @@ def __getitem__(self, item: PositionalIndexer): # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. if isinstance(item, slice): + # Arrow bug https://github.com/apache/arrow/issues/38768 if item.start == item.stop: pass elif ( From 06b4f8909fef759bdda8877602b40f4be3b9925e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 21 Nov 2023 21:17:28 +0100 Subject: [PATCH 8/8] Update --- doc/source/whatsnew/v2.1.3.rst | 1 - doc/source/whatsnew/v2.1.4.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index a7f20aa67685f..af626895a9e0e 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -21,7 +21,6 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) -- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) - Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 `__ (:issue:`55894`) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 25afcbb3bb532..e52c42dd31211 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) +- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - .. ---------------------------------------------------------------------------