From 96136efdfb4b25a89c1ff5ffbe9d124212002360 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 25 Apr 2024 20:06:23 -0400 Subject: [PATCH 1/7] preserve index in list accessor --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/arrow/accessors.py | 21 +++++++++--------- .../series/accessors/test_list_accessor.py | 22 ++++++++++++++++--- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7263329d2e53b..62e1125398279 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -289,6 +289,7 @@ Performance improvements Bug fixes ~~~~~~~~~ +- Fixed bug in :class:`ListAccessor` not preserving index. (:issue:`58425`) - Fixed bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`) - Fixed bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Fixed bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 19ec253e81ef2..c3c641a34e5d3 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -110,7 +110,9 @@ def len(self) -> Series: from pandas import Series value_lengths = pc.list_value_length(self._pa_array) - return Series(value_lengths, dtype=ArrowDtype(value_lengths.type)) + return Series( + value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index + ) def __getitem__(self, key: int | slice) -> Series: """ @@ -149,7 +151,9 @@ def __getitem__(self, key: int | slice) -> Series: # if key < 0: # key = pc.add(key, pc.list_value_length(self._pa_array)) element = pc.list_element(self._pa_array, key) - return Series(element, dtype=ArrowDtype(element.type)) + return Series( + element, dtype=ArrowDtype(element.type), index=self._data.index + ) elif isinstance(key, slice): if pa_version_under11p0: raise NotImplementedError( @@ -167,7 +171,7 @@ def __getitem__(self, key: int | slice) -> Series: if step is None: step = 1 sliced = pc.list_slice(self._pa_array, start, stop, step) - return Series(sliced, dtype=ArrowDtype(sliced.type)) + return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index) else: raise ValueError(f"key must be an int or slice, got {type(key).__name__}") @@ -195,15 +199,12 @@ def flatten(self) -> Series: ... ) >>> s.list.flatten() 0 1 - 1 2 - 2 3 - 3 3 + 0 2 + 0 3 + 1 3 dtype: int64[pyarrow] """ - from pandas import Series - - flattened = pc.list_flatten(self._pa_array) - return Series(flattened, dtype=ArrowDtype(flattened.type)) + return self._data.dropna().explode() class StructAccessor(ArrowAccessor): diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index 1c60567c1a530..2d2eeac3d3c1b 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -31,10 +31,22 @@ def test_list_getitem(list_dtype): tm.assert_series_equal(actual, expected) +def test_list_getitem_index(): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], + ) + actual = ser.list[1] + expected = Series([2, None, None], dtype="int64[pyarrow]", index=[1, 3, 7]) + tm.assert_series_equal(actual, expected) + + def test_list_getitem_slice(): ser = Series( [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], ) if pa_version_under11p0: with pytest.raises( @@ -44,7 +56,9 @@ def test_list_getitem_slice(): else: actual = ser.list[1:None:None] expected = Series( - [[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())) + [[2, 3], [None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], ) tm.assert_series_equal(actual, expected) @@ -61,11 +75,13 @@ def test_list_len(): def test_list_flatten(): ser = Series( - [[1, 2, 3], [4, None], None], + [[1, 2, 3], None, [4, None]], dtype=ArrowDtype(pa.list_(pa.int64())), ) actual = ser.list.flatten() - expected = Series([1, 2, 3, 4, None], dtype=ArrowDtype(pa.int64())) + expected = Series( + [1, 2, 3, 4, None], dtype=ArrowDtype(pa.int64()), index=[0, 0, 0, 2, 2] + ) tm.assert_series_equal(actual, expected) From 2af7fdb2491895da4f6e4aaaeaa49e1330aa8cfe Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 25 Apr 2024 20:11:10 -0400 Subject: [PATCH 2/7] gh reference --- pandas/tests/series/accessors/test_list_accessor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index 2d2eeac3d3c1b..620df45dac36c 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -32,6 +32,7 @@ def test_list_getitem(list_dtype): def test_list_getitem_index(): + # GH 58425 ser = Series( [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), From 9c2f6f9517f6abbfaf3c4734127fa8701fbab27f Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 25 Apr 2024 20:42:39 -0400 Subject: [PATCH 3/7] explode fix --- pandas/core/arrays/arrow/accessors.py | 7 ++++++- pandas/tests/series/accessors/test_list_accessor.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index c3c641a34e5d3..802648e4f3261 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -204,7 +204,12 @@ def flatten(self) -> Series: 1 3 dtype: int64[pyarrow] """ - return self._data.dropna().explode() + from pandas import Series + + counts = pa.compute.list_value_length(self._pa_array).fill_null(0) + flattened = pa.compute.list_flatten(self._pa_array) + index = self._data.index.repeat(counts) + return Series(flattened, dtype=ArrowDtype(flattened.type), index=index) class StructAccessor(ArrowAccessor): diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index 620df45dac36c..2b5119eb12009 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -76,7 +76,7 @@ def test_list_len(): def test_list_flatten(): ser = Series( - [[1, 2, 3], None, [4, None]], + [[1, 2, 3], None, [4, None], []], dtype=ArrowDtype(pa.list_(pa.int64())), ) actual = ser.list.flatten() From 92698056e2fcd0f97987ffe262e2e69cda0e7a4f Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 25 Apr 2024 20:45:06 -0400 Subject: [PATCH 4/7] cleanup --- pandas/core/arrays/arrow/accessors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 802648e4f3261..d8f948a37d206 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -206,9 +206,9 @@ def flatten(self) -> Series: """ from pandas import Series - counts = pa.compute.list_value_length(self._pa_array).fill_null(0) + counts = pa.compute.list_value_length(self._pa_array) flattened = pa.compute.list_flatten(self._pa_array) - index = self._data.index.repeat(counts) + index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type))) return Series(flattened, dtype=ArrowDtype(flattened.type), index=index) From 38f66b68e5f594c14a1a4aa2ab45bbe051f9f5f9 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 25 Apr 2024 20:49:16 -0400 Subject: [PATCH 5/7] improve test --- pandas/tests/series/accessors/test_list_accessor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index 2b5119eb12009..c153e800cb534 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -76,12 +76,14 @@ def test_list_len(): def test_list_flatten(): ser = Series( - [[1, 2, 3], None, [4, None], []], + [[1, 2, 3], None, [4, None], [], [7, 8]], dtype=ArrowDtype(pa.list_(pa.int64())), ) actual = ser.list.flatten() expected = Series( - [1, 2, 3, 4, None], dtype=ArrowDtype(pa.int64()), index=[0, 0, 0, 2, 2] + [1, 2, 3, 4, None, 7, 8], + dtype=ArrowDtype(pa.int64()), + index=[0, 0, 0, 2, 2, 4, 4], ) tm.assert_series_equal(actual, expected) From 46b3e5d608dc539fc98904734987e79de381b4ac Mon Sep 17 00:00:00 2001 From: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:11:38 -0400 Subject: [PATCH 6/7] Update v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 633f5b3b99e21..c8ee17161d3e7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -465,7 +465,7 @@ Styler Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) -- Bug in :class:`ListAccessor` not preserving index. (:issue:`58425`) +- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`) From 4ac5f3f80840fd90c890208373cc0d214f7c1226 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Mon, 29 Apr 2024 17:56:07 -0400 Subject: [PATCH 7/7] f --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b99232213e6bb..5eec18cd6e5b0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -471,7 +471,6 @@ Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) -- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`) @@ -483,6 +482,7 @@ Other - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) +- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) .. ***DO NOT USE THIS SECTION***