From a7f4a848458b9867b85bfb81cb86c6b6b578e11e Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 31 Oct 2023 12:14:37 -0400 Subject: [PATCH 01/21] inital implementation, no documentation --- pandas/core/arrays/arrow/__init__.py | 7 +- pandas/core/arrays/arrow/accessors.py | 111 +++++++++++++++--- pandas/core/series.py | 6 +- .../series/accessors/test_list_accessor.py | 89 ++++++++++++++ .../series/accessors/test_struct_accessor.py | 24 +--- 5 files changed, 198 insertions(+), 39 deletions(-) create mode 100644 pandas/tests/series/accessors/test_list_accessor.py diff --git a/pandas/core/arrays/arrow/__init__.py b/pandas/core/arrays/arrow/__init__.py index a3d33f91f597d..5fc50f786fc6a 100644 --- a/pandas/core/arrays/arrow/__init__.py +++ b/pandas/core/arrays/arrow/__init__.py @@ -1,4 +1,7 @@ -from pandas.core.arrays.arrow.accessors import StructAccessor +from pandas.core.arrays.arrow.accessors import ( + ListAccessor, + StructAccessor, +) from pandas.core.arrays.arrow.array import ArrowExtensionArray -__all__ = ["ArrowExtensionArray", "StructAccessor"] +__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"] diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index cbd727d842d83..05f40061d183e 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -2,6 +2,10 @@ from __future__ import annotations +from abc import ( + ABCMeta, + abstractmethod, +) from typing import TYPE_CHECKING from pandas.compat import pa_version_under10p1 @@ -19,7 +23,90 @@ ) -class StructAccessor: +class ArrowAccessor(metaclass=ABCMeta): + def __init__(self, data) -> None: + self._data = data + self._validate(data) + + @abstractmethod + def _is_valid_pyarrow_dtype(self, pyarrow_dtype: pa.DataType) -> bool: + pass + + @property + @abstractmethod + def _validation_msg(self) -> str: + pass + + def _validate(self, data): + dtype = data.dtype + if not isinstance(dtype, ArrowDtype): + # Raise AttributeError so that inspect can handle non-struct Series. + raise AttributeError(self._validation_msg.format(dtype=dtype)) + + if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype): + # Raise AttributeError so that inspect can handle invalid Series. + raise AttributeError(self._validation_msg.format(dtype=dtype)) + + @property + def _pa_array(self) -> pa.Array: + return self._data.array._pa_array + + +class ListAccessor(ArrowAccessor): + _validation_msg = ( + "Can only use the '.list' accessor with 'list[pyarrow]' dtype, not {dtype}." + ) + + def __init__(self, data=None) -> None: + super().__init__(data) + + def _is_valid_pyarrow_dtype(self, pyarrow_dtype: pa.DataType) -> bool: + return ( + pa.types.is_list(pyarrow_dtype) + or pa.types.is_fixed_size_list(pyarrow_dtype) + or pa.types.is_large_list(pyarrow_dtype) + ) + + def len(self) -> Series: + from pandas import Series + + value_lengths = pc.list_value_length(self._pa_array) + return Series(value_lengths, dtype=ArrowDtype(value_lengths.type)) + + def __getitem__(self, key: int) -> Series: + from pandas import Series + + if isinstance(key, int): + # TODO: Support negative key but pyarrow does not allow + # element index to be an array. + # if key < 0: + # key = pc.add(key, pc.list_value_length(self._pa_array)) + element = pc.list_element(self._pa_array, key) + return Series(element, dtype=ArrowDtype(element.type)) + elif isinstance(key, slice): + # TODO: Support negative start/stop/step, ideally this would be added + # upstream in pyarrow. + start, stop, step = key.start, key.stop, key.step + if start is None: + # TODO: When adding negative step support + # this should be setto last element of array + # when step is negative. + start = 0 + if step is None: + step = 1 + sliced = pc.list_slice(self._pa_array, start, stop, step) + return Series(sliced, dtype=ArrowDtype(sliced.type)) + else: + raise ValueError(f"key must be an int or slice, got {type(key).__name__}") + + def flatten(self) -> Series: + from pandas import Series + + flattened = pc.list_flatten(self._pa_array) + return Series(flattened, dtype=ArrowDtype(flattened.type)) + + +class StructAccessor(ArrowAccessor): """ Accessor object for structured data properties of the Series values. @@ -34,18 +121,10 @@ class StructAccessor: ) def __init__(self, data=None) -> None: - self._parent = data - self._validate(data) - - def _validate(self, data): - dtype = data.dtype - if not isinstance(dtype, ArrowDtype): - # Raise AttributeError so that inspect can handle non-struct Series. - raise AttributeError(self._validation_msg.format(dtype=dtype)) + super().__init__(data) - if not pa.types.is_struct(dtype.pyarrow_dtype): - # Raise AttributeError so that inspect can handle non-struct Series. - raise AttributeError(self._validation_msg.format(dtype=dtype)) + def _is_valid_pyarrow_dtype(self, pyarrow_dtype: pa.DataType) -> bool: + return pa.types.is_struct(pyarrow_dtype) @property def dtypes(self) -> Series: @@ -80,7 +159,7 @@ def dtypes(self) -> Series: Series, ) - pa_type = self._parent.dtype.pyarrow_dtype + pa_type = self._data.dtype.pyarrow_dtype types = [ArrowDtype(struct.type) for struct in pa_type] names = [struct.name for struct in pa_type] return Series(types, index=Index(names)) @@ -135,7 +214,7 @@ def field(self, name_or_index: str | int) -> Series: """ from pandas import Series - pa_arr = self._parent.array._pa_array + pa_arr = self._data.array._pa_array if isinstance(name_or_index, int): index = name_or_index elif isinstance(name_or_index, str): @@ -151,7 +230,7 @@ def field(self, name_or_index: str | int) -> Series: return Series( field_arr, dtype=ArrowDtype(field_arr.type), - index=self._parent.index, + index=self._data.index, name=pa_field.name, ) @@ -190,7 +269,7 @@ def explode(self) -> DataFrame: """ from pandas import concat - pa_type = self._parent.dtype.pyarrow_dtype + pa_type = self._data.dtype.pyarrow_dtype return concat( [self.field(i) for i in range(pa_type.num_fields)], axis="columns" ) diff --git a/pandas/core/series.py b/pandas/core/series.py index c5f622a113258..88d3616423155 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -100,7 +100,10 @@ from pandas.core.accessor import CachedAccessor from pandas.core.apply import SeriesApply from pandas.core.arrays import ExtensionArray -from pandas.core.arrays.arrow import StructAccessor +from pandas.core.arrays.arrow import ( + ListAccessor, + StructAccessor, +) from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor from pandas.core.arrays.string_ import StringDtype @@ -5891,6 +5894,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) sparse = CachedAccessor("sparse", SparseAccessor) struct = CachedAccessor("struct", StructAccessor) + list = CachedAccessor("list", ListAccessor) # ---------------------------------------------------------------------- # Add plotting methods to Series diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py new file mode 100644 index 0000000000000..76f55d5d736cc --- /dev/null +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -0,0 +1,89 @@ +import re + +import pytest + +from pandas import ( + ArrowDtype, + Series, +) +import pandas._testing as tm + +pa = pytest.importorskip("pyarrow") + + +@pytest.mark.parametrize( + "list_dtype", + ( + pa.list_(pa.int64()), + pa.list_(pa.int64(), list_size=3), + pa.large_list(pa.int64()), + ), +) +def test_list_getitem(list_dtype: pa.DataType): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(list_dtype), + ) + actual = ser.list[1] + expected = Series([2, None, None], dtype="int64[pyarrow]") + tm.assert_series_equal(actual, expected) + + +def test_list_getitem_slice(): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + actual = ser.list[1:None:None] + expected = Series([[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64()))) + tm.assert_series_equal(actual, expected) + + +def test_list_len(): + ser = Series( + [[1, 2, 3], [4, None], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + actual = ser.list.len() + expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32())) + tm.assert_series_equal(actual, expected) + + +def test_list_flatten(): + ser = Series( + [[1, 2, 3], [4, None], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + actual = ser.list.flatten() + expected = Series([1, 2, 3, 4, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(actual, expected) + + +def test_list_getitem_slice_invalid(): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + with pytest.raises(pa.lib.ArrowInvalid, match=re.escape("`step` must be >= 1")): + ser.list[1:None:0] + + +@pytest.mark.parametrize( + "list_dtype", + ( + pa.list_(pa.int64()), + pa.list_(pa.int64(), list_size=3), + pa.large_list(pa.int64()), + ), +) +def test_list_getitem_invalid_index(list_dtype: pa.DataType): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(list_dtype), + ) + with pytest.raises(pa.lib.ArrowInvalid, match="Index -1 is out of bounds"): + ser.list[-1] + with pytest.raises(pa.lib.ArrowInvalid, match="Index 5 is out of bounds"): + ser.list[5] + with pytest.raises(ValueError, match="key must be an int or slice, got str"): + ser.list["abc"] diff --git a/pandas/tests/series/accessors/test_struct_accessor.py b/pandas/tests/series/accessors/test_struct_accessor.py index c645bb6807052..e79adfec84a42 100644 --- a/pandas/tests/series/accessors/test_struct_accessor.py +++ b/pandas/tests/series/accessors/test_struct_accessor.py @@ -14,28 +14,12 @@ pa = pytest.importorskip("pyarrow") -def test_struct_accessor_dtypes(): +def test_list_getitem(): ser = Series( - [], - dtype=ArrowDtype( - pa.struct( - [ - ("int_col", pa.int64()), - ("string_col", pa.string()), - ( - "struct_col", - pa.struct( - [ - ("int_col", pa.int64()), - ("float_col", pa.float64()), - ] - ), - ), - ] - ) - ), + [[1, 2, 3], [4, None], None], + dtype=ArrowDtype(pa.list_(pa.int64())), ) - actual = ser.struct.dtypes + actual = ser.list[1] expected = Series( [ ArrowDtype(pa.int64()), From 7b34cbfd6213c0d0e1450f26ddc0127fe52acddd Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 31 Oct 2023 12:17:09 -0400 Subject: [PATCH 02/21] revert --- .../series/accessors/test_struct_accessor.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/tests/series/accessors/test_struct_accessor.py b/pandas/tests/series/accessors/test_struct_accessor.py index e79adfec84a42..c645bb6807052 100644 --- a/pandas/tests/series/accessors/test_struct_accessor.py +++ b/pandas/tests/series/accessors/test_struct_accessor.py @@ -14,12 +14,28 @@ pa = pytest.importorskip("pyarrow") -def test_list_getitem(): +def test_struct_accessor_dtypes(): ser = Series( - [[1, 2, 3], [4, None], None], - dtype=ArrowDtype(pa.list_(pa.int64())), + [], + dtype=ArrowDtype( + pa.struct( + [ + ("int_col", pa.int64()), + ("string_col", pa.string()), + ( + "struct_col", + pa.struct( + [ + ("int_col", pa.int64()), + ("float_col", pa.float64()), + ] + ), + ), + ] + ) + ), ) - actual = ser.list[1] + actual = ser.struct.dtypes expected = Series( [ ArrowDtype(pa.int64()), From cd1b31507403bae120568dfd56c0efc9431693e2 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 31 Oct 2023 12:36:31 -0400 Subject: [PATCH 03/21] non list test --- .../tests/series/accessors/test_list_accessor.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index 76f55d5d736cc..ffb7ac061a072 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -68,6 +68,21 @@ def test_list_getitem_slice_invalid(): ser.list[1:None:0] +def test_list_accessor_non_list_dtype(): + ser = Series( + [1, 2, 4], + dtype=ArrowDtype(pa.int64()), + ) + with pytest.raises( + AttributeError, + match=re.escape( + "Can only use the '.list' accessor with 'list[pyarrow]' dtype, " + "not int64[pyarrow]." + ), + ): + ser.list[1:None:0] + + @pytest.mark.parametrize( "list_dtype", ( From 8ba970abc759d486544f4c219ac3a8d67c9e216d Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 31 Oct 2023 13:55:39 -0400 Subject: [PATCH 04/21] docstring wip --- pandas/core/arrays/arrow/accessors.py | 92 ++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 05f40061d183e..b6a830de9a861 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -29,7 +29,7 @@ def __init__(self, data) -> None: self._validate(data) @abstractmethod - def _is_valid_pyarrow_dtype(self, pyarrow_dtype: pa.DataType) -> bool: + def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: pass @property @@ -53,6 +53,15 @@ def _pa_array(self) -> pa.Array: class ListAccessor(ArrowAccessor): + """ + Accessor object for list data properties of the Series values. + + Parameters + ---------- + data : Series + Series containing Arrow list data. + """ + _validation_msg = ( "Can only use the '.list' accessor with 'list[pyarrow]' dtype, not {dtype}." ) @@ -60,7 +69,7 @@ class ListAccessor(ArrowAccessor): def __init__(self, data=None) -> None: super().__init__(data) - def _is_valid_pyarrow_dtype(self, pyarrow_dtype: pa.DataType) -> bool: + def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: return ( pa.types.is_list(pyarrow_dtype) or pa.types.is_fixed_size_list(pyarrow_dtype) @@ -68,12 +77,62 @@ def _is_valid_pyarrow_dtype(self, pyarrow_dtype: pa.DataType) -> bool: ) def len(self) -> Series: + """ + Return the length of each list in the Series. + + Returns + ------- + pandas.Series + The length of each list. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=pd.ArrowDtype(pa.list_( + ... pa.int64() + ... )) + ... ) + >>> s.list.len() + 0 3 + 1 1 + dtype: int32[pyarrow] + """ from pandas import Series value_lengths = pc.list_value_length(self._pa_array) return Series(value_lengths, dtype=ArrowDtype(value_lengths.type)) def __getitem__(self, key: int) -> Series: + """ + Index or slice lists in the Series. + + Returns + ------- + pandas.Series + The list at requested index. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=pd.ArrowDtype(pa.list_( + ... pa.int64() + ... )) + ... ) + >>> s.list[0] + 0 1 + 1 3 + dtype: int64[pyarrow] + """ from pandas import Series if isinstance(key, int): @@ -100,6 +159,33 @@ def __getitem__(self, key: int) -> Series: raise ValueError(f"key must be an int or slice, got {type(key).__name__}") def flatten(self) -> Series: + """ + Flatten list values. + + Returns + ------- + pandas.Series + The data from all lists in the series flattened. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=pd.ArrowDtype(pa.list_( + ... pa.int64() + ... )) + ... ) + >>> s.list.flatten() + 0 1 + 1 2 + 2 3 + 3 3 + dtype: int64[pyarrow] + """ from pandas import Series flattened = pc.list_flatten(self._pa_array) @@ -123,7 +209,7 @@ class StructAccessor(ArrowAccessor): def __init__(self, data=None) -> None: super().__init__(data) - def _is_valid_pyarrow_dtype(self, pyarrow_dtype: pa.DataType) -> bool: + def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: return pa.types.is_struct(pyarrow_dtype) @property From adbf4e17a30aeb53a0fb6b60edca7e551b92627c Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 31 Oct 2023 13:58:00 -0400 Subject: [PATCH 05/21] add list accessor to series.rst --- doc/source/reference/series.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 9acbab7a42800..af262f9e6c336 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -526,6 +526,23 @@ Sparse-dtype specific methods and attributes are provided under the Series.sparse.to_coo +.. _api.series.list: + +List accessor +~~~~~~~~~~~~~ + +Arrow list-dtype specific methods and attributes are provided under the +``Series.list`` accessor. + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + + Series.list.flatten + Series.list.len + Series.list.__getitem__ + + .. _api.series.struct: Struct accessor From 585ad2dc5b1a0dec493c7233b3e3fa7b476482c7 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 31 Oct 2023 14:10:43 -0400 Subject: [PATCH 06/21] whatsnew --- doc/source/whatsnew/v2.2.0.rst | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d9909b0dbfad8..f44e0fe79a386 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -64,10 +64,30 @@ DataFrame. (:issue:`54938`) ) series.struct.explode() -.. _whatsnew_220.enhancements.enhancement2: +.. _whatsnew_220.enhancements.list_accessor: -enhancement2 -^^^^^^^^^^^^ +Series.list accessor for PyArrow list data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``Series.list`` accessor provides attributes and methods for processing +data with ``list[pyarrow]`` dtype Series. For example, +:meth:`Series.list.__getitem__` allows indexing pyarrow lists in +a Series. (:issue:`55323`) + +.. ipython:: python + + import pyarrow as pa + series = pd.Series( + [ + [1, 2, 3], + [4, 5], + [6], + ], + dtype=pd.ArrowDtype( + pa.list_(pa.int64()) + ), + ) + series.list[0] .. _whatsnew_220.enhancements.other: From a750b6bd8beca07137be1238b78f27f02b2cc904 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 31 Oct 2023 14:23:00 -0400 Subject: [PATCH 07/21] fix --- pandas/core/arrays/arrow/accessors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index b6a830de9a861..a0af1bacc3388 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -24,6 +24,7 @@ class ArrowAccessor(metaclass=ABCMeta): + @abstractmethod def __init__(self, data) -> None: self._data = data self._validate(data) From a0e58280638064a85a8a70d2a545d28b4b911fb7 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 31 Oct 2023 15:14:14 -0400 Subject: [PATCH 08/21] fix typehint --- pandas/tests/series/accessors/test_list_accessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index ffb7ac061a072..2560782461b01 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -19,7 +19,7 @@ pa.large_list(pa.int64()), ), ) -def test_list_getitem(list_dtype: pa.DataType): +def test_list_getitem(list_dtype): ser = Series( [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(list_dtype), @@ -91,7 +91,7 @@ def test_list_accessor_non_list_dtype(): pa.large_list(pa.int64()), ), ) -def test_list_getitem_invalid_index(list_dtype: pa.DataType): +def test_list_getitem_invalid_index(list_dtype): ser = Series( [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(list_dtype), From 456458e9ae84a3d32474261d59aae0bbe126ab78 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 31 Oct 2023 16:35:35 -0400 Subject: [PATCH 09/21] private --- pandas/core/arrays/arrow/accessors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index a0af1bacc3388..06301e5acff70 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -23,7 +23,7 @@ ) -class ArrowAccessor(metaclass=ABCMeta): +class _ArrowAccessor(metaclass=ABCMeta): @abstractmethod def __init__(self, data) -> None: self._data = data @@ -53,7 +53,7 @@ def _pa_array(self) -> pa.Array: return self._data.array._pa_array -class ListAccessor(ArrowAccessor): +class ListAccessor(_ArrowAccessor): """ Accessor object for list data properties of the Series values. @@ -193,7 +193,7 @@ def flatten(self) -> Series: return Series(flattened, dtype=ArrowDtype(flattened.type)) -class StructAccessor(ArrowAccessor): +class StructAccessor(_ArrowAccessor): """ Accessor object for structured data properties of the Series values. From 3cf2e8bd244f7f20eee5d19661fbb4c78a23065c Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 31 Oct 2023 16:40:07 -0400 Subject: [PATCH 10/21] fix docstring --- pandas/core/arrays/arrow/accessors.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 06301e5acff70..8315b5cb74e8b 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -108,10 +108,15 @@ def len(self) -> Series: value_lengths = pc.list_value_length(self._pa_array) return Series(value_lengths, dtype=ArrowDtype(value_lengths.type)) - def __getitem__(self, key: int) -> Series: + def __getitem__(self, key: int | slice) -> Series: """ Index or slice lists in the Series. + Parameters + ---------- + key : int | slice + Index or slice of indices to access from each list. + Returns ------- pandas.Series From 7144348aafeb6502ba0d23310facd2d3b435e47f Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 31 Oct 2023 17:31:29 -0400 Subject: [PATCH 11/21] fail on iter --- pandas/core/arrays/arrow/accessors.py | 5 +++++ pandas/tests/series/accessors/test_list_accessor.py | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 8315b5cb74e8b..704f4d028b18a 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -17,6 +17,8 @@ from pandas.core.dtypes.dtypes import ArrowDtype if TYPE_CHECKING: + from collections.abc import Iterator + from pandas import ( DataFrame, Series, @@ -164,6 +166,9 @@ def __getitem__(self, key: int | slice) -> Series: else: raise ValueError(f"key must be an int or slice, got {type(key).__name__}") + def __iter__(self) -> Iterator: + raise TypeError(f"'{type(self).__name__}' object is not iterable") + def flatten(self) -> Series: """ Flatten list values. diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index 2560782461b01..53e92e7cd6931 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -102,3 +102,12 @@ def test_list_getitem_invalid_index(list_dtype): ser.list[5] with pytest.raises(ValueError, match="key must be an int or slice, got str"): ser.list["abc"] + + +def test_list_accessor_not_iterable(): + ser = Series( + [[1, 2, 3], [4, None], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + with pytest.raises(TypeError, match="'ListAccessor' object is not iterable"): + iter(ser.list) From 175a91b3eac21260b4c1a21f4dd7ecc8fc2f48f7 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 1 Nov 2023 11:19:01 -0400 Subject: [PATCH 12/21] list_slice only impl in pyarrow 11 --- pandas/core/arrays/arrow/accessors.py | 10 +++++++++- .../tests/series/accessors/test_list_accessor.py | 16 +++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 704f4d028b18a..2e37120b0c614 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -8,7 +8,10 @@ ) from typing import TYPE_CHECKING -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + pa_version_under10p1, + pa_version_under11p0, +) if not pa_version_under10p1: import pyarrow as pa @@ -151,6 +154,11 @@ def __getitem__(self, key: int | slice) -> Series: element = pc.list_element(self._pa_array, key) return Series(element, dtype=ArrowDtype(element.type)) elif isinstance(key, slice): + if pa_version_under11p0: + raise NotImplementedError( + f"List slice not supported by pyarrow {pa.__version__}." + ) + # TODO: Support negative start/stop/step, ideally this would be added # upstream in pyarrow. start, stop, step = key.start, key.stop, key.step diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index 53e92e7cd6931..bec9ecb76026f 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -10,6 +10,8 @@ pa = pytest.importorskip("pyarrow") +from pandas.compat import pa_version_under11p0 + @pytest.mark.parametrize( "list_dtype", @@ -34,9 +36,17 @@ def test_list_getitem_slice(): [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), ) - actual = ser.list[1:None:None] - expected = Series([[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64()))) - tm.assert_series_equal(actual, expected) + if pa_version_under11p0: + with pytest.raises( + NotImplementedError, match="List slice not supported by pyarrow " + ): + ser.list[1:None:None] + else: + actual = ser.list[1:None:None] + expected = Series( + [[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())) + ) + tm.assert_series_equal(actual, expected) def test_list_len(): From 79962be8652086caca12bb8e98a6152e80042241 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 1 Nov 2023 12:10:11 -0400 Subject: [PATCH 13/21] fix docstring? --- pandas/core/arrays/arrow/accessors.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 2e37120b0c614..25bee8cd114e6 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -68,9 +68,11 @@ class ListAccessor(_ArrowAccessor): Series containing Arrow list data. """ - _validation_msg = ( - "Can only use the '.list' accessor with 'list[pyarrow]' dtype, not {dtype}." - ) + @property + def _validation_msg(self) -> str: + return ( + "Can only use the '.list' accessor with 'list[pyarrow]' dtype, not {dtype}." + ) def __init__(self, data=None) -> None: super().__init__(data) @@ -221,9 +223,11 @@ class StructAccessor(_ArrowAccessor): Series containing Arrow struct data. """ - _validation_msg = ( - "Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, not {dtype}." - ) + @property + def _validation_msg(self) -> str: + return ( + "Can only use the '.list' accessor with 'list[pyarrow]' dtype, not {dtype}." + ) def __init__(self, data=None) -> None: super().__init__(data) From 26fff0469a1c834ea526fd1e8f6a998c3bbebe46 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 1 Nov 2023 14:04:12 -0400 Subject: [PATCH 14/21] fix --- pandas/core/arrays/arrow/accessors.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 25bee8cd114e6..654786a8c7468 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -38,11 +38,6 @@ def __init__(self, data) -> None: def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: pass - @property - @abstractmethod - def _validation_msg(self) -> str: - pass - def _validate(self, data): dtype = data.dtype if not isinstance(dtype, ArrowDtype): @@ -68,11 +63,9 @@ class ListAccessor(_ArrowAccessor): Series containing Arrow list data. """ - @property - def _validation_msg(self) -> str: - return ( - "Can only use the '.list' accessor with 'list[pyarrow]' dtype, not {dtype}." - ) + _validation_msg = ( + "Can only use the '.list' accessor with 'list[pyarrow]' dtype, not {dtype}." + ) def __init__(self, data=None) -> None: super().__init__(data) @@ -223,11 +216,9 @@ class StructAccessor(_ArrowAccessor): Series containing Arrow struct data. """ - @property - def _validation_msg(self) -> str: - return ( - "Can only use the '.list' accessor with 'list[pyarrow]' dtype, not {dtype}." - ) + _validation_msg = ( + "Can only use the '.list' accessor with 'list[pyarrow]' dtype, not {dtype}." + ) def __init__(self, data=None) -> None: super().__init__(data) From 166aef8417f703a82428c4b2f565f35cd52bfe70 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 1 Nov 2023 15:48:37 -0400 Subject: [PATCH 15/21] fix test --- pandas/tests/series/accessors/test_list_accessor.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index bec9ecb76026f..1c60567c1a530 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -74,8 +74,14 @@ def test_list_getitem_slice_invalid(): [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), ) - with pytest.raises(pa.lib.ArrowInvalid, match=re.escape("`step` must be >= 1")): - ser.list[1:None:0] + if pa_version_under11p0: + with pytest.raises( + NotImplementedError, match="List slice not supported by pyarrow " + ): + ser.list[1:None:0] + else: + with pytest.raises(pa.lib.ArrowInvalid, match=re.escape("`step` must be >= 1")): + ser.list[1:None:0] def test_list_accessor_non_list_dtype(): From 459109898405e69f431617ad5fec4e74ddcbaa21 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 1 Nov 2023 15:52:36 -0400 Subject: [PATCH 16/21] fix validation msg --- pandas/core/arrays/arrow/accessors.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 654786a8c7468..a43df7d67ef92 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -30,8 +30,9 @@ class _ArrowAccessor(metaclass=ABCMeta): @abstractmethod - def __init__(self, data) -> None: + def __init__(self, data: Series, validation_msg: str) -> None: self._data = data + self._validation_msg = validation_msg self._validate(data) @abstractmethod @@ -63,12 +64,12 @@ class ListAccessor(_ArrowAccessor): Series containing Arrow list data. """ - _validation_msg = ( - "Can only use the '.list' accessor with 'list[pyarrow]' dtype, not {dtype}." - ) - def __init__(self, data=None) -> None: - super().__init__(data) + super().__init__( + data, + validation_msg="Can only use the '.list' accessor with " + "'list[pyarrow]' dtype, not {dtype}.", + ) def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: return ( @@ -216,12 +217,14 @@ class StructAccessor(_ArrowAccessor): Series containing Arrow struct data. """ - _validation_msg = ( - "Can only use the '.list' accessor with 'list[pyarrow]' dtype, not {dtype}." - ) - def __init__(self, data=None) -> None: - super().__init__(data) + super().__init__( + data, + validation_msg=( + "Can only use the '.struct' accessor with 'struct[pyarrow]' " + "dtype, not {dtype}." + ), + ) def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: return pa.types.is_struct(pyarrow_dtype) From 5b97f585ec9543463f66f3fcd8c99608339b6ae8 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 1 Nov 2023 15:54:33 -0400 Subject: [PATCH 17/21] fix --- pandas/tests/series/accessors/test_struct_accessor.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/series/accessors/test_struct_accessor.py b/pandas/tests/series/accessors/test_struct_accessor.py index c645bb6807052..1ec5b3b726d17 100644 --- a/pandas/tests/series/accessors/test_struct_accessor.py +++ b/pandas/tests/series/accessors/test_struct_accessor.py @@ -9,7 +9,6 @@ Series, ) import pandas._testing as tm -from pandas.core.arrays.arrow.accessors import StructAccessor pa = pytest.importorskip("pyarrow") @@ -141,7 +140,11 @@ def test_struct_accessor_explode(): ], ) def test_struct_accessor_api_for_invalid(invalid): - msg = re.escape(StructAccessor._validation_msg.format(dtype=invalid.dtype)) - - with pytest.raises(AttributeError, match=msg): + with pytest.raises( + AttributeError, + match=re.escape( + "Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, " + f"not {invalid.dtype}." + ), + ): invalid.struct From 2ef8cea7ce30565f7e0bbc2959aae30cd5d82691 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 1 Nov 2023 16:42:32 -0400 Subject: [PATCH 18/21] fix --- pandas/core/arrays/arrow/accessors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index a43df7d67ef92..f0f276ec33fbd 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -372,7 +372,7 @@ def explode(self) -> DataFrame: """ from pandas import concat - pa_type = self._data.dtype.pyarrow_dtype + pa_type = self._pa_array.type return concat( [self.field(i) for i in range(pa_type.num_fields)], axis="columns" ) From 934543f32b04887a6b3e341c7030187dca33aa14 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 2 Nov 2023 13:26:33 -0400 Subject: [PATCH 19/21] remove private --- pandas/core/arrays/arrow/accessors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index f0f276ec33fbd..e1de68e5e7799 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -28,7 +28,7 @@ ) -class _ArrowAccessor(metaclass=ABCMeta): +class ArrowAccessor(metaclass=ABCMeta): @abstractmethod def __init__(self, data: Series, validation_msg: str) -> None: self._data = data @@ -54,7 +54,7 @@ def _pa_array(self) -> pa.Array: return self._data.array._pa_array -class ListAccessor(_ArrowAccessor): +class ListAccessor(ArrowAccessor): """ Accessor object for list data properties of the Series values. @@ -207,7 +207,7 @@ def flatten(self) -> Series: return Series(flattened, dtype=ArrowDtype(flattened.type)) -class StructAccessor(_ArrowAccessor): +class StructAccessor(ArrowAccessor): """ Accessor object for structured data properties of the Series values. From 22b48f59a19fcc3dc3f23ad4ad967512b84f3320 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 2 Nov 2023 13:29:17 -0400 Subject: [PATCH 20/21] maybe fix --- pandas/core/arrays/arrow/accessors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index e1de68e5e7799..71b8043c9a71c 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -30,7 +30,7 @@ class ArrowAccessor(metaclass=ABCMeta): @abstractmethod - def __init__(self, data: Series, validation_msg: str) -> None: + def __init__(self, data, validation_msg: str) -> None: self._data = data self._validation_msg = validation_msg self._validate(data) From 65cc6651e43fd69670d1094f02f9b5f59815686a Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 2 Nov 2023 13:34:43 -0400 Subject: [PATCH 21/21] one more remove --- pandas/core/arrays/arrow/accessors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 71b8043c9a71c..7f88267943526 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -50,7 +50,7 @@ def _validate(self, data): raise AttributeError(self._validation_msg.format(dtype=dtype)) @property - def _pa_array(self) -> pa.Array: + def _pa_array(self): return self._data.array._pa_array