Skip to content

Implement first-class List type #60629

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 21 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
"""
if isinstance(value, pa.Scalar):
pa_scalar = value
elif isna(value):
elif not is_list_like(value) and isna(value):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment explaining why this is necessary? E.g., off the top of my head I don't know whether pa.ListScalar subclasses pa.Scalar.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this particular case, the isna(value) check fails when value is a list, since isna does not return a single boolean for list input.

The "not is_list_like" check was a quick way to prevent this branch from throwing an exception, but I'm open to better ways of expressing that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As to your question, pa.ListScalar does inherit from pa.Scalar (all scalars in pyarrow do), but that is not the type hitting this branch, since it is already caught by the preceding branch.

pa_scalar = pa.scalar(None, type=pa_type)
else:
# Workaround https://github.com/apache/arrow/issues/37291
Expand Down Expand Up @@ -1350,7 +1350,16 @@ def take(
# TODO(ARROW-9433): Treat negative indices as NULL
indices_array = pa.array(indices_array, mask=fill_mask)
result = self._pa_array.take(indices_array)
if isna(fill_value):
if is_list_like(fill_value):
# TODO: this should be hit by ListArray. Ideally we do:
# pc.replace_with_mask(result, fill_mask, pa.scalar(fill_value))
# but pyarrow does not yet implement that for list types
new_values = [
fill_value if should_fill else x.as_py()
for x, should_fill in zip(result, fill_mask)
]
return type(self)(new_values)
elif isna(fill_value):
return type(self)(result)
# TODO: ArrowNotImplementedError: Function fill_null has no
# kernel matching input types (array[string], scalar[string])
Expand Down
108 changes: 99 additions & 9 deletions pandas/core/arrays/list_.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,15 @@
ExtensionDtype,
register_extension_dtype,
)
from pandas.core.dtypes.common import is_string_dtype
from pandas.core.dtypes.common import (
is_bool_dtype,
is_integer_dtype,
is_string_dtype,
)
from pandas.core.dtypes.dtypes import ArrowDtype

from pandas.core.arrays.arrow.array import ArrowExtensionArray
from pandas.core.arrays.base import ExtensionArray

if TYPE_CHECKING:
from collections.abc import Sequence
Expand Down Expand Up @@ -146,6 +151,15 @@ def __init__(
else:
if value_type is None:
if isinstance(values, (pa.Array, pa.ChunkedArray)):
parent_type = values.type
if not isinstance(parent_type, (pa.ListType, pa.LargeListType)):
# Ideally could cast here, but I don't think pyarrow implements
# many list casts
new_values = [
[x.as_py()] if x.is_valid else None for x in values
]
values = pa.array(new_values, type=pa.large_list(parent_type))

value_type = values.type.value_type
else:
value_type = pa.array(values).type.value_type
Expand Down Expand Up @@ -193,19 +207,89 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):

return cls(values)

@classmethod
def _box_pa(
    cls, value, pa_type: pa.DataType | None = None
) -> pa.Array | pa.ChunkedArray | pa.Scalar:
    """
    Convert ``value`` into its pyarrow representation.

    A pyarrow list scalar (``pa.ListScalar`` / ``pa.LargeListScalar``),
    a plain Python ``list``, or ``None`` is treated as a single element
    and handed to ``_box_pa_scalar``; any other input is handed to
    ``_box_pa_array``.

    Parameters
    ----------
    value : any
        Object to box.
    pa_type : pa.DataType | None
        Forwarded unchanged to the scalar/array boxing helper.

    Returns
    -------
    pa.Array or pa.ChunkedArray or pa.Scalar
    """
    # For a list-typed array, one element is itself a list, so a Python
    # list is routed to the scalar path rather than the array path.
    scalar_like = (pa.ListScalar, pa.LargeListScalar, list)
    if value is None or isinstance(value, scalar_like):
        return cls._box_pa_scalar(value, pa_type)
    return cls._box_pa_array(value, pa_type)

def __getitem__(self, item):
# PyArrow does not support NumPy's selection with an equal length
# mask, so let's convert those to integral positions if needed
if isinstance(item, np.ndarray) and item.dtype == bool:
pos = np.array(range(len(item)))
mask = pos[item]
return type(self)(self._pa_array.take(mask))
if isinstance(item, (np.ndarray, ExtensionArray)):
if is_bool_dtype(item.dtype):
mask_len = len(item)
if mask_len != len(self):
raise IndexError(
f"Boolean index has wrong length: {mask_len} "
f"instead of {len(self)}"
)
pos = np.array(range(len(item)))

if isinstance(item, ExtensionArray):
mask = pos[item.fillna(False)]
else:
mask = pos[item]
return type(self)(self._pa_array.take(mask))
elif is_integer_dtype(item.dtype):
if isinstance(item, ExtensionArray) and item.isna().any():
msg = "Cannot index with an integer indexer containing NA values"
raise ValueError(msg)

indexer = pa.array(item)
return type(self)(self._pa_array.take(indexer))
elif isinstance(item, int):
return self._pa_array[item]
value = self._pa_array[item]
if value.is_valid:
return value.as_py()
else:
return self.dtype.na_value
elif isinstance(item, list):
return type(self)(self._pa_array.take(item))
# pyarrow does not yet support take() with an empty list of indices
# https://github.com/apache/arrow/issues/39917
if item:
try:
result = self._pa_array.take(item)
except pa.lib.ArrowInvalid as e:
if "Could not convert <NA>" in str(e):
msg = (
"Cannot index with an integer indexer containing NA values"
)
raise ValueError(msg) from e
raise e
else:
result = pa.array([], type=self._pa_array.type)

return type(self)(result)

try:
result = type(self)(self._pa_array[item])
except TypeError as e:
msg = (
"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
"(`None`) and integer or boolean arrays are valid indices"
)
raise IndexError(msg) from e

return type(self)(self._pa_array[item])
return result

def __setitem__(self, key, value) -> None:
msg = "ListArray does not support item assignment via setitem"
Expand Down Expand Up @@ -241,7 +325,13 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
return super().astype(dtype, copy)

def __eq__(self, other):
if isinstance(other, (pa.ListScalar, pa.LargeListScalar)):
if isinstance(other, list):
from pandas.arrays import BooleanArray

mask = np.array([False] * len(self))
values = np.array([x.as_py() == other for x in self._pa_array])
return BooleanArray(values, mask)
elif isinstance(other, (pa.ListScalar, pa.LargeListScalar)):
from pandas.arrays import BooleanArray

# TODO: pyarrow.compute does not implement broadcasting equality
Expand Down
26 changes: 18 additions & 8 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import warnings

import numpy as np
import pyarrow as pa

from pandas._config import config

Expand Down Expand Up @@ -150,6 +149,7 @@
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.list_ import ListDtype
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
Expand Down Expand Up @@ -7013,11 +7013,20 @@ def fillna(
stacklevel=2,
)

holds_list_array = False
if isinstance(self, ABCSeries) and isinstance(self.dtype, ListDtype):
holds_list_array = True
elif isinstance(self, ABCDataFrame) and any(
isinstance(x, ListDtype) for x in self.dtypes
):
holds_list_array = True

if isinstance(value, (list, tuple)):
raise TypeError(
'"value" parameter must be a scalar or dict, but '
f'you passed a "{type(value).__name__}"'
)
if not holds_list_array:
raise TypeError(
'"value" parameter must be a scalar or dict, but '
f'you passed a "{type(value).__name__}"'
)

# set the default here, so functions examining the signature
# can detect if something was set (e.g. in groupby) (GH9221)
Expand All @@ -7037,8 +7046,9 @@ def fillna(
value = Series(value)
value = value.reindex(self.index)
value = value._values
elif isinstance(value, pa.ListScalar) or not is_list_like(value):
# TODO(wayd): maybe is_list_like should return false for ListScalar?
elif (
isinstance(value, list) and isinstance(self.dtype, ListDtype)
) or not is_list_like(value):
pass
else:
raise TypeError(
Expand Down Expand Up @@ -7102,7 +7112,7 @@ def fillna(
else:
return result

elif isinstance(value, pa.ListScalar) or not is_list_like(value):
elif holds_list_array or not is_list_like(value):
if axis == 1:
result = self.T.fillna(value=value, limit=limit).T
new_data = result._mgr
Expand Down
Loading
Loading