
REF: Move value_counts, take, factorize to ArrowExtensionArray #46453

Merged: 6 commits, Mar 28, 2022
68 changes: 68 additions & 0 deletions pandas/core/arrays/_mixins.py
@@ -32,6 +32,7 @@
from pandas.compat import (
pa_version_under1p01,
pa_version_under2p0,
pa_version_under3p0,
pa_version_under5p0,
)
from pandas.errors import AbstractMethodError
@@ -86,6 +87,8 @@
NumpyValueArrayLike,
)

from pandas import Series


def ravel_compat(meth: F) -> F:
"""
@@ -544,6 +547,7 @@ class ArrowExtensionArray(ExtensionArray):
"""

_data: pa.ChunkedArray
_pa_dtype: pa.DataType

def __init__(self, values: pa.ChunkedArray) -> None:
self._data = values
@@ -599,6 +603,70 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
"""
return type(self)(self._data)

def isin(self, values):
if pa_version_under2p0:
return super().isin(values)

value_set = [
pa_scalar.as_py()
for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
if pa_scalar.type in (self._pa_dtype, pa.null())
]

# for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
# for null values, so we short-circuit and return an all-False array.
if not len(value_set):
return np.zeros(len(self), dtype=bool)

kwargs = {}
if pa_version_under3p0:
# in pyarrow 2.0.0 skip_null is ignored but is a required keyword; in
# pyarrow 3.0.0+ passing it raises an unexpected keyword argument error
kwargs["skip_null"] = True

result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs)
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)

def value_counts(self, dropna: bool = True) -> Series:
"""
Return a Series containing counts of each unique value.

Parameters
----------
dropna : bool, default True
Don't include counts of missing values.

Returns
-------
counts : Series

See Also
--------
Series.value_counts
"""
from pandas import (
Index,
Series,
)

vc = self._data.value_counts()

values = vc.field(0)
counts = vc.field(1)
if dropna and self._data.null_count > 0:
mask = values.is_valid()
values = values.filter(mask)
counts = counts.filter(mask)

# No missing values so we can adhere to the interface and return a numpy array.
counts = np.array(counts)

index = Index(type(self)(values))

return Series(counts, index=index).astype("Int64")

@classmethod
def _concat_same_type(
cls: type[ArrowExtensionArrayT], to_concat
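For reference, a minimal usage sketch of the two relocated methods, exercised through ArrowStringArray (an ArrowExtensionArray subclass). It assumes pyarrow >= 2.0 and imports from the internal module shown in this diff; the outputs are illustrative, not copied from this PR's tests:

import pyarrow as pa
from pandas.core.arrays.string_arrow import ArrowStringArray

arr = ArrowStringArray(pa.chunked_array([["a", "b", "a", None]]))

# isin returns a plain boolean ndarray; nulls map to False, and an empty
# value set short-circuits to an all-False result.
arr.isin(["a"])  # array([ True, False,  True, False])
arr.isin([])     # array([False, False, False, False])

# value_counts returns an Int64 Series indexed by the unique values;
# dropna=False also counts the missing value.
arr.value_counts()              # a -> 2, b -> 1
arr.value_counts(dropna=False)  # a -> 2, b -> 1, <NA> -> 1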
79 changes: 6 additions & 73 deletions pandas/core/arrays/string_arrow.py
@@ -3,7 +3,6 @@
from collections.abc import Callable # noqa: PDF001
import re
from typing import (
TYPE_CHECKING,
Any,
Union,
overload,
@@ -28,7 +27,6 @@
from pandas.compat import (
pa_version_under1p01,
pa_version_under2p0,
pa_version_under3p0,
pa_version_under4p0,
)
from pandas.util._decorators import doc
@@ -77,9 +75,6 @@
}


if TYPE_CHECKING:
from pandas import Series

ArrowStringScalarOrNAT = Union[str, libmissing.NAType]


@@ -140,6 +135,8 @@ class ArrowStringArray(
Length: 4, dtype: string
"""

_pa_dtype = pa.string()

def __init__(self, values) -> None:
self._dtype = StringDtype(storage="pyarrow")
if isinstance(values, pa.Array):
@@ -170,11 +167,11 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
na_values = scalars._mask
result = scalars._data
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
return cls(pa.array(result, mask=na_values, type=pa.string()))
return cls(pa.array(result, mask=na_values, type=cls._pa_dtype))

# convert non-na-likes to str
result = lib.ensure_string_array(scalars, copy=copy)
return cls(pa.array(result, type=pa.string(), from_pandas=True))
return cls(pa.array(result, type=cls._pa_dtype, from_pandas=True))

@classmethod
def _from_sequence_of_strings(
@@ -269,7 +266,7 @@ def __getitem__(

if isinstance(item, np.ndarray):
if not len(item):
return type(self)(pa.chunked_array([], type=pa.string()))
return type(self)(pa.chunked_array([], type=self._pa_dtype))
elif is_integer_dtype(item.dtype):
return self.take(item)
elif is_bool_dtype(item.dtype):
@@ -455,70 +452,6 @@ def take(
indices_array[indices_array < 0] += len(self._data)
return type(self)(self._data.take(indices_array))

def isin(self, values):
if pa_version_under2p0:
return super().isin(values)

value_set = [
pa_scalar.as_py()
for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
if pa_scalar.type in (pa.string(), pa.null())
]

# for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
# for null values, so we short-circuit and return an all-False array.
if not len(value_set):
return np.zeros(len(self), dtype=bool)

kwargs = {}
if pa_version_under3p0:
# in pyarrow 2.0.0 skip_null is ignored but is a required keyword; in
# pyarrow 3.0.0+ passing it raises an unexpected keyword argument error
kwargs["skip_null"] = True

result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs)
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)

def value_counts(self, dropna: bool = True) -> Series:
"""
Return a Series containing counts of each unique value.

Parameters
----------
dropna : bool, default True
Don't include counts of missing values.

Returns
-------
counts : Series

See Also
--------
Series.value_counts
"""
from pandas import (
Index,
Series,
)

vc = self._data.value_counts()

values = vc.field(0)
counts = vc.field(1)
if dropna and self._data.null_count > 0:
mask = values.is_valid()
values = values.filter(mask)
counts = counts.filter(mask)

# No missing values so we can adhere to the interface and return a numpy array.
counts = np.array(counts)

index = Index(type(self)(values))

return Series(counts, index=index).astype("Int64")

def astype(self, dtype, copy: bool = True):
dtype = pandas_dtype(dtype)

@@ -590,7 +523,7 @@ def _str_map(
result = lib.map_infer_mask(
arr, f, mask.view("uint8"), convert=False, na_value=na_value
)
result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True)
result = pa.array(result, mask=mask, type=self._pa_dtype, from_pandas=True)
return type(self)(result)
else:
# This is when the result type is object. We reach this when
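The new _pa_dtype class attribute is what keeps the shared code dtype-agnostic: the hard-coded pa.string() calls above become cls._pa_dtype / self._pa_dtype, so any ArrowExtensionArray subclass can declare its pyarrow type once and reuse the common machinery. A minimal sketch of that pattern, assuming ArrowExtensionArray is importable from the module in this diff; the ArrowBoolArray subclass and its _from_sequence are hypothetical and not part of this PR:

import pyarrow as pa
from pandas.core.arrays._mixins import ArrowExtensionArray

class ArrowBoolArray(ArrowExtensionArray):
    # the only piece a subclass must supply for the shared helpers
    _pa_dtype = pa.bool_()

    @classmethod
    def _from_sequence(cls, scalars, *, copy=False):
        # where string_arrow.py previously hard-coded pa.string(),
        # the subclass refers to cls._pa_dtype instead
        return cls(pa.chunked_array([pa.array(scalars, type=cls._pa_dtype)]))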