Skip to content

Commit 58a2069

Browse files
authored
REF: Move value_counts, take, factorize to ArrowExtensionArray (pandas-dev#46453)
1 parent 9a2d9ea commit 58a2069

File tree

2 files changed

+160
-165
lines changed

2 files changed

+160
-165
lines changed

pandas/core/arrays/_mixins.py

+160-1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
)
4444

4545
from pandas.core.dtypes.common import (
46+
is_array_like,
4647
is_bool_dtype,
4748
is_dtype_equal,
4849
is_integer,
@@ -69,7 +70,10 @@
6970
from pandas.core.array_algos.transforms import shift
7071
from pandas.core.arrays.base import ExtensionArray
7172
from pandas.core.construction import extract_array
72-
from pandas.core.indexers import check_array_indexer
73+
from pandas.core.indexers import (
74+
check_array_indexer,
75+
validate_indices,
76+
)
7377
from pandas.core.sorting import nargminmax
7478

7579
NDArrayBackedExtensionArrayT = TypeVar(
@@ -86,6 +90,8 @@
8690
NumpyValueArrayLike,
8791
)
8892

93+
from pandas import Series
94+
8995

9096
def ravel_compat(meth: F) -> F:
9197
"""
@@ -599,6 +605,159 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
599605
"""
600606
return type(self)(self._data)
601607

608+
@doc(ExtensionArray.factorize)
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
    # NOTE: no docstring here on purpose; the ``@doc`` decorator supplies
    # the text from ``ExtensionArray.factorize``.

    # Dictionary-encode the Arrow data: each chunk then carries integer
    # indices into a dictionary of distinct values.
    encoded = self._data.dictionary_encode()

    # Re-assemble the per-chunk index arrays into one chunked array and
    # convert it to a pandas object so it can be masked/cast with numpy.
    indices = pa.chunked_array(
        [c.indices for c in encoded.chunks], type=encoded.type.index_type
    ).to_pandas()

    if indices.dtype.kind == "f":
        # A float result means missing positions came back as NaN
        # (presumably because the source had nulls); map them to the
        # requested sentinel before casting to integers.
        indices[np.isnan(indices)] = na_sentinel

    # Always hand back int64 codes, whether or not the float branch ran,
    # so the dtype of the codes does not depend on the presence of nulls.
    indices = indices.astype(np.int64, copy=False)

    if encoded.num_chunks:
        # The first chunk's dictionary is used as the uniques.
        uniques = type(self)(encoded.chunk(0).dictionary)
    else:
        # No chunks at all: build an empty array of the value type.
        uniques = type(self)(pa.array([], type=encoded.type.value_type))

    return indices.values, uniques
624+
625+
def take(
    self,
    indices: TakeIndexer,
    allow_fill: bool = False,
    fill_value: Any = None,
):
    """
    Take elements from an array.

    Parameters
    ----------
    indices : sequence of int or one-dimensional np.ndarray of int
        Indices to be taken.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to
          :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any
          other negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type, ``self.dtype.na_value``, is used.

        For many ExtensionArrays, there will be two representations of
        `fill_value`: a user-facing "boxed" scalar, and a low-level
        physical NA value. `fill_value` should be the user-facing version,
        and the implementation should handle translating that to the
        physical version for processing the take if necessary.

    Returns
    -------
    ExtensionArray

    Raises
    ------
    IndexError
        When the indices are out of bounds for the array.
    ValueError
        When `indices` contains negative values other than ``-1``
        and `allow_fill` is True.

    See Also
    --------
    numpy.take
    api.extensions.take

    Notes
    -----
    ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
    ``iloc``, when `indices` is a sequence of values. Additionally,
    it's called by :meth:`Series.reindex`, or any other method
    that causes realignment, with a `fill_value`.
    """
    # TODO: Remove once we got rid of the (indices < 0) check
    if not is_array_like(indices):
        indices_array = np.asanyarray(indices)
    else:
        # error: Incompatible types in assignment (expression has type
        # "Sequence[int]", variable has type "ndarray")
        indices_array = indices  # type: ignore[assignment]

    # Bounds checks are done up front so both branches below raise the
    # same IndexError messages.
    if len(self._data) == 0 and (indices_array >= 0).any():
        raise IndexError("cannot do a non-empty take")
    if indices_array.size > 0 and indices_array.max() >= len(self._data):
        raise IndexError("out of bounds value in 'indices'.")

    if allow_fill:
        fill_mask = indices_array < 0
        if fill_mask.any():
            validate_indices(indices_array, len(self._data))
            # TODO(ARROW-9433): Treat negative indices as NULL
            indices_array = pa.array(indices_array, mask=fill_mask)
            result = self._data.take(indices_array)
            if isna(fill_value):
                # Masked positions are already null; nothing to overwrite.
                return type(self)(result)
            # TODO: ArrowNotImplementedError: Function fill_null has no
            # kernel matching input types (array[string], scalar[string]).
            # Once supported, this could become
            # ``type(self)(pc.fill_null(result, pa.scalar(fill_value)))``.
            result = type(self)(result)
            result[fill_mask] = fill_value
            return result
        else:
            # Nothing to fill: use the normalised array, like every other
            # branch, rather than the raw ``indices`` argument.
            return type(self)(self._data.take(indices_array))
    else:  # allow_fill=False
        # TODO(ARROW-9432): Treat negative indices as indices from the right.
        if (indices_array < 0).any():
            # Don't modify in-place
            indices_array = np.copy(indices_array)
            indices_array[indices_array < 0] += len(self._data)
        return type(self)(self._data.take(indices_array))
722+
723+
def value_counts(self, dropna: bool = True) -> Series:
    """
    Count how often each distinct value occurs in the array.

    Parameters
    ----------
    dropna : bool, default True
        If True, the count for missing values is left out of the result.

    Returns
    -------
    counts : Series
        Counts cast to ``"Int64"``, indexed by the distinct values.

    See Also
    --------
    Series.value_counts
    """
    from pandas import (
        Index,
        Series,
    )

    # Arrow returns a struct array: field 0 holds the distinct values,
    # field 1 the matching counts.
    tallied = self._data.value_counts()
    uniques = tallied.field(0)
    tallies = tallied.field(1)

    if dropna and self._data.null_count > 0:
        # Keep only the rows whose value is non-null.
        keep = uniques.is_valid()
        uniques = uniques.filter(keep)
        tallies = tallies.filter(keep)

    # The counts themselves have no missing entries, so they can be held
    # in a plain numpy array as the interface expects.
    tallies = np.array(tallies)

    labels = Index(type(self)(uniques))

    return Series(tallies, index=labels).astype("Int64")
760+
602761
@classmethod
603762
def _concat_same_type(
604763
cls: type[ArrowExtensionArrayT], to_concat

0 commit comments

Comments
 (0)