diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index b037f278872a9..6a904b8cdb042 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -43,6 +43,7 @@ ) from pandas.core.dtypes.common import ( + is_array_like, is_bool_dtype, is_dtype_equal, is_integer, @@ -69,7 +70,10 @@ from pandas.core.array_algos.transforms import shift from pandas.core.arrays.base import ExtensionArray from pandas.core.construction import extract_array -from pandas.core.indexers import check_array_indexer +from pandas.core.indexers import ( + check_array_indexer, + validate_indices, +) from pandas.core.sorting import nargminmax NDArrayBackedExtensionArrayT = TypeVar( @@ -86,6 +90,8 @@ NumpyValueArrayLike, ) + from pandas import Series + def ravel_compat(meth: F) -> F: """ @@ -599,6 +605,159 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: """ return type(self)(self._data) + @doc(ExtensionArray.factorize) + def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + encoded = self._data.dictionary_encode() + indices = pa.chunked_array( + [c.indices for c in encoded.chunks], type=encoded.type.index_type + ).to_pandas() + if indices.dtype.kind == "f": + indices[np.isnan(indices)] = na_sentinel + indices = indices.astype(np.int64, copy=False) + + if encoded.num_chunks: + uniques = type(self)(encoded.chunk(0).dictionary) + else: + uniques = type(self)(pa.array([], type=encoded.type.value_type)) + + return indices.values, uniques + + def take( + self, + indices: TakeIndexer, + allow_fill: bool = False, + fill_value: Any = None, + ): + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int or one-dimensional np.ndarray of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. 
+ + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. 
+ """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + # error: Incompatible types in assignment (expression has type + # "Sequence[int]", variable has type "ndarray") + indices_array = indices # type: ignore[assignment] + + if len(self._data) == 0 and (indices_array >= 0).any(): + raise IndexError("cannot do a non-empty take") + if indices_array.size > 0 and indices_array.max() >= len(self._data): + raise IndexError("out of bounds value in 'indices'.") + + if allow_fill: + fill_mask = indices_array < 0 + if fill_mask.any(): + validate_indices(indices_array, len(self._data)) + # TODO(ARROW-9433): Treat negative indices as NULL + indices_array = pa.array(indices_array, mask=fill_mask) + result = self._data.take(indices_array) + if isna(fill_value): + return type(self)(result) + # TODO: ArrowNotImplementedError: Function fill_null has no + # kernel matching input types (array[string], scalar[string]) + result = type(self)(result) + result[fill_mask] = fill_value + return result + # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + else: + # Nothing to fill + return type(self)(self._data.take(indices)) + else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. + if (indices_array < 0).any(): + # Don't modify in-place + indices_array = np.copy(indices_array) + indices_array[indices_array < 0] += len(self._data) + return type(self)(self._data.take(indices_array)) + + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. 
+ + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import ( + Index, + Series, + ) + + vc = self._data.value_counts() + + values = vc.field(0) + counts = vc.field(1) + if dropna and self._data.null_count > 0: + mask = values.is_valid() + values = values.filter(mask) + counts = counts.filter(mask) + + # No missing values so we can adhere to the interface and return a numpy array. + counts = np.array(counts) + + index = Index(type(self)(values)) + + return Series(counts, index=index).astype("Int64") + @classmethod def _concat_same_type( cls: type[ArrowExtensionArrayT], to_concat diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ac5bfe32ed5f6..154c143ac89df 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -3,8 +3,6 @@ from collections.abc import Callable # noqa: PDF001 import re from typing import ( - TYPE_CHECKING, - Any, Union, overload, ) @@ -22,7 +20,6 @@ Scalar, ScalarIndexer, SequenceIndexer, - TakeIndexer, npt, ) from pandas.compat import ( @@ -31,10 +28,8 @@ pa_version_under3p0, pa_version_under4p0, ) -from pandas.util._decorators import doc from pandas.core.dtypes.common import ( - is_array_like, is_bool_dtype, is_dtype_equal, is_integer, @@ -48,7 +43,6 @@ from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import ArrowExtensionArray -from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype @@ -59,7 +53,6 @@ from pandas.core.indexers import ( check_array_indexer, unpack_tuple_and_ellipses, - validate_indices, ) from pandas.core.strings.object_array import ObjectStringArrayMixin @@ -76,10 +69,6 @@ "ge": pc.greater_equal, } - -if TYPE_CHECKING: - from pandas import Series - ArrowStringScalarOrNAT = Union[str, libmissing.NAType] @@ -214,23 +203,6 @@ def 
to_numpy( result[mask] = na_value return result - @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: - encoded = self._data.dictionary_encode() - indices = pa.chunked_array( - [c.indices for c in encoded.chunks], type=encoded.type.index_type - ).to_pandas() - if indices.dtype.kind == "f": - indices[np.isnan(indices)] = na_sentinel - indices = indices.astype(np.int64, copy=False) - - if encoded.num_chunks: - uniques = type(self)(encoded.chunk(0).dictionary) - else: - uniques = type(self)(pa.array([], type=encoded.type.value_type)) - - return indices.values, uniques - @overload def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT: ... @@ -357,104 +329,6 @@ def _maybe_convert_setitem_value(self, value): raise ValueError("Scalar must be NA or str") return value - def take( - self, - indices: TakeIndexer, - allow_fill: bool = False, - fill_value: Any = None, - ): - """ - Take elements from an array. - - Parameters - ---------- - indices : sequence of int or one-dimensional np.ndarray of int - Indices to be taken. - allow_fill : bool, default False - How to handle negative values in `indices`. - - * False: negative values in `indices` indicate positional indices - from the right (the default). This is similar to - :func:`numpy.take`. - - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. - - fill_value : any, optional - Fill value to use for NA-indices when `allow_fill` is True. - This may be ``None``, in which case the default NA value for - the type, ``self.dtype.na_value``, is used. - - For many ExtensionArrays, there will be two representations of - `fill_value`: a user-facing "boxed" scalar, and a low-level - physical NA value. 
`fill_value` should be the user-facing version, - and the implementation should handle translating that to the - physical version for processing the take if necessary. - - Returns - ------- - ExtensionArray - - Raises - ------ - IndexError - When the indices are out of bounds for the array. - ValueError - When `indices` contains negative values other than ``-1`` - and `allow_fill` is True. - - See Also - -------- - numpy.take - api.extensions.take - - Notes - ----- - ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, - ``iloc``, when `indices` is a sequence of values. Additionally, - it's called by :meth:`Series.reindex`, or any other method - that causes realignment, with a `fill_value`. - """ - # TODO: Remove once we got rid of the (indices < 0) check - if not is_array_like(indices): - indices_array = np.asanyarray(indices) - else: - # error: Incompatible types in assignment (expression has type - # "Sequence[int]", variable has type "ndarray") - indices_array = indices # type: ignore[assignment] - - if len(self._data) == 0 and (indices_array >= 0).any(): - raise IndexError("cannot do a non-empty take") - if indices_array.size > 0 and indices_array.max() >= len(self._data): - raise IndexError("out of bounds value in 'indices'.") - - if allow_fill: - fill_mask = indices_array < 0 - if fill_mask.any(): - validate_indices(indices_array, len(self._data)) - # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = pa.array(indices_array, mask=fill_mask) - result = self._data.take(indices_array) - if isna(fill_value): - return type(self)(result) - # TODO: ArrowNotImplementedError: Function fill_null has no - # kernel matching input types (array[string], scalar[string]) - result = type(self)(result) - result[fill_mask] = fill_value - return result - # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) - else: - # Nothing to fill - return type(self)(self._data.take(indices)) - else: # allow_fill=False - # TODO(ARROW-9432): Treat 
negative indices as indices from the right. - if (indices_array < 0).any(): - # Don't modify in-place - indices_array = np.copy(indices_array) - indices_array[indices_array < 0] += len(self._data) - return type(self)(self._data.take(indices_array)) - def isin(self, values): if pa_version_under2p0: return super().isin(values) @@ -481,44 +355,6 @@ def isin(self, values): # to False return np.array(result, dtype=np.bool_) - def value_counts(self, dropna: bool = True) -> Series: - """ - Return a Series containing counts of each unique value. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of missing values. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - """ - from pandas import ( - Index, - Series, - ) - - vc = self._data.value_counts() - - values = vc.field(0) - counts = vc.field(1) - if dropna and self._data.null_count > 0: - mask = values.is_valid() - values = values.filter(mask) - counts = counts.filter(mask) - - # No missing values so we can adhere to the interface and return a numpy array. - counts = np.array(counts) - - index = Index(type(self)(values)) - - return Series(counts, index=index).astype("Int64") - def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype)