From 03a8009c98dc2829e1b5014aa9a2a2cd8884a22e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 20 Mar 2022 20:06:17 -0700 Subject: [PATCH 1/4] REF: Move value_counts, isin to ArrowExtensionArray --- pandas/core/arrays/_mixins.py | 68 +++++++++++++++++++++++++ pandas/core/arrays/string_arrow.py | 79 +++--------------------------- 2 files changed, 74 insertions(+), 73 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index b037f278872a9..9024f1139a2a7 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -32,6 +32,7 @@ from pandas.compat import ( pa_version_under1p01, pa_version_under2p0, + pa_version_under3p0, pa_version_under5p0, ) from pandas.errors import AbstractMethodError @@ -86,6 +87,8 @@ NumpyValueArrayLike, ) + from pandas import Series + def ravel_compat(meth: F) -> F: """ @@ -544,6 +547,7 @@ class ArrowExtensionArray(ExtensionArray): """ _data: pa.ChunkedArray + _pa_dtype: pa.DataType() def __init__(self, values: pa.ChunkedArray) -> None: self._data = values @@ -599,6 +603,70 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: """ return type(self)(self._data) + def isin(self, values): + if pa_version_under2p0: + return super().isin(values) + + value_set = [ + pa_scalar.as_py() + for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] + if pa_scalar.type in (self._pa_dtype, pa.null()) + ] + + # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True + # for null values, so we short-circuit to return all False array. + if not len(value_set): + return np.zeros(len(self), dtype=bool) + + kwargs = {} + if pa_version_under3p0: + # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises + # with unexpected keyword argument in pyarrow 3.0.0+ + kwargs["skip_null"] = True + + result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs) + # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls + # to False + return np.array(result, dtype=np.bool_) + + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import ( + Index, + Series, + ) + + vc = self._data.value_counts() + + values = vc.field(0) + counts = vc.field(1) + if dropna and self._data.null_count > 0: + mask = values.is_valid() + values = values.filter(mask) + counts = counts.filter(mask) + + # No missing values so we can adhere to the interface and return a numpy array. + counts = np.array(counts) + + index = Index(type(self)(values)) + + return Series(counts, index=index).astype("Int64") + @classmethod def _concat_same_type( cls: type[ArrowExtensionArrayT], to_concat diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ac5bfe32ed5f6..b775fc370957b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -3,7 +3,6 @@ from collections.abc import Callable # noqa: PDF001 import re from typing import ( - TYPE_CHECKING, Any, Union, overload, @@ -28,7 +27,6 @@ from pandas.compat import ( pa_version_under1p01, pa_version_under2p0, - pa_version_under3p0, pa_version_under4p0, ) from pandas.util._decorators import doc @@ -77,9 +75,6 @@ } -if TYPE_CHECKING: - from pandas import Series - ArrowStringScalarOrNAT = Union[str, libmissing.NAType] @@ -140,6 +135,8 @@ class ArrowStringArray( Length: 4, dtype: string """ + _pa_dtype = pa.string() + def __init__(self, values) -> None: self._dtype = StringDtype(storage="pyarrow") if isinstance(values, pa.Array): @@ -170,11 +167,11 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - return cls(pa.array(result, mask=na_values, type=pa.string())) + return cls(pa.array(result, mask=na_values, type=cls._pa_dtype)) # convert non-na-likes to str result = lib.ensure_string_array(scalars, copy=copy) - return cls(pa.array(result, type=pa.string(), from_pandas=True)) + return cls(pa.array(result, type=cls._pa_dtype, from_pandas=True)) @classmethod def _from_sequence_of_strings( @@ -269,7 +266,7 @@ def __getitem__( if isinstance(item, np.ndarray): if not len(item): - return type(self)(pa.chunked_array([], type=pa.string())) + return type(self)(pa.chunked_array([], type=self._pa_dtype)) elif is_integer_dtype(item.dtype): return self.take(item) elif is_bool_dtype(item.dtype): @@ -455,70 +452,6 @@ def take( indices_array[indices_array < 0] += len(self._data) return type(self)(self._data.take(indices_array)) - def isin(self, values): - if pa_version_under2p0: - return super().isin(values) - - value_set = [ - pa_scalar.as_py() - for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] - if pa_scalar.type in (pa.string(), pa.null()) - ] - - # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True - # for null values, so we short-circuit to return all False array. - if not len(value_set): - return np.zeros(len(self), dtype=bool) - - kwargs = {} - if pa_version_under3p0: - # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises - # with unexpected keyword argument in pyarrow 3.0.0+ - kwargs["skip_null"] = True - - result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs) - # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls - # to False - return np.array(result, dtype=np.bool_) - - def value_counts(self, dropna: bool = True) -> Series: - """ - Return a Series containing counts of each unique value. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of missing values. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - """ - from pandas import ( - Index, - Series, - ) - - vc = self._data.value_counts() - - values = vc.field(0) - counts = vc.field(1) - if dropna and self._data.null_count > 0: - mask = values.is_valid() - values = values.filter(mask) - counts = counts.filter(mask) - - # No missing values so we can adhere to the interface and return a numpy array. - counts = np.array(counts) - - index = Index(type(self)(values)) - - return Series(counts, index=index).astype("Int64") - def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) @@ -590,7 +523,7 @@ def _str_map( result = lib.map_infer_mask( arr, f, mask.view("uint8"), convert=False, na_value=na_value ) - result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + result = pa.array(result, mask=mask, type=self._pa_dtype, from_pandas=True) return type(self)(result) else: # This is when the result type is object. We reach this when From 46380bd6b53b4f1f1c9d9642d5fd3aac3f5ed31e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 21 Mar 2022 10:27:10 -0700 Subject: [PATCH 2/4] Revert "REF: Move value_counts, isin to ArrowExtensionArray" This reverts commit 03a8009c98dc2829e1b5014aa9a2a2cd8884a22e. --- pandas/core/arrays/_mixins.py | 68 ------------------------- pandas/core/arrays/string_arrow.py | 79 +++++++++++++++++++++++++++--- 2 files changed, 73 insertions(+), 74 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 9024f1139a2a7..b037f278872a9 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -32,7 +32,6 @@ from pandas.compat import ( pa_version_under1p01, pa_version_under2p0, - pa_version_under3p0, pa_version_under5p0, ) from pandas.errors import AbstractMethodError @@ -87,8 +86,6 @@ NumpyValueArrayLike, ) - from pandas import Series - def ravel_compat(meth: F) -> F: """ @@ -547,7 +544,6 @@ class ArrowExtensionArray(ExtensionArray): """ _data: pa.ChunkedArray - _pa_dtype: pa.DataType() def __init__(self, values: pa.ChunkedArray) -> None: self._data = values @@ -603,70 +599,6 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: """ return type(self)(self._data) - def isin(self, values): - if pa_version_under2p0: - return super().isin(values) - - value_set = [ - pa_scalar.as_py() - for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] - if pa_scalar.type in (self._pa_dtype, pa.null()) - ] - - # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True - # for null values, so we short-circuit to return all False array. - if not len(value_set): - return np.zeros(len(self), dtype=bool) - - kwargs = {} - if pa_version_under3p0: - # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises - # with unexpected keyword argument in pyarrow 3.0.0+ - kwargs["skip_null"] = True - - result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs) - # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls - # to False - return np.array(result, dtype=np.bool_) - - def value_counts(self, dropna: bool = True) -> Series: - """ - Return a Series containing counts of each unique value. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of missing values. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - """ - from pandas import ( - Index, - Series, - ) - - vc = self._data.value_counts() - - values = vc.field(0) - counts = vc.field(1) - if dropna and self._data.null_count > 0: - mask = values.is_valid() - values = values.filter(mask) - counts = counts.filter(mask) - - # No missing values so we can adhere to the interface and return a numpy array. - counts = np.array(counts) - - index = Index(type(self)(values)) - - return Series(counts, index=index).astype("Int64") - @classmethod def _concat_same_type( cls: type[ArrowExtensionArrayT], to_concat diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b775fc370957b..ac5bfe32ed5f6 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -3,6 +3,7 @@ from collections.abc import Callable # noqa: PDF001 import re from typing import ( + TYPE_CHECKING, Any, Union, overload, @@ -27,6 +28,7 @@ from pandas.compat import ( pa_version_under1p01, pa_version_under2p0, + pa_version_under3p0, pa_version_under4p0, ) from pandas.util._decorators import doc @@ -75,6 +77,9 @@ } +if TYPE_CHECKING: + from pandas import Series + ArrowStringScalarOrNAT = Union[str, libmissing.NAType] @@ -135,8 +140,6 @@ class ArrowStringArray( Length: 4, dtype: string """ - _pa_dtype = pa.string() - def __init__(self, values) -> None: self._dtype = StringDtype(storage="pyarrow") if isinstance(values, pa.Array): @@ -167,11 +170,11 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - return cls(pa.array(result, mask=na_values, type=cls._pa_dtype)) + return cls(pa.array(result, mask=na_values, type=pa.string())) # convert non-na-likes to str result = lib.ensure_string_array(scalars, copy=copy) - return cls(pa.array(result, type=cls._pa_dtype, from_pandas=True)) + return cls(pa.array(result, type=pa.string(), from_pandas=True)) @classmethod def _from_sequence_of_strings( @@ -266,7 +269,7 @@ def __getitem__( if isinstance(item, np.ndarray): if not len(item): - return type(self)(pa.chunked_array([], type=self._pa_dtype)) + return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item.dtype): return self.take(item) elif is_bool_dtype(item.dtype): @@ -452,6 +455,70 @@ def take( indices_array[indices_array < 0] += len(self._data) return type(self)(self._data.take(indices_array)) + def isin(self, values): + if pa_version_under2p0: + return super().isin(values) + + value_set = [ + pa_scalar.as_py() + for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] + if pa_scalar.type in (pa.string(), pa.null()) + ] + + # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True + # for null values, so we short-circuit to return all False array. + if not len(value_set): + return np.zeros(len(self), dtype=bool) + + kwargs = {} + if pa_version_under3p0: + # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises + # with unexpected keyword argument in pyarrow 3.0.0+ + kwargs["skip_null"] = True + + result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs) + # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls + # to False + return np.array(result, dtype=np.bool_) + + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import ( + Index, + Series, + ) + + vc = self._data.value_counts() + + values = vc.field(0) + counts = vc.field(1) + if dropna and self._data.null_count > 0: + mask = values.is_valid() + values = values.filter(mask) + counts = counts.filter(mask) + + # No missing values so we can adhere to the interface and return a numpy array. + counts = np.array(counts) + + index = Index(type(self)(values)) + + return Series(counts, index=index).astype("Int64") + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) @@ -523,7 +590,7 @@ def _str_map( result = lib.map_infer_mask( arr, f, mask.view("uint8"), convert=False, na_value=na_value ) - result = pa.array(result, mask=mask, type=self._pa_dtype, from_pandas=True) + result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) return type(self)(result) else: # This is when the result type is object. We reach this when From f5e30308e2a5ffbda24c8bce35d43a199aa2a168 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 21 Mar 2022 10:29:03 -0700 Subject: [PATCH 3/4] Just move value_counts, isin requires dtype infra --- pandas/core/arrays/_mixins.py | 40 +++++++++++++++++++++++++++ pandas/core/arrays/string_arrow.py | 43 ------------------------------ 2 files changed, 40 insertions(+), 43 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index b037f278872a9..f0dd072aca0d0 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -86,6 +86,8 @@ NumpyValueArrayLike, ) + from pandas import Series + def ravel_compat(meth: F) -> F: """ @@ -599,6 +601,44 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: """ return type(self)(self._data) + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import ( + Index, + Series, + ) + + vc = self._data.value_counts() + + values = vc.field(0) + counts = vc.field(1) + if dropna and self._data.null_count > 0: + mask = values.is_valid() + values = values.filter(mask) + counts = counts.filter(mask) + + # No missing values so we can adhere to the interface and return a numpy array. + counts = np.array(counts) + + index = Index(type(self)(values)) + + return Series(counts, index=index).astype("Int64") + @classmethod def _concat_same_type( cls: type[ArrowExtensionArrayT], to_concat diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ac5bfe32ed5f6..02c96a91bdd60 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -3,7 +3,6 @@ from collections.abc import Callable # noqa: PDF001 import re from typing import ( - TYPE_CHECKING, Any, Union, overload, @@ -76,10 +75,6 @@ "ge": pc.greater_equal, } - -if TYPE_CHECKING: - from pandas import Series - ArrowStringScalarOrNAT = Union[str, libmissing.NAType] @@ -481,44 +476,6 @@ def isin(self, values): # to False return np.array(result, dtype=np.bool_) - def value_counts(self, dropna: bool = True) -> Series: - """ - Return a Series containing counts of each unique value. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of missing values. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - """ - from pandas import ( - Index, - Series, - ) - - vc = self._data.value_counts() - - values = vc.field(0) - counts = vc.field(1) - if dropna and self._data.null_count > 0: - mask = values.is_valid() - values = values.filter(mask) - counts = counts.filter(mask) - - # No missing values so we can adhere to the interface and return a numpy array. - counts = np.array(counts) - - index = Index(type(self)(values)) - - return Series(counts, index=index).astype("Int64") - def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) From de10c12aa5df48f7628e1b36b2804ca796bde000 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 27 Mar 2022 15:47:33 -0700 Subject: [PATCH 4/4] Move take, factorize --- pandas/core/arrays/_mixins.py | 121 ++++++++++++++++++++++++++++- pandas/core/arrays/string_arrow.py | 121 ----------------------------- 2 files changed, 120 insertions(+), 122 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index f0dd072aca0d0..6a904b8cdb042 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -43,6 +43,7 @@ ) from pandas.core.dtypes.common import ( + is_array_like, is_bool_dtype, is_dtype_equal, is_integer, @@ -69,7 +70,10 @@ from pandas.core.array_algos.transforms import shift from pandas.core.arrays.base import ExtensionArray from pandas.core.construction import extract_array -from pandas.core.indexers import check_array_indexer +from pandas.core.indexers import ( + check_array_indexer, + validate_indices, +) from pandas.core.sorting import nargminmax NDArrayBackedExtensionArrayT = TypeVar( @@ -601,6 +605,121 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: """ return type(self)(self._data) + @doc(ExtensionArray.factorize) + def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + encoded = self._data.dictionary_encode() + indices = pa.chunked_array( + [c.indices for c in encoded.chunks], type=encoded.type.index_type + ).to_pandas() + if indices.dtype.kind == "f": + indices[np.isnan(indices)] = na_sentinel + indices = indices.astype(np.int64, copy=False) + + if encoded.num_chunks: + uniques = type(self)(encoded.chunk(0).dictionary) + else: + uniques = type(self)(pa.array([], type=encoded.type.value_type)) + + return indices.values, uniques + + def take( + self, + indices: TakeIndexer, + allow_fill: bool = False, + fill_value: Any = None, + ): + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int or one-dimensional np.ndarray of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. + """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + # error: Incompatible types in assignment (expression has type + # "Sequence[int]", variable has type "ndarray") + indices_array = indices # type: ignore[assignment] + + if len(self._data) == 0 and (indices_array >= 0).any(): + raise IndexError("cannot do a non-empty take") + if indices_array.size > 0 and indices_array.max() >= len(self._data): + raise IndexError("out of bounds value in 'indices'.") + + if allow_fill: + fill_mask = indices_array < 0 + if fill_mask.any(): + validate_indices(indices_array, len(self._data)) + # TODO(ARROW-9433): Treat negative indices as NULL + indices_array = pa.array(indices_array, mask=fill_mask) + result = self._data.take(indices_array) + if isna(fill_value): + return type(self)(result) + # TODO: ArrowNotImplementedError: Function fill_null has no + # kernel matching input types (array[string], scalar[string]) + result = type(self)(result) + result[fill_mask] = fill_value + return result + # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + else: + # Nothing to fill + return type(self)(self._data.take(indices)) + else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. + if (indices_array < 0).any(): + # Don't modify in-place + indices_array = np.copy(indices_array) + indices_array[indices_array < 0] += len(self._data) + return type(self)(self._data.take(indices_array)) + def value_counts(self, dropna: bool = True) -> Series: """ Return a Series containing counts of each unique value. diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 02c96a91bdd60..154c143ac89df 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -3,7 +3,6 @@ from collections.abc import Callable # noqa: PDF001 import re from typing import ( - Any, Union, overload, ) @@ -21,7 +20,6 @@ Scalar, ScalarIndexer, SequenceIndexer, - TakeIndexer, npt, ) from pandas.compat import ( @@ -30,10 +28,8 @@ pa_version_under3p0, pa_version_under4p0, ) -from pandas.util._decorators import doc from pandas.core.dtypes.common import ( - is_array_like, is_bool_dtype, is_dtype_equal, is_integer, @@ -47,7 +43,6 @@ from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import ArrowExtensionArray -from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype @@ -58,7 +53,6 @@ from pandas.core.indexers import ( check_array_indexer, unpack_tuple_and_ellipses, - validate_indices, ) from pandas.core.strings.object_array import ObjectStringArrayMixin @@ -209,23 +203,6 @@ def to_numpy( result[mask] = na_value return result - @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: - encoded = self._data.dictionary_encode() - indices = pa.chunked_array( - [c.indices for c in encoded.chunks], type=encoded.type.index_type - ).to_pandas() - if indices.dtype.kind == "f": - indices[np.isnan(indices)] = na_sentinel - indices = indices.astype(np.int64, copy=False) - - if encoded.num_chunks: - uniques = type(self)(encoded.chunk(0).dictionary) - else: - uniques = type(self)(pa.array([], type=encoded.type.value_type)) - - return indices.values, uniques - @overload def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT: ... @@ -352,104 +329,6 @@ def _maybe_convert_setitem_value(self, value): raise ValueError("Scalar must be NA or str") return value - def take( - self, - indices: TakeIndexer, - allow_fill: bool = False, - fill_value: Any = None, - ): - """ - Take elements from an array. - - Parameters - ---------- - indices : sequence of int or one-dimensional np.ndarray of int - Indices to be taken. - allow_fill : bool, default False - How to handle negative values in `indices`. - - * False: negative values in `indices` indicate positional indices - from the right (the default). This is similar to - :func:`numpy.take`. - - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. - - fill_value : any, optional - Fill value to use for NA-indices when `allow_fill` is True. - This may be ``None``, in which case the default NA value for - the type, ``self.dtype.na_value``, is used. - - For many ExtensionArrays, there will be two representations of - `fill_value`: a user-facing "boxed" scalar, and a low-level - physical NA value. `fill_value` should be the user-facing version, - and the implementation should handle translating that to the - physical version for processing the take if necessary. - - Returns - ------- - ExtensionArray - - Raises - ------ - IndexError - When the indices are out of bounds for the array. - ValueError - When `indices` contains negative values other than ``-1`` - and `allow_fill` is True. - - See Also - -------- - numpy.take - api.extensions.take - - Notes - ----- - ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, - ``iloc``, when `indices` is a sequence of values. Additionally, - it's called by :meth:`Series.reindex`, or any other method - that causes realignment, with a `fill_value`. - """ - # TODO: Remove once we got rid of the (indices < 0) check - if not is_array_like(indices): - indices_array = np.asanyarray(indices) - else: - # error: Incompatible types in assignment (expression has type - # "Sequence[int]", variable has type "ndarray") - indices_array = indices # type: ignore[assignment] - - if len(self._data) == 0 and (indices_array >= 0).any(): - raise IndexError("cannot do a non-empty take") - if indices_array.size > 0 and indices_array.max() >= len(self._data): - raise IndexError("out of bounds value in 'indices'.") - - if allow_fill: - fill_mask = indices_array < 0 - if fill_mask.any(): - validate_indices(indices_array, len(self._data)) - # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = pa.array(indices_array, mask=fill_mask) - result = self._data.take(indices_array) - if isna(fill_value): - return type(self)(result) - # TODO: ArrowNotImplementedError: Function fill_null has no - # kernel matching input types (array[string], scalar[string]) - result = type(self)(result) - result[fill_mask] = fill_value - return result - # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) - else: - # Nothing to fill - return type(self)(self._data.take(indices)) - else: # allow_fill=False - # TODO(ARROW-9432): Treat negative indices as indices from the right. - if (indices_array < 0).any(): - # Don't modify in-place - indices_array = np.copy(indices_array) - indices_array[indices_array < 0] += len(self._data) - return type(self)(self._data.take(indices_array)) - def isin(self, values): if pa_version_under2p0: return super().isin(values)