|
43 | 43 | )
|
44 | 44 |
|
45 | 45 | from pandas.core.dtypes.common import (
|
| 46 | + is_array_like, |
46 | 47 | is_bool_dtype,
|
47 | 48 | is_dtype_equal,
|
48 | 49 | is_integer,
|
|
69 | 70 | from pandas.core.array_algos.transforms import shift
|
70 | 71 | from pandas.core.arrays.base import ExtensionArray
|
71 | 72 | from pandas.core.construction import extract_array
|
72 |
| -from pandas.core.indexers import check_array_indexer |
| 73 | +from pandas.core.indexers import ( |
| 74 | + check_array_indexer, |
| 75 | + validate_indices, |
| 76 | +) |
73 | 77 | from pandas.core.sorting import nargminmax
|
74 | 78 |
|
75 | 79 | NDArrayBackedExtensionArrayT = TypeVar(
|
|
86 | 90 | NumpyValueArrayLike,
|
87 | 91 | )
|
88 | 92 |
|
| 93 | + from pandas import Series |
| 94 | + |
89 | 95 |
|
90 | 96 | def ravel_compat(meth: F) -> F:
|
91 | 97 | """
|
@@ -599,6 +605,159 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
|
599 | 605 | """
|
600 | 606 | return type(self)(self._data)
|
601 | 607 |
|
@doc(ExtensionArray.factorize)
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
    # NOTE(review): no inline docstring on purpose -- the ``@doc`` decorator
    # presumably supplies it from ``ExtensionArray.factorize``; confirm before
    # adding one here.
    dict_encoded = self._data.dictionary_encode()

    # Collect the per-chunk integer codes into a single pandas object.
    per_chunk_codes = [chunk.indices for chunk in dict_encoded.chunks]
    codes = pa.chunked_array(
        per_chunk_codes, type=dict_encoded.type.index_type
    ).to_pandas()

    # A float result can carry NaN entries; those positions get the sentinel
    # so that the final cast to int64 is well defined.
    if codes.dtype.kind == "f":
        codes[np.isnan(codes)] = na_sentinel
    codes = codes.astype(np.int64, copy=False)

    if dict_encoded.num_chunks:
        # The dictionary of the first chunk is used as the set of uniques.
        uniques = type(self)(dict_encoded.chunk(0).dictionary)
    else:
        # No chunks at all: build an empty array of the encoded value type.
        uniques = type(self)(pa.array([], type=dict_encoded.type.value_type))

    return codes.values, uniques
| 624 | + |
def take(
    self,
    indices: TakeIndexer,
    allow_fill: bool = False,
    fill_value: Any = None,
):
    """
    Take elements from an array.

    Parameters
    ----------
    indices : sequence of int or one-dimensional np.ndarray of int
        Indices to be taken.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to
          :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any
          other negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type, ``self.dtype.na_value``, is used.

        For many ExtensionArrays, there will be two representations of
        `fill_value`: a user-facing "boxed" scalar, and a low-level
        physical NA value. `fill_value` should be the user-facing version,
        and the implementation should handle translating that to the
        physical version for processing the take if necessary.

    Returns
    -------
    ExtensionArray

    Raises
    ------
    IndexError
        When the indices are out of bounds for the array.
    ValueError
        When `indices` contains negative values other than ``-1``
        and `allow_fill` is True.

    See Also
    --------
    numpy.take
    api.extensions.take

    Notes
    -----
    ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
    ``iloc``, when `indices` is a sequence of values. Additionally,
    it's called by :meth:`Series.reindex`, or any other method
    that causes realignment, with a `fill_value`.
    """
    # TODO: Remove once we got rid of the (indices < 0) check
    if not is_array_like(indices):
        positions = np.asanyarray(indices)
    else:
        # error: Incompatible types in assignment (expression has type
        # "Sequence[int]", variable has type "ndarray")
        positions = indices  # type: ignore[assignment]

    length = len(self._data)

    # Bounds checks shared by both fill modes.
    if length == 0 and (positions >= 0).any():
        raise IndexError("cannot do a non-empty take")
    if positions.size > 0 and positions.max() >= length:
        raise IndexError("out of bounds value in 'indices'.")

    if not allow_fill:
        # TODO(ARROW-9432): Treat negative indices as indices from the right.
        if (positions < 0).any():
            # Work on a copy so the caller's indices are not modified in-place.
            positions = np.copy(positions)
            positions[positions < 0] += length
        return type(self)(self._data.take(positions))

    fill_mask = positions < 0
    if not fill_mask.any():
        # Nothing to fill; the caller's original ``indices`` object is
        # handed to the underlying take unchanged.
        return type(self)(self._data.take(indices))

    validate_indices(positions, length)
    # TODO(ARROW-9433): Treat negative indices as NULL
    masked_positions = pa.array(positions, mask=fill_mask)
    taken = self._data.take(masked_positions)
    if isna(fill_value):
        return type(self)(taken)

    # TODO: ArrowNotImplementedError: Function fill_null has no
    # kernel matching input types (array[string], scalar[string]),
    # so the fill is done by assignment on the wrapped result instead of
    # pc.fill_null(taken, pa.scalar(fill_value)).
    filled = type(self)(taken)
    filled[fill_mask] = fill_value
    return filled
| 722 | + |
def value_counts(self, dropna: bool = True) -> Series:
    """
    Count how often each distinct value occurs in the array.

    Parameters
    ----------
    dropna : bool, default True
        When True, the count of missing values is left out of the result.

    Returns
    -------
    counts : Series
        Counts cast to ``"Int64"``, indexed by the distinct values
        (wrapped in the same array type as ``self``).

    See Also
    --------
    Series.value_counts
    """
    # Imported locally, as in the surrounding module, rather than at file level.
    from pandas import (
        Index,
        Series,
    )

    tallied = self._data.value_counts()
    distinct, occurrences = tallied.field(0), tallied.field(1)

    if dropna and self._data.null_count > 0:
        # Keep only the entries whose value is non-null.
        keep = distinct.is_valid()
        distinct = distinct.filter(keep)
        occurrences = occurrences.filter(keep)

    # No missing values so we can adhere to the interface and return a numpy array.
    occurrences_np = np.array(occurrences)

    return Series(occurrences_np, index=Index(type(self)(distinct))).astype("Int64")
| 760 | + |
602 | 761 | @classmethod
|
603 | 762 | def _concat_same_type(
|
604 | 763 | cls: type[ArrowExtensionArrayT], to_concat
|
|
0 commit comments